## The European Classification of Individual Consumption according to Purpose (ECOICOP) data used in this example is open source and provided by Statistics Poland

#### Link: https://github.com/UNECE/ML_dataset

In [2]:
import pandas as pd
import numpy as np
import pickle
# Disable pandas' chained-assignment check (SettingWithCopyWarning) for the whole notebook.
pd.options.mode.chained_assignment = None  # default='warn'

## Define folder locations

In [3]:
# Folder that receives train.txt and test.csv (note the trailing separator).
output_directory = "C:\\Users\\Justin Evans\\Documents\\Python\\UNECE\\Poland_FastText\\"

# Read the 'Data' sheet of the workbook straight from the UNECE GitHub repository.
data = pd.read_excel(
    "https://raw.githubusercontent.com/UNECE/ML_dataset/master/Stats%20Poland%20ECOICOP%20data.xlsx",
    sheet_name="Data",
)

# To work from a local copy instead, uncomment and adapt:
# data_directory = "C:\\Users\\Justin Evans\\Documents\\Python\\UNECE\\Poland_FastText\\"
# data = pd.read_excel(data_directory + "Stats Poland ECOICOP data translated to English and French.xlsx")

In [30]:
# Inspect the raw data: row count first, then the first few records
print("Dataset size: ", len(data))
data.head()

Dataset size:  17099


Unnamed: 0,Desc_P,Code_P,Desc_E,Code_E,Desc_F,Code_F
0,owsianka brzoskwiniowa,Pozostałe produkty mleczne,Peach Porridge,Other dairy products,Bouillie de pêche,Autres produits laitiers
1,Owsianka Truskawkowa Mlekovita,Pozostałe produkty mleczne,Mlekovita Strawberry Porridge,Other dairy products,Bouillie de fraises Mlekovita,Autres produits laitiers
2,Owsianka wiśniowa,Pozostałe produkty mleczne,Cherry Porridge,Other dairy products,Bouillie de cerise,Autres produits laitiers
3,Owsianka Truskawkowa Mlekovita,Pozostałe produkty mleczne,Mlekovita Strawberry Porridge,Other dairy products,Bouillie de fraises Mlekovita,Autres produits laitiers
4,sałatka z batatów,Pozostałe warzywa bulwiaste i przetwory z warz...,sweet potato salad,Other tuber vegetables and preparations of tub...,salade de patates douces,Autres légumes à légumes et préparations à bas...


## Preprocess the data

In [31]:
# Keep only the French description and its category label, renamed to the
# generic column names used by the rest of the notebook.
df = data[["Desc_F", "Code_F"]].rename(
    columns={"Desc_F": "text", "Code_F": "code_text"}
)
df.head()

Unnamed: 0,text,code_text
0,Bouillie de pêche,Autres produits laitiers
1,Bouillie de fraises Mlekovita,Autres produits laitiers
2,Bouillie de cerise,Autres produits laitiers
3,Bouillie de fraises Mlekovita,Autres produits laitiers
4,salade de patates douces,Autres légumes à légumes et préparations à bas...


In [32]:
# Build a numerical key: one zero-padded, three-digit code per distinct label.
code = df[['code_text']]
# One row per distinct label. The original line ended with a stray '.' after
# reset_index(), which is a SyntaxError and prevented the cell from running.
code_key = code.groupby(['code_text']).size().reset_index()
print("Unique codes: ", code_key.shape[0])
# Codes are the 1-based row position, rendered as '001', '002', ...
code_key['code'] = code_key.index + 1
code_key['code'] = code_key['code'].astype(str).str.zfill(3)
# Index by label so the 'code' column can be used as a label -> code lookup.
code_key.index = code_key.code_text
key_dict = pd.Series(code_key.code.values, index=code_key.code_text).to_dict()

# Save the label -> code dictionary so it can be merged back onto the predicted
# file. NOTE: despite the .txt extension this is a binary pickle.
with open("code_key.txt", "wb") as file:
    pickle.dump(key_dict, file)

# Attach the numerical code to every record of the working dataframe.
df['code'] = df.code_text.map(code_key.code)
df.head()

Unique codes:  61


Unnamed: 0,text,code_text,code
0,Bouillie de pêche,Autres produits laitiers,7
1,Bouillie de fraises Mlekovita,Autres produits laitiers,7
2,Bouillie de cerise,Autres produits laitiers,7
3,Bouillie de fraises Mlekovita,Autres produits laitiers,7
4,salade de patates douces,Autres légumes à légumes et préparations à bas...,3


In [28]:
# remove accents from french data
import unicodedata

def strip_accents(text):
    """Return ``text`` as plain ASCII with accents removed.

    The input is coerced with ``str()``, so non-string values are accepted.
    Accented letters are decomposed (NFD) and their combining marks dropped,
    e.g. ``"pêche"`` -> ``"peche"``. Any remaining non-ASCII character is
    discarded.

    Ligatures such as ``œ``/``æ`` have no Unicode decomposition, so the ASCII
    encoding step alone would delete them outright ("bœuf" -> "buf"). They are
    expanded to their two-letter spelling first.
    """
    ligatures = str.maketrans({"œ": "oe", "Œ": "OE", "æ": "ae", "Æ": "AE"})
    expanded = str(text).translate(ligatures)
    decomposed = unicodedata.normalize("NFD", expanded)
    # After the "ignore" encode only ASCII bytes remain.
    return decomposed.encode("ascii", "ignore").decode("ascii")

print(strip_accents("this is àn èxamplé"))

# Remove accents from every description. Series.apply calls the helper once per
# value instead of building a row object for each record, and always yields a
# Series (row-wise DataFrame.apply does not on an empty frame).
df['text'] = df['text'].apply(strip_accents)


this is an example


In [17]:
# lowercase every value of the text column
df['text'] = df['text'].str.lower()
    
# replace dashes with spaces and squeeze double spaces
import re
def removeblanklines(text):
    """Replace dashes with spaces and collapse runs of whitespace.

    Despite its name this helper does not deal with blank lines: it turns
    every ``-`` into a space and then squeezes any run of two or more
    whitespace characters into a single space.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        The cleaned string.
    """
    without_dashes = text.replace("-", " ")
    # r'\s{2}' only replaced exact pairs, so "a - b" (three spaces after the
    # dash is removed) still came out with a double space; {2,} consumes the
    # whole run.
    return re.sub(r"\s{2,}", " ", without_dashes)

# Clean every description. Series.apply calls the helper once per value instead
# of building a row object for each record, and always yields a Series.
df["text"] = df["text"].apply(removeblanklines)
df.head()

Unnamed: 0,text,code_text,code
0,bouillie de peche,Autres produits laitiers,7
1,bouillie de fraises mlekovita,Autres produits laitiers,7
2,bouillie de cerise,Autres produits laitiers,7
3,bouillie de fraises mlekovita,Autres produits laitiers,7
4,salade de patates douces,Autres légumes à légumes et préparations à bas...,3


In [9]:
# a caesar cipher used to combine multiple variables

def cipher(sentence, shift_value):
    """Apply a Caesar shift to the ASCII letters of ``sentence``.

    Parameters
    ----------
    sentence : str
        Text to transform. An empty string yields an empty string.
    shift_value : int
        Number of alphabet positions to rotate each letter by. May be
        negative or larger than 26; the rotation wraps modulo 26.

    Returns
    -------
    str
        ``sentence`` with every ``a-z`` / ``A-Z`` letter rotated within its
        own case. Every other character (digits, punctuation, whitespace and
        non-ASCII letters such as ``é``) is copied through unchanged.

    Notes
    -----
    The earlier version tested ``char.isalpha()``, which is also true for
    non-ASCII letters; those were then pushed through the a-z wrap logic and
    came out as unrelated characters. It also applied a single +/-26
    correction, so shifts beyond one alphabet length escaped the letter range.
    """
    shifted = []
    for char in sentence:
        if "a" <= char <= "z":
            base = ord("a")
        elif "A" <= char <= "Z":
            base = ord("A")
        else:
            # not an ASCII letter: keep as is
            shifted.append(char)
            continue
        # Python's % always returns a non-negative result, so negative shifts
        # wrap correctly as well.
        shifted.append(chr(base + (ord(char) - base + shift_value) % 26))
    return "".join(shifted)

print(cipher("this is an example", 2))

# Example: cipher each variable with its own shift, then concatenate them.
#
# df["text"] = df.apply(
#     lambda row: cipher(row["name_business"], 1) + " "
#     + cipher(row["kind_business"], 2) + " "
#     + cipher(row["kind_work"], 3) + " "
#     + cipher(row["main_activities"], 0),
#     axis=1,
# )

vjku ku cp gzcorng


In [10]:
# Build the line format fastText expects: "__label__<code> <text>"
def _to_fasttext_line(row):
    """Return one fastText training line for a dataframe row."""
    return '__label__' + row['code'] + ' ' + row['text']

df["formatted"] = df.apply(_to_fasttext_line, axis=1)
df["formatted"].head(10)

0                       __label__007 bouillie de peche
1           __label__007 bouillie de fraises mlekovita
2                      __label__007 bouillie de cerise
3           __label__007 bouillie de fraises mlekovita
4                __label__003 salade de patates douces
5    __label__055 salade de pommes de terre aux bet...
6    __label__055 salade de pommes de terre cuisine...
7                __label__003 salade de patates douces
8    __label__055 salade de pommes de terre aux bet...
9    __label__055 salade de pommes de terre cuisine...
Name: formatted, dtype: object

## Split the processed data into train and test datasets

In [11]:
# create a training and testing dataset
from sklearn.model_selection import train_test_split
# Fix the seed so the 80/20 split is the same on every run; without
# random_state each execution shuffles differently, so the exported files and
# any later evaluation could not be reproduced.
train, test = train_test_split(df, test_size=0.2, stratify=None, random_state=42)

# Single-column frames holding only the fastText-formatted lines
# (the full frames are kept for analysis).
train_formatted = train[["formatted"]]
test_formatted = test[["formatted"]]

print("Train Dataset Size: ", train.shape[0])
print("Test Dataset Size: ", test.shape[0])

13679
3420


## Export preprocessed files

In [12]:
# Write the training lines as plain text (one per row) for fastText, and the
# full test frame as CSV for the later analysis step.
train_path = output_directory + "train.txt"
test_path = output_directory + "test.csv"

np.savetxt(train_path, train_formatted.values, fmt="%s")  # fasttext will read this txt file
test.to_csv(test_path)  # used for analysis in step 4


In [33]:
# Completion marker printed when this final cell has executed
print("done!")

done!
