In [None]:
# The following ECOICOP data used for this example is open-source, provided by Statistics Poland

# Link: https://statswiki.unece.org/download/attachments/256969394/Stats%20Poland%20ECOICOP%20data%20translated%20to%20English%20and%20French.xlsx?version=1&modificationDate=1570023568166&api=v2

In [1]:
import pandas as pd
import numpy as np

In [2]:
### Define file locations ###

data_dir = ("Z:\\Team_Folders\\Evans\\python_scripts\\HLG_MOS\\Poland_FastText\\Data\\")
data = pd.read_excel(data_dir+"Stats_Poland_ECOICOP.xlsx")
output_dir = ("Z:\\Team_Folders\\Evans\\python_scripts\\HLG_MOS\\Poland_FastText\\Data\\French\\")

In [3]:
print(data.shape[0])
data.head()

17099


Unnamed: 0,Desc_P,Code_P,Desc_E,Code_E,Desc_F,Code_F
0,owsianka brzoskwiniowa,Pozostałe produkty mleczne,Peach Porridge,Other dairy products,Bouillie de pêche,Autres produits laitiers
1,Owsianka Truskawkowa Mlekovita,Pozostałe produkty mleczne,Mlekovita Strawberry Porridge,Other dairy products,Bouillie de fraises Mlekovita,Autres produits laitiers
2,Owsianka wiśniowa,Pozostałe produkty mleczne,Cherry Porridge,Other dairy products,Bouillie de cerise,Autres produits laitiers
3,Owsianka Truskawkowa Mlekovita,Pozostałe produkty mleczne,Mlekovita Strawberry Porridge,Other dairy products,Bouillie de fraises Mlekovita,Autres produits laitiers
4,sałatka z batatów,Pozostałe warzywa bulwiaste i przetwory z warz...,sweet potato salad,Other tuber vegetables and preparations of tub...,salade de patates douces,Autres légumes à légumes et préparations à bas...


In [4]:
# set the df
df = data[['Desc_F','Code_F']]

# rename columns
df.columns = ['text','code_text']
print(df.shape[0])
df.head()

17099


Unnamed: 0,text,code_text
0,Bouillie de pêche,Autres produits laitiers
1,Bouillie de fraises Mlekovita,Autres produits laitiers
2,Bouillie de cerise,Autres produits laitiers
3,Bouillie de fraises Mlekovita,Autres produits laitiers
4,salade de patates douces,Autres légumes à légumes et préparations à bas...


In [5]:
# create a numerical key 
code = df[['code_text']]
code_key = code.groupby(['code_text']).size().reset_index()
code_key['code'] = code_key.index + 1
code_key['code'] = code_key['code'].astype(str).str.zfill(3)
code_key.index = code_key.code_text

# merge numerical code onto original dataframe
df['code'] = df.code_text.map(code_key.code)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0,text,code_text,code
0,Bouillie de pêche,Autres produits laitiers,7
1,Bouillie de fraises Mlekovita,Autres produits laitiers,7
2,Bouillie de cerise,Autres produits laitiers,7
3,Bouillie de fraises Mlekovita,Autres produits laitiers,7
4,salade de patates douces,Autres légumes à légumes et préparations à bas...,3


In [6]:
# remove accents from french data
import unicodedata

def strip_accents(text):
    text = unicodedata.normalize("NFD",text).encode("ascii","ignore").decode("utf-8")
    return str(text)

print(strip_accents("this is àn èxamplé"))

# remove accents from text
df['text'] = df['text'].astype(str)
df['text'] = df.apply(lambda row : strip_accents(row["text"]),axis = 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


this is an example


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


In [10]:
# lowercase column
for col in ["text"]:
    df[col] = df[col].apply(lambda x: x.lower())
    
# remove blank lines
import re
def removeblanklines(text):
    formatted_1 = re.sub("-", " ", text) # remove dashes
    formatted_2 = re.sub(r'\s{2}', "", formatted_1)
    return (formatted_2)

df["text"] = df.apply(lambda row: removeblanklines(row["text"]), axis=1)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


Unnamed: 0,text,code_text,code
0,bouillie de peche,Autres produits laitiers,7
1,bouillie de fraises mlekovita,Autres produits laitiers,7
2,bouillie de cerise,Autres produits laitiers,7
3,bouillie de fraises mlekovita,Autres produits laitiers,7
4,salade de patates douces,Autres légumes à légumes et préparations à bas...,3


In [31]:
# a caesar cipher used to combine multiple variables

def cipher(sentence,shift_value):
    translated = "" #new empty string of soon to be transformed string
    
    if sentence == "":
        return ""
    
    for char in sentence: 
        if char.isalpha():
            ascii_num = ord(char) #get the ascii rep of the character
            ascii_num += shift_value # add the shifted value
            
            #handle index rapping
            if char.isupper():
                if ascii_num > ord("Z"):
                    ascii_num -= 26
                elif ascii_num < ord("A"):
                    ascii_num += 26
            elif char.islower():
                if ascii_num > ord("z"):
                    ascii_num -= 26
                elif ascii_num < ord("a"):
                    ascii_num += 26
            translated += chr(ascii_num)
        else:
            translated += char
    return str(translated)

print(cipher("this is an example",2))

# example below used to cipher and concatenate multiple variables

#df["text"] = df.apply(lambda row:  cipher(row["name_business"],1) + " "  
#                             +cipher(row["kind_business"],2) +" "+ 
#                                cipher(row["kind_work"],3) +" "+ cipher(row["main_activities"],0),axis=1)

vjku ku cp gzcorng


In [19]:
# format the data so it is consumable by fasttext
df["formatted"] = df.apply(lambda row: str('__label__'+row['code']+' '+row['text']) ,axis=1)
df["formatted"].head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


0                       __label__007 Bouillie de peche
1           __label__007 Bouillie de fraises Mlekovita
2                      __label__007 Bouillie de cerise
3           __label__007 Bouillie de fraises Mlekovita
4                __label__003 salade de patates douces
5    __label__055 salade de pommes de terre aux bet...
6    __label__055 salade de pommes de terre cuisine...
7                __label__003 salade de patates douces
8    __label__055 salade de pommes de terre aux bet...
9    __label__055 salade de pommes de terre cuisine...
Name: formatted, dtype: object

In [20]:
# create a training and testing dataset
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.2, stratify=None) 

# create a set of txt files for ML, csv for analysis
train_formatted = train[["formatted"]]
test_formatted = test[["formatted"]]

print(train.shape[0])
print(test.shape[0])

13679
3420


In [29]:
# export training and testing sets
np.savetxt(output_dir+"train.txt",train_formatted.values,fmt="%s")
np.savetxt(output_dir+"test.txt",test_formatted.values,fmt="%s")

train.to_csv(output_dir+"train.csv")
test.to_csv(output_dir+"test.csv")
