In [113]:
# The European Classification of Individual Consumption according to Purpose (ECOICOP) data
# used in this example is open-source and is provided by Statistics Poland.

# Link: https://github.com/UNECE/ML_dataset

In [114]:
import fasttext
import os
import pandas as pd
import pickle

In [115]:
### Define file locations ###
# The defaults keep the original local layout. Set FASTTEXT_MODEL_DIR and/or
# FASTTEXT_OUTPUT_DIR to run the notebook elsewhere without editing this cell.
_default_dir = "C:\\Users\\Justin Evans\\Documents\\Python\\UNECE\\Poland_FastText\\"


def _with_trailing_sep(path):
    """Return `path` ending in a path separator.

    Other cells build file names by plain concatenation
    (e.g. `model_dir + "model.bin"`), so a directory given without a trailing
    separator would silently produce a wrong path.
    """
    return path if path.endswith(("\\", "/")) else path + os.sep


# `or` (rather than a .get default) also falls back when the variable is set but empty.
model_dir = _with_trailing_sep(os.environ.get("FASTTEXT_MODEL_DIR") or _default_dir)
output_dir = _with_trailing_sep(os.environ.get("FASTTEXT_OUTPUT_DIR") or _default_dir)

# model confidence threshold to apply
threshold = 0.95


In [116]:
# Load the trained fastText model stored in the model directory.
model_path = model_dir + "model.bin"
loaded_model = fasttext.load_model(model_path)

# Path of the labelled test file, optionally rooted at $DATADIR.
# NOTE(review): os.path.join discards the DATADIR prefix whenever model_dir is
# an absolute path, so DATADIR only matters for a relative model_dir.
data_root = os.getenv("DATADIR", "")
test_data = os.path.join(data_root, model_dir + "test.txt")

# Element 1 of the tuple returned by test() is what this notebook reports as
# overall accuracy (presumably precision at 1 -- confirm against fastText docs).
test_result = loaded_model.test(test_data)
overall_acc = test_result[1]




In [117]:
# Quick sanity check: classify one arbitrary string and display the
# (labels, probabilities) pair the model returns.
sample_text = "this is an example"
loaded_model.predict(sample_text)

(('__label__032',), array([0.21888724]))

In [118]:
# load the test data for manual verification and to assess predictions
df = pd.read_csv(model_dir + "test.csv", encoding="UTF-8", dtype=str)
# The CSV carries its saved index as an unnamed first column; drop it.
df = df.drop("Unnamed: 0", axis=1)

# Predict every row and collect the results in lists, then assign whole columns.
# Writing into the `row` yielded by DataFrame.iterrows() is not guaranteed to
# update the frame (the row may be a copy), so the values are never set that way.
pred_codes = []
pred_scores = []
for text in df["formatted"]:
    labels, probabilities = loaded_model.predict(text)
    # keep only the top label, without fastText's "__label__" prefix
    pred_codes.append(labels[0].replace("__label__", ""))
    pred_scores.append(probabilities[0])

df["pred"] = pred_codes
df["score"] = pred_scores
# 1 when the predicted code equals the true code, otherwise 0
df["match"] = (df["code"] == df["pred"]).astype(int)

# add the code_key back
# NOTE: pickle.load can execute arbitrary code; only load code_key.txt from a
# trusted source.
with open("code_key.txt", "rb") as key_file:
    code_dict = pickle.load(key_file)
# code_dict is inverted so predicted codes can be mapped back to their text
# (assumes code_dict maps code text -> code -- TODO confirm).
inv_map = {v: k for k, v in code_dict.items()}
df["code_text_pred"] = df["pred"].map(inv_map)

df.head()


Unnamed: 0,text,code_text,code,formatted,pred,score,match,code_text_pred
0,wloszczowski fromage cottage demi gras 400 g,Fromage et caillé,20,__label__020 wloszczowski fromage cottage demi...,20,0.999904,1,Fromage et caillé
1,dawn bakery mini baguette d&#39;environ 45 cm,pain,54,__label__054 dawn bakery mini baguette d&#39;e...,54,0.830544,1,pain
2,creme glacee a la fraise et au lait algida big...,crème glacée,53,__label__053 creme glacee a la fraise et au la...,53,0.999382,1,crème glacée
3,yaourt a la fraise skyr,yaourt,60,__label__060 yaourt a la fraise skyr,60,1.00001,1,yaourt
4,salami de fromage de serenade en tranches,Fromage et caillé,20,__label__020 salami de fromage de serenade en ...,20,0.999973,1,Fromage et caillé


In [119]:
# basic metrics for accuracy
# Only predictions whose score is strictly above the threshold count as "coded":
# accuracy is measured on that subset, coding rate is its share of all rows.
scores = pd.to_numeric(df["score"])
is_coded = scores > threshold
count = int(is_coded.sum())
matches = int(df.loc[is_coded, "match"].astype(int).sum())

# Guard both ratios: an empty frame, or a threshold that no prediction clears,
# would otherwise raise ZeroDivisionError.
accuracy = round(matches / count * 100, 2) if count else float("nan")
codingrate = round(count / df.shape[0] * 100, 2) if df.shape[0] else float("nan")
overall_acc = round(loaded_model.test(test_data)[1] * 100, 2)

print("Overall Accuracy:",overall_acc)
print("Threshold Applied:", threshold)
print("Accuracy:",accuracy)
print("Coding Rate:",codingrate)

Overall Accuracy: 88.33
Threshold Applied: 0.95
Accuracy: 97.53
Coding Rate: 66.2


In [120]:
## Evaluate Model Performance ##

# import sklearn
from sklearn.metrics import classification_report
from pandas import DataFrame

In [121]:
# build the label lists used for the evaluation metrics

# fix column types: cast both prediction columns to str
for column in ("pred", "code_text_pred"):
    df[column] = df[column].astype(str)

# convert the actual / predicted label columns to plain lists
list_actual, list_predicted = (
    df[column].tolist() for column in ("code_text", "code_text_pred")
)

In [122]:
# classification report

# Build the report as a dict and reshape it into one row per report entry
# (the classes plus the averaged summary rows). The entry name is kept in a
# trailing "class" column and the index is reset to plain integers.
report = classification_report(list_actual, list_predicted, output_dict=True)
df_class = (
    pd.DataFrame(report)
    .transpose()
    .assign(**{"class": lambda frame: frame.index})
    .reset_index(drop=True)
)

# Save the full table to the output directory.
report_csv = output_dir + "classification_report.csv"
df_class.to_csv(report_csv)

df_class.head(10)

Unnamed: 0,f1-score,precision,recall,support,class
0,0.8,0.666667,1.0,2.0,Autres graisses animales
1,0.971429,1.0,0.944444,18.0,Autres huiles comestibles
2,0.823529,0.875,0.777778,9.0,Autres légumes à légumes et préparations à bas...
3,0.865497,0.87574,0.855491,173.0,Autres produits alimentaires nca
4,0.585366,0.571429,0.6,20.0,Autres produits céréaliers
5,0.845771,0.825243,0.867347,196.0,Autres produits de boulangerie
6,0.884354,0.902778,0.866667,75.0,Autres produits laitiers
7,0.888889,0.88,0.897959,98.0,Autres préparations de poisson et de fruits de...
8,0.717949,0.682927,0.756757,37.0,Autres préparations de viande
9,0.888889,1.0,0.8,5.0,Autres viandes


In [123]:
# overall F1, precision, recall metrics are included in the classification report
# Look the values up by row and column label instead of by position: the
# positional version read the f1-score column for precision, and positions
# also change with the column order of the report.
summary = df_class.set_index("class")

f1_weighted_avg = summary.loc["weighted avg", "f1-score"]
precision_weighted_avg = summary.loc["weighted avg", "precision"]
recall_weighted_avg = summary.loc["weighted avg", "recall"]

f1_macro_avg = summary.loc["macro avg", "f1-score"]
precision_macro_avg = summary.loc["macro avg", "precision"]
recall_macro_avg = summary.loc["macro avg", "recall"]

# create a df to show data (the last three rows of the report are the summary rows)
df_metrics = df_class.iloc[[-1,-2,-3]]
df_metrics.head()

Unnamed: 0,f1-score,precision,recall,support,class
63,0.88334,0.886184,0.883333,3420.0,weighted avg
62,0.840284,0.846101,0.848414,3420.0,macro avg
61,0.883333,0.883333,0.883333,3420.0,micro avg


In [124]:
# produce a report with model evaluation metrics
# First the threshold-based figures, then a blank line, then the weighted and
# macro averages side by side.
lines = [
    "Overall Accuracy: " + str(overall_acc) + "\n",
    "Threshold Applied: " + str(threshold) + "\n",
    "Accuracy: " + str(accuracy) + "\n",
    "Coding Rate:" + str(codingrate) + "\n" + "\n",
    "Type: " + "Weighted Average" + ", " + "Macro Average" + "\n",
    "F1_score: " + str(f1_weighted_avg) + ", " + str(f1_macro_avg) + "\n",
    "Precision: " + str(precision_weighted_avg) + ", " + str(precision_macro_avg) + "\n",
    "Recall: " + str(recall_weighted_avg) + ", " + str(recall_macro_avg) + "\n",
]

# The context manager closes the file even if writing fails. The handle has its
# own name so it does not overwrite `report`, the classification-report dict.
with open(model_dir + "MODEL_METRICS.txt", "w") as metrics_file:
    metrics_file.writelines(lines)

In [125]:
# Marker output: reaching this cell means every cell above ran to completion.
print("done")

done
