In [None]:
# The ECOICOP data used in this example is open data provided by Statistics Poland.

# Link: https://statswiki.unece.org/download/attachments/256969394/Stats%20Poland%20ECOICOP%20data%20translated%20to%20English%20and%20French.xlsx?version=1&modificationDate=1570023568166&api=v2

In [22]:
import fasttext
import os
import pandas as pd

In [23]:
### Define file locations ###
# Directory that holds the trained model and the test files.
model_dir = "Z:\\Team_Folders\\Evans\\python_scripts\\HLG_MOS\\Poland_FastText\\Data\\French\\"
# Directory that receives the generated classification report CSV.
output_dir = "Z:\\Team_Folders\\Evans\\python_scripts\\HLG_MOS\\Poland_FastText\\Data\\French\\"

# Confidence cut-off: predictions scoring at or below this value are left out
# of the thresholded accuracy.
threshold = 0.95


In [24]:
# Load the trained fastText model stored in the model directory.
loaded_model = fasttext.load_model(model_dir + "model.bin")

# Path of the labelled test file, optionally rooted at the DATADIR variable.
# NOTE(review): when model_dir is an absolute path, os.path.join discards the
# DATADIR prefix, so the environment variable has no effect -- confirm intent.
data_root = os.getenv("DATADIR", "")
test_data = os.path.join(data_root, model_dir + "test.txt")

# Element 1 of the tuple returned by test() is kept as the overall accuracy.
overall_acc = loaded_model.test(test_data)[1]




In [25]:
# Smoke test: classify one ad-hoc string to confirm the loaded model responds.
loaded_model.predict("this is an example")

(('__label__024',), array([0.29182354]))

In [26]:
# load the test data for manual verification and to assess predictions
df = pd.read_csv(model_dir+"test.csv", encoding='UTF-8', dtype=str)
# drop the leftover "Unnamed: 0" column (presumably a saved index) without
# mutating the frame in place
df = df.drop(columns="Unnamed: 0")

# Run the model once per row and collect the results before touching the frame.
# Writing into the `row` objects yielded by DataFrame.iterrows() is not
# guaranteed to update the frame (a row may be a copy), so every result column
# is assigned as a whole instead.
# NOTE(review): "formatted" appears to still carry the "__label__NNN" prefix;
# presumably fastText ignores label tokens at prediction time -- confirm.
predictions = [loaded_model.predict(text) for text in df["formatted"]]

# predict() returns (labels, probabilities); keep the top label and its score
df["pred"] = [labels[0].replace("__label__", "") for labels, probs in predictions]
df["score"] = [probs[0] for labels, probs in predictions]
# 1 when the predicted code equals the reference code, otherwise 0
df["match"] = (df["code"] == df["pred"]).astype(int)
df.head()


Unnamed: 0,text,code_text,code,formatted,pred,score,match
0,- Jus de clementine 100%,Jus de fruits et de légumes,27,__label__027 - Jus de clementine 100%,27,1.00001,1
1,"7UP - Boisson gazeuse citron-lime, 2 x 2 ...",Boissons rafraîchissantes,12,__label__012 7UP - Boisson gazeuse citron-lime...,12,0.998325,1
2,Bigos du chasseur cuisine polonaise,Plats cuisinés,37,__label__037 Bigos du chasseur cuisine polonaise,37,0.972294,1
3,Mimolle a la Francaise affinee en tranches,Fromage et caillé,20,__label__020 Mimolle a la Francaise affinee en...,20,0.992604,1
4,Pringles - Frites au fromage et a l&#39;oignon,chips,51,__label__051 Pringles - Frites au fromage et a...,51,0.610564,1


In [27]:
# basic metrics for accuracy
# Keep only the predictions whose confidence is strictly above the threshold.
scores = df["score"].astype(float)
coded = df[scores > threshold]

count = len(coded)
matches = int(coded["match"].astype(int).sum())

# Guard both ratios: with no row above the threshold (or an empty test set)
# the plain divisions would raise ZeroDivisionError.
accuracy = round(matches / count * 100, 2) if count else float("nan")
codingrate = round(count / df.shape[0] * 100, 2) if df.shape[0] else 0.0
# Recomputed here (as a percentage) so the cell gives the same result on re-run.
overall_acc = round(loaded_model.test(test_data)[1]*100, 2)

print("Overall Accuracy:",overall_acc)
print("Threshold Applied:", threshold)
print("Accuracy:",accuracy)
print("Coding Rate:",codingrate)

Overall Accuracy: 96.73
Threshold Applied: 0.95
Accuracy: 99.45
Coding Rate: 90.06


In [28]:
## Evaluate Model Performance ##

# import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from pandas import DataFrame

In [29]:
# make sure both label columns hold plain strings
for column in ("pred", "code"):
    df[column] = df[column].astype(str)

# true and predicted codes as plain Python lists
list_actual = df["code"].tolist()
list_predicted = df["pred"].tolist()


def _as_percent(metric, average):
    """Score the predictions with `metric` and return it as a percentage (2 decimals)."""
    return round(metric(list_actual, list_predicted, average = average)*100, 2)


# evaluation metrics
weighted_f1 = _as_percent(f1_score, "weighted")
macro_f1 = _as_percent(f1_score, "macro")
weighted_precision = _as_percent(precision_score, "weighted")
macro_precision = _as_percent(precision_score, "macro")
weighted_recall = _as_percent(recall_score, "weighted")
macro_recall = _as_percent(recall_score, "macro")

# summary table: one row per averaging mode
metrics = {
    'Type': ['Weighted Average', 'Macro Average'],
    'F1_score': [weighted_f1, macro_f1],
    'Precision': [weighted_precision, macro_precision],
    'Recall': [weighted_recall, macro_recall],
}
df_metrics = DataFrame(metrics, columns = ['Type','F1_score', 'Precision', 'Recall'])

# show the scores with a single decimal
for column in ('F1_score', 'Precision', 'Recall'):
    df_metrics[column] = df_metrics.apply(lambda row, name=column: round(row[name], 1), axis = 1)
df_metrics.head()

Unnamed: 0,Type,F1_score,Precision,Recall
0,Weighted Average,96.7,96.8,96.7
1,Macro Average,95.6,96.1,95.4


In [30]:
# produce a report with model evaluation metrics
lines = ["Overall Accuracy: "+str(overall_acc)+"\n",
        "Threshold Applied: "+str(threshold)+"\n",
        "Accuracy: "+str(accuracy)+"\n",
        # space after the colon added so this line matches the others
        "Coding Rate: "+str(codingrate)+"\n"+"\n",


        'Type: '+str('Weighted Average')+", "+str('Macro Average')+"\n",
        'F1_score: '+str(weighted_f1)+", "+str(macro_f1)+"\n",
        'Precision: '+str(weighted_precision)+", "+str(macro_precision)+"\n",
        'Recall: '+str(weighted_recall)+", "+str(macro_recall)+"\n",
        ]
# the context manager closes the file even when writing fails
with open(model_dir+"MODEL_METRICS.txt", "w", encoding="utf-8") as report:
    report.writelines(lines)

In [31]:
# classification report

# create a report - stackoverflow 39662398
def classification_report_csv(report, output_path=None):
    """Convert a text classification report into a per-class CSV file.

    The per-class rows are located structurally rather than with a fixed
    slice: the header line is skipped, rows are read until the blank line
    that precedes the summary section, and each row is split from the right
    so that class names containing spaces stay intact. This handles both the
    legacy layout (single "avg / total" line) and the current layout
    ("accuracy", "macro avg", "weighted avg" lines).

    Parameters
    ----------
    report : str
        Text report with a header line followed by rows of
        ``class precision recall f1-score support``.
    output_path : str, optional
        Destination CSV path. Defaults to
        ``output_dir + 'test_classification_report.csv'``.

    Returns
    -------
    pandas.DataFrame
        One row per class with the columns ``class``, ``precision``,
        ``recall``, ``f1_score`` and ``support`` (as written to the CSV).
    """
    columns = ['class', 'precision', 'recall', 'f1_score', 'support']
    report_data = []
    # the first line is the column header
    for line in report.split('\n')[1:]:
        stripped = line.strip()
        if not stripped:
            if report_data:
                # blank line after the class rows: the summary section follows
                break
            continue
        # split from the right so a class name may itself contain spaces
        row_data = stripped.rsplit(None, 4)
        if len(row_data) != 5:
            continue
        try:
            values = [float(value) for value in row_data[1:]]
        except ValueError:
            # not a numeric class row; ignore it
            continue
        report_data.append(dict(zip(columns, [row_data[0]] + values)))
    dataframe = pd.DataFrame(report_data, columns=columns)
    if output_path is None:
        output_path = output_dir+'test_classification_report.csv'
    dataframe.to_csv(output_path, index = False)
    return dataframe

# Build the per-class text report for the test-set predictions and export it to CSV.
report = classification_report(list_actual, list_predicted)
classification_report_csv(report)