# Categorize on abbreviation embeddings with xgboost trained on full name embeddings

In [89]:
import pickle
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
import ast 

from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings('ignore')


In [90]:
with open("../models/xbg_model.pkl", "rb") as f:
    model_xgb = pickle.load(f)


In [91]:
df = pd.read_csv('../data/abbr_embeds_no_context.csv', index_col=0)

In [92]:
df

Unnamed: 0,receipt,product_abbr,price,embeddings
0,1,VOLLKORNTORTILLA,2.58,"[-0.04034423828125, -0.005352020263671875, 0.0..."
1,1,TOFU NATUR,4.38,"[-0.0147705078125, -0.0022411346435546875, 0.0..."
2,1,HAUCHSCHN CURRY,3.18,"[-0.0377197265625, -0.004734039306640625, 0.06..."
3,1,HONEYPOMELO PINK,2.99,"[-0.00493621826171875, -0.0181884765625, 0.060..."
4,1,DINKEL-CRUNCHY,2.59,"[0.01094818115234375, -0.00835418701171875, 0...."
...,...,...,...,...
138,15,WERNESGR. ALKFR,1.99,"[-0.0025997161865234375, 0.0118255615234375, 0..."
139,15,WERNESGR. ALKFR,11.40,"[-0.0025997161865234375, 0.0118255615234375, 0..."
140,15,"PFAND 0,08 EUR",0.96,"[-0.00926971435546875, -0.006359100341796875, ..."
141,15,MUELLSACK 60L,3.99,"[-0.0213165283203125, 0.000835418701171875, 0...."


In [93]:
df_coded = pd.read_csv('../data/abbr_coded.csv', index_col=0, sep=';')

In [94]:
df_coded

Unnamed: 0,prod_id,match,name,category,prod_similarity,price_web
0,15363,1,Palapa Vollkorntortilla 370g,Süßes & Salziges,0.838345,2.29
1,5603,1,Berief Bio Tofu Natur 2x200g,Fleisch & Fisch,0.855739,2.49
2,6099,1,Rügenwalder Mühle Hauchschnitt Curry Typ Hähnc...,Fleisch & Fisch,0.809469,1.59
3,13853,1,Honey Pomelo Pink 1 Stück,Obst & Gemüse,0.886331,2.99
4,1345,1,REWE Bio Naturland Dinkel Crunchy 500g,"Brot, Cerealien & Aufstriche",0.832309,2.59
...,...,...,...,...,...,...
138,14091,0,Arko Erdnusstaler 200g,Süßes & Salziges,0.754391,2.99
139,14091,0,Arko Erdnusstaler 200g,Süßes & Salziges,0.754391,2.99
140,8181,9,"Pfanner Eistee Pfirsich 0,33l",Getränke & Genussmittel,0.784003,1.29
141,13734,1,ja! Müllsäcke 60l mit Zugbeutel 25 Stück,Küche & Haushalt,0.821446,2.49


In [95]:
df_abbr_coded = df.join(df_coded)

In [96]:
df_abbr_coded

Unnamed: 0,receipt,product_abbr,price,embeddings,prod_id,match,name,category,prod_similarity,price_web
0,1,VOLLKORNTORTILLA,2.58,"[-0.04034423828125, -0.005352020263671875, 0.0...",15363,1,Palapa Vollkorntortilla 370g,Süßes & Salziges,0.838345,2.29
1,1,TOFU NATUR,4.38,"[-0.0147705078125, -0.0022411346435546875, 0.0...",5603,1,Berief Bio Tofu Natur 2x200g,Fleisch & Fisch,0.855739,2.49
2,1,HAUCHSCHN CURRY,3.18,"[-0.0377197265625, -0.004734039306640625, 0.06...",6099,1,Rügenwalder Mühle Hauchschnitt Curry Typ Hähnc...,Fleisch & Fisch,0.809469,1.59
3,1,HONEYPOMELO PINK,2.99,"[-0.00493621826171875, -0.0181884765625, 0.060...",13853,1,Honey Pomelo Pink 1 Stück,Obst & Gemüse,0.886331,2.99
4,1,DINKEL-CRUNCHY,2.59,"[0.01094818115234375, -0.00835418701171875, 0....",1345,1,REWE Bio Naturland Dinkel Crunchy 500g,"Brot, Cerealien & Aufstriche",0.832309,2.59
...,...,...,...,...,...,...,...,...,...,...
138,15,WERNESGR. ALKFR,1.99,"[-0.0025997161865234375, 0.0118255615234375, 0...",14091,0,Arko Erdnusstaler 200g,Süßes & Salziges,0.754391,2.99
139,15,WERNESGR. ALKFR,11.40,"[-0.0025997161865234375, 0.0118255615234375, 0...",14091,0,Arko Erdnusstaler 200g,Süßes & Salziges,0.754391,2.99
140,15,"PFAND 0,08 EUR",0.96,"[-0.00926971435546875, -0.006359100341796875, ...",8181,9,"Pfanner Eistee Pfirsich 0,33l",Getränke & Genussmittel,0.784003,1.29
141,15,MUELLSACK 60L,3.99,"[-0.0213165283203125, 0.000835418701171875, 0....",13734,1,ja! Müllsäcke 60l mit Zugbeutel 25 Stück,Küche & Haushalt,0.821446,2.49


In [97]:
df_filtered = df_abbr_coded[df_abbr_coded.eval('match != 9')]

In [98]:
df_filtered

Unnamed: 0,receipt,product_abbr,price,embeddings,prod_id,match,name,category,prod_similarity,price_web
0,1,VOLLKORNTORTILLA,2.58,"[-0.04034423828125, -0.005352020263671875, 0.0...",15363,1,Palapa Vollkorntortilla 370g,Süßes & Salziges,0.838345,2.29
1,1,TOFU NATUR,4.38,"[-0.0147705078125, -0.0022411346435546875, 0.0...",5603,1,Berief Bio Tofu Natur 2x200g,Fleisch & Fisch,0.855739,2.49
2,1,HAUCHSCHN CURRY,3.18,"[-0.0377197265625, -0.004734039306640625, 0.06...",6099,1,Rügenwalder Mühle Hauchschnitt Curry Typ Hähnc...,Fleisch & Fisch,0.809469,1.59
3,1,HONEYPOMELO PINK,2.99,"[-0.00493621826171875, -0.0181884765625, 0.060...",13853,1,Honey Pomelo Pink 1 Stück,Obst & Gemüse,0.886331,2.99
4,1,DINKEL-CRUNCHY,2.59,"[0.01094818115234375, -0.00835418701171875, 0....",1345,1,REWE Bio Naturland Dinkel Crunchy 500g,"Brot, Cerealien & Aufstriche",0.832309,2.59
...,...,...,...,...,...,...,...,...,...,...
136,15,ZIMTSTERNE DOPPELKEKS GF,4.49,"[-0.044281005859375, -0.055023193359375, 0.066...",15975,1,ja! Doppelkeks 500g,Süßes & Salziges,0.819534,1.69
137,15,LAYS SALT&VINEG.,2.99,"[-0.0172882080078125, 0.0069732666015625, 0.03...",14791,1,Lay's Salt & Vinegar 150g,Süßes & Salziges,0.893478,1.99
138,15,WERNESGR. ALKFR,1.99,"[-0.0025997161865234375, 0.0118255615234375, 0...",14091,0,Arko Erdnusstaler 200g,Süßes & Salziges,0.754391,2.99
139,15,WERNESGR. ALKFR,11.40,"[-0.0025997161865234375, 0.0118255615234375, 0...",14091,0,Arko Erdnusstaler 200g,Süßes & Salziges,0.754391,2.99


In [99]:
cat_list = ['Babybedarf',
    'Brot, Cerealien & Aufstriche',
    'Drogerie & Kosmetik',
    'Fertiggerichte & Konserven',
    'Fleisch & Fisch',
    'Getränke & Genussmittel',
    'Haus & Freizeit',
    'Kaffee, Tee & Kakao',
    'Kochen & Backen',
    'Käse, Eier & Molkerei',
    'Küche & Haushalt',
    'Obst & Gemüse',
    'Süßes & Salziges',
    'Tiefkühlkost',
    'Tierbedarf',
    'Öle, Soßen & Gewürze']
cat_list

['Babybedarf',
 'Brot, Cerealien & Aufstriche',
 'Drogerie & Kosmetik',
 'Fertiggerichte & Konserven',
 'Fleisch & Fisch',
 'Getränke & Genussmittel',
 'Haus & Freizeit',
 'Kaffee, Tee & Kakao',
 'Kochen & Backen',
 'Käse, Eier & Molkerei',
 'Küche & Haushalt',
 'Obst & Gemüse',
 'Süßes & Salziges',
 'Tiefkühlkost',
 'Tierbedarf',
 'Öle, Soßen & Gewürze']

In [100]:
cat_dict = dict()

for i, c in enumerate(cat_list):
    cat_dict[c] = i
#df_filtered['category'] = pd.Categorical(df_filtered.category)
#df_filtered['num_category'] = df_filtered.category.cat.codes

In [101]:
cat_dict

{'Babybedarf': 0,
 'Brot, Cerealien & Aufstriche': 1,
 'Drogerie & Kosmetik': 2,
 'Fertiggerichte & Konserven': 3,
 'Fleisch & Fisch': 4,
 'Getränke & Genussmittel': 5,
 'Haus & Freizeit': 6,
 'Kaffee, Tee & Kakao': 7,
 'Kochen & Backen': 8,
 'Käse, Eier & Molkerei': 9,
 'Küche & Haushalt': 10,
 'Obst & Gemüse': 11,
 'Süßes & Salziges': 12,
 'Tiefkühlkost': 13,
 'Tierbedarf': 14,
 'Öle, Soßen & Gewürze': 15}

In [102]:
df_filtered['category_num'] = df_filtered.category.map(cat_dict)

In [103]:
df_filtered

Unnamed: 0,receipt,product_abbr,price,embeddings,prod_id,match,name,category,prod_similarity,price_web,category_num
0,1,VOLLKORNTORTILLA,2.58,"[-0.04034423828125, -0.005352020263671875, 0.0...",15363,1,Palapa Vollkorntortilla 370g,Süßes & Salziges,0.838345,2.29,12
1,1,TOFU NATUR,4.38,"[-0.0147705078125, -0.0022411346435546875, 0.0...",5603,1,Berief Bio Tofu Natur 2x200g,Fleisch & Fisch,0.855739,2.49,4
2,1,HAUCHSCHN CURRY,3.18,"[-0.0377197265625, -0.004734039306640625, 0.06...",6099,1,Rügenwalder Mühle Hauchschnitt Curry Typ Hähnc...,Fleisch & Fisch,0.809469,1.59,4
3,1,HONEYPOMELO PINK,2.99,"[-0.00493621826171875, -0.0181884765625, 0.060...",13853,1,Honey Pomelo Pink 1 Stück,Obst & Gemüse,0.886331,2.99,11
4,1,DINKEL-CRUNCHY,2.59,"[0.01094818115234375, -0.00835418701171875, 0....",1345,1,REWE Bio Naturland Dinkel Crunchy 500g,"Brot, Cerealien & Aufstriche",0.832309,2.59,1
...,...,...,...,...,...,...,...,...,...,...,...
136,15,ZIMTSTERNE DOPPELKEKS GF,4.49,"[-0.044281005859375, -0.055023193359375, 0.066...",15975,1,ja! Doppelkeks 500g,Süßes & Salziges,0.819534,1.69,12
137,15,LAYS SALT&VINEG.,2.99,"[-0.0172882080078125, 0.0069732666015625, 0.03...",14791,1,Lay's Salt & Vinegar 150g,Süßes & Salziges,0.893478,1.99,12
138,15,WERNESGR. ALKFR,1.99,"[-0.0025997161865234375, 0.0118255615234375, 0...",14091,0,Arko Erdnusstaler 200g,Süßes & Salziges,0.754391,2.99,12
139,15,WERNESGR. ALKFR,11.40,"[-0.0025997161865234375, 0.0118255615234375, 0...",14091,0,Arko Erdnusstaler 200g,Süßes & Salziges,0.754391,2.99,12


In [107]:
embeddings = df_filtered.embeddings.tolist()
embeddings_list = [ast.literal_eval(s) for s in embeddings]
X = np.array(embeddings_list) 
y_xgb = df_filtered.category_num

In [111]:
y_pred_xgb = model_xgb.predict(X)

In [112]:
print(classification_report(y_xgb, y_pred_xgb))

              precision    recall  f1-score   support

           1       0.80      0.75      0.77        16
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         6
           4       0.86      0.40      0.55        15
           5       0.41      0.69      0.51        13
           6       0.00      0.00      0.00         4
           7       0.00      0.00      0.00         1
           8       0.50      0.29      0.36         7
           9       0.86      0.55      0.67        11
          10       0.10      0.67      0.17         3
          11       1.00      0.20      0.33        25
          12       0.20      0.12      0.15         8
          13       0.20      1.00      0.33         1
          14       0.00      0.00      0.00         1
          15       0.00      0.00      0.00         1

    accuracy                           0.39       112
   macro avg       0.33      0.31      0.26       112
weighted avg       0.63   

### Test logistic regression classifier model

In [44]:
with open("../models/LogReg_model.pkl", "rb") as f:
    model_lr = pickle.load(f)


In [47]:
y_lr = df_filtered.category

In [45]:
y_pred_lr = model_lr.predict(X)

In [48]:
print(classification_report(y_lr, y_pred_lr))

                              precision    recall  f1-score   support

Brot, Cerealien & Aufstriche       0.62      0.94      0.75        16
         Drogerie & Kosmetik       0.00      0.00      0.00         0
  Fertiggerichte & Konserven       0.00      0.00      0.00         6
             Fleisch & Fisch       0.73      0.53      0.62        15
     Getränke & Genussmittel       0.53      0.77      0.62        13
             Haus & Freizeit       0.00      0.00      0.00         4
         Kaffee, Tee & Kakao       0.00      0.00      0.00         1
             Kochen & Backen       1.00      0.29      0.44         7
       Käse, Eier & Molkerei       1.00      0.36      0.53        11
            Küche & Haushalt       0.07      0.67      0.12         3
               Obst & Gemüse       1.00      0.08      0.15        25
            Süßes & Salziges       0.67      0.25      0.36         8
                Tiefkühlkost       0.33      1.00      0.50         1
                  T