# Apply fedas_classifier

This notebook trains a `FedasClassifier` on a given dataset and uses it to make predictions on a new dataset.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from fedas_classifier import FedasClassifier

## Train classifier

In [2]:
# Load the labelled training data. Both FEDAS code columns are read as `object`
# instead of letting pandas infer a dtype, and the two availability columns are
# parsed as dates.
raw_train = pd.read_csv(
    'data_technical_test/train_technical_test.csv',
    na_values="",
    dtype={"incorrect_fedas_code": object, "correct_fedas_code": object},
    parse_dates=["avalability_start_date", "avalability_end_date"],
)

# Hold out 10% of the rows for validation; the fixed seed keeps the split reproducible.
train, val = train_test_split(raw_train, test_size=0.1, random_state=42)

# Separate the label column from the features. `drop` returns a new frame,
# so `train` and `val` themselves are left untouched.
target_col = "correct_fedas_code"
train_features, train_target = train.drop(columns=[target_col]), train[target_col]
val_features, val_target = val.drop(columns=[target_col]), val[target_col]

# Fit the classifier on the training portion only.
fc = FedasClassifier()
fc.fit(train_features, train_target)

Normalizing features...
Normalizing target...
Fitting model...
Done.


Get predictions on the validation set:

In [3]:
# Run the fitted classifier on the held-out validation features.
# Judging by the rendered outputs, the result is a frame with one predicted
# code and one confidence value per row — confirm against FedasClassifier.predict.
predictions = fc.predict(val_features)

Normalizing features...
Predicting fedas codes...


In [47]:
# Display the prediction frame (bare last expression -> rich notebook repr).
predictions

Unnamed: 0,prediction,confidence
16818,375023,0.371573
31197,275129,0.131546
30098,264717,0.104881
10591,175844,0.748200
18111,175907,1.000000
...,...,...
33253,178990,0.902400
12255,246398,0.145585
2437,103756,0.827022
9556,275129,0.131546


We create a dataframe with the predictions and the true values to compare them.

In [81]:
# Re-label the predictions with the validation row labels so that they line up
# with `val_target` in the column-wise concat below. The alignment is purely
# positional: it assumes `fc.predict` returns one row per input row, in input
# order (TODO confirm against FedasClassifier.predict).
#
# `set_axis` returns a new frame rather than assigning to `.index` in place, so
# the object originally returned by `fc.predict` is not mutated and this cell
# can be re-run safely. The `fedas` column is renamed to `prediction`.
predictions = (
    predictions
    .set_axis(val.index, axis=0)
    .rename(columns={'fedas': 'prediction'})
)

# Put predicted and true codes side by side, one row per validation sample.
comparison = pd.concat(
    [predictions, val_target.to_frame(name='correct_fedas_code')],
    axis=1,
)

# Flag the rows where the predicted code equals the true code.
comparison['is_correct'] = comparison['prediction'] == comparison['correct_fedas_code']
comparison

Unnamed: 0,prediction,confidence,correct_fedas_code,is_correct
16818,375023,0.371573,375024,False
31197,275129,0.131546,275443,False
30098,264717,0.104881,264297,False
10591,175844,0.748200,175844,True
18111,175907,1.000000,175907,True
...,...,...,...,...
33253,178990,0.902400,178990,True
12255,246398,0.145585,246398,True
2437,103756,0.827022,103756,True
9556,275129,0.131546,275479,False


Now, among the wrong predictions, we separate those with a **high confidence score** (> 0.5) from those with a **low confidence score** (<= 0.5), and for each group we display the raw features and the normalized features.

In [88]:
# Re-run the classifier's own feature normalization on the validation features
# so the normalized text can be inspected next to the raw columns.
# NOTE(review): `_normalize_features` is a private method of FedasClassifier;
# this cell depends on its return value being usable as a single 'normalized'
# column that can later be looked up by the validation row labels — confirm.
normalized = fc._normalize_features(val_features)
normalized_val_features = pd.DataFrame(normalized, columns=['normalized'])

Normalizing features...


In [95]:
# Confidence cut-off between the "high" (> threshold) and "low" (<= threshold) groups.
CONFIDENCE_THRESHOLD = 0.5


def with_features(wrong_rows, raw_features, normalized_features, feature_columns=None):
    """Attach raw and normalized features to a set of misclassified rows.

    Parameters
    ----------
    wrong_rows : pd.DataFrame
        Subset of the comparison frame; its index selects the rows to look up.
    raw_features : pd.DataFrame
        Raw feature frame, looked up by label with ``.loc``.
    normalized_features : pd.DataFrame
        Normalized feature frame, looked up by label with ``.loc``.
    feature_columns : list-like, optional
        Raw feature columns to keep. ``None`` (default) keeps every column.

    Returns
    -------
    pd.DataFrame
        The three frames concatenated column-wise, with the ``is_correct``
        column dropped (it is constant within a group of wrong rows).
    """
    columns = slice(None) if feature_columns is None else feature_columns
    row_labels = wrong_rows.index
    joined = pd.concat(
        [
            wrong_rows,
            raw_features.loc[row_labels, columns],
            normalized_features.loc[row_labels],
        ],
        axis=1,
    )
    return joined.drop(columns=['is_correct'])


# `is_correct` is a boolean column, so `~` selects the wrong predictions.
is_wrong = ~comparison['is_correct']

wrong_high_confidence = comparison[is_wrong & (comparison['confidence'] > CONFIDENCE_THRESHOLD)]
wrong_low_confidence = comparison[is_wrong & (comparison['confidence'] <= CONFIDENCE_THRESHOLD)]

# NOTE(review): the high-confidence table keeps every raw column while the
# low-confidence table keeps only `fc.columns`. This asymmetry is preserved
# from the original cell — confirm whether it is intentional.
wrong_high_with_features = with_features(
    wrong_high_confidence, val_features, normalized_val_features
)
wrong_low_with_features = with_features(
    wrong_low_confidence, val_features, normalized_val_features,
    feature_columns=fc.columns,
)

In [96]:
# Wrong predictions made with confidence > 0.5, joined with their raw and normalized features.
wrong_high_with_features

Unnamed: 0,prediction,confidence,correct_fedas_code,brand,model_code,model_label,commercial_label,incorrect_fedas_code,article_main_category,article_type,...,eco_participation,eco_furniture,multiple_of_order,minimum_multiple_of_order,net_weight,raw_weight,volume,size,accurate_gender,normalized
15337,233914,0.500287,234911,brand_1,FK1005,MT MVB SHORT,,234511,TEAM SPORTS,HOMME,...,0.0,0.0,1,0,0.00,0.0,0.0,2XL,HO,brand_1 team sports homme shorts 2xl ho
18933,375958,0.875760,375022,brand_113,1011258,TOWN CLASSIC WMN,,375312,BIANCO,CHAUSSURE,...,0.0,0.0,8,1,0.00,0.0,0.0,36,FE,brand_113 bianco chaussure tige basse 36 fe
7861,364252,0.520522,364265,brand_73,1938911,TRAILSTORM WATERPROOF,,364312,OUTDOOR,FEMME,...,0.0,0.0,0,0,0.00,0.0,0.0,10,FE,brand_73 outdoor femme trailstorm waterpr 10 fe
18867,131560,0.500147,131183,brand_389,21004,PLAQUETTE RECTO/VERSO VOLLEYBALL,,100995,MULTISPORT,UNISEXE ADULTE,...,0.0,0.0,0,0,0.00,0.0,0.0,TU,UN,brand_389 multisport unisexe adulte accessoire...
12509,375957,0.799567,375851,brand_191,2020293,VERDON PLUS,,375311,1 FOOTWEAR,UNISEX,...,0.0,0.0,0,0,0.00,0.0,0.0,36,HO,brand_191 footwear unisex shoes 36 ho
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6703,275126,1.000000,275129,brand_56,IB0IB00605,REFLECTIVE LINES LOG,,,APPAREL,T-SHIRTS,...,0.0,0.0,1,1,0.15,0.0,0.0,10,GA,brand_56 apparel t shirts l s t shirts 10 ga
4619,375959,0.532616,375953,brand_243,CN8556,NIKE MD VALIANT LITTLE KIDS' SHOE,,375983,SPORTSTYLE,GARCON,...,0.0,0.0,1,0,0.00,0.0,0.0,1,GA,brand_243 sportstyle garcon low top 1 ga
36135,235287,0.566450,235377,brand_317,6228104333,ALL-WEATHER JACKET ARGENTINA,,235287,TEXTILE,VESTE,...,0.0,0.0,1,1,0.40,0.0,0.0,XL,HO,brand_317 textile veste xl ho
16443,200334,0.590764,278124,brand_293,FU3197,TE LINEAR LOGO TS,,278337,TRAINING,SURVETEMENT,...,0.0,0.0,8,0,0.00,0.0,0.0,2XL,HO,brand_293 training survetement adult male 2xl ho


In [97]:
# Wrong predictions made with confidence <= 0.5, joined with the `fc.columns` raw features and the normalized features.
wrong_low_with_features

Unnamed: 0,prediction,confidence,correct_fedas_code,brand,article_main_category,article_type,article_detail,comment,size,accurate_gender,normalized
16818,375023,0.371573,375024,brand_191,COURTKIDS,UNISEX PRESCHOOL,SHOES,,28,GA,brand_191 courtkids unisex preschool shoes 28 ga
31197,275129,0.131546,275443,brand_158,LOISIRS,VETEMENTS,,,128,GA,brand_158 loisirs vetements 128 ga
30098,264717,0.104881,264297,brand_41,MENSSPRTWR,MSPRTWRTOP,MSWTSHTHD,,L,HO,brand_41 menssprtwr msprtwrtop mswtshthd l ho
198,346062,0.230487,346082,brand_240,RUNNING,RUNNING LACET,FITNESS RUNNING CHAUSSURE,,35,FE,brand_240 running running lacet fitness runnin...
13902,375050,0.178916,375024,brand_1,SKATEBOARDING,SHOES - LOW (NON FOOTBALL),INFANT UNISEX,,18,FI,brand_1 skateboarding shoes low non football i...
...,...,...,...,...,...,...,...,...,...,...,...
17418,232069,0.114942,200626,brand_102,SPORTSWEAR,ENFANTS,,,128,GA,brand_102 sportswear enfants 128 ga
1837,175836,0.179572,175843,brand_56,ACCESSORIES,BAGS,CROSSOVERS,,OS,UA,brand_56 accessories bags crossovers os ua
2645,175907,0.273955,164906,brand_375,MOUNTAIN LIFESTYLE,SACS,,,OS,UN,brand_375 mountain lifestyle sacs os un
24554,200901,0.104575,200127,brand_102,FOOTBALL,ADULTES,,,L,HO,brand_102 football adultes l ho


These dataframes can be exported to csv files for further analysis.

## Make predictions on the first 20 rows of the test set

In [79]:
# Number of leading test rows to score.
limit = 20

# Load the test data with the same parsing options as the training data:
# FEDAS code columns read as `object`, availability columns parsed as dates.
test = pd.read_csv(
    'data_technical_test/test_technical_test.csv',
    na_values="",
    dtype={"incorrect_fedas_code": object, "correct_fedas_code": object},
    parse_dates=["avalability_start_date", "avalability_end_date"],
)

# Score only the first `limit` rows (positional slice) and display the result.
test_head = test.iloc[:limit]
test_result = fc.predict(test_head)
test_result

Normalizing features...
Predicting fedas codes...


Unnamed: 0,fedas,confidence
0,246124,0.697499
1,232747,0.569449
2,246398,0.9702
3,278115,0.075869
4,101733,0.118246
5,135798,0.653215
6,200367,0.81225
7,232124,1.0
8,200901,0.104575
9,275124,1.0


In [80]:
# Show the raw test rows whose prediction confidence is strictly below 0.5.
# The lookup uses the labels of `test_result`, so it relies on those labels
# matching the row labels of `test`.
low_confidence = test_result['confidence'] < 0.5
low_confidence_labels = test_result[low_confidence].index
test.loc[low_confidence_labels]

Unnamed: 0,brand,model_code,model_label,commercial_label,incorrect_fedas_code,article_main_category,article_type,article_detail,comment,avalability_start_date,...,shipping_date,eco_participation,eco_furniture,multiple_of_order,minimum_multiple_of_order,net_weight,raw_weight,volume,size,accurate_gender
3,brand_397,1344333,ARMOUR MID KEYHOLE GRAPHIC,,278052.0,TRAINING,FEMME,ARMOUR MID KEYHOLE,,2021-01-20,...,,0.0,0.0,1,0,0.0,0.0,0.0,L,FE
4,brand_314,278115,POLE RENTAL KID EXCLUSIVE 2 FR,SCO POLE RENTAL KID EXCLU,101731.0,WINTERSPORTS/ACCESSORTS,BATONS DE SKI,,LOCATION,2020-09-01,...,20200224.0,0.0,0.0,1,1,0.32,0.45,0.0,070,UE
8,brand_102,3130705,MAILLOT TANARO 2.0,,232904.0,FOOTBALL,ADULTES,,,NaT,...,,0.0,0.0,0,0,0.0,0.0,0.0,L,HO
13,brand_182,KI0633,SAC FOURRE TOUT FORME TUBE,SAC FOURRE TOUT FORME TUB,,COLLECTIVITES,SAC,,,NaT,...,20200415.0,0.0,0.0,1,1,0.36,0.0,2.67,TU,UN
15,brand_257,1183A206,TIGER HORIZONIA,,314984.0,SPORTSTYLE,UNISEXE ADULTE,TIGER HORIZONIA,,2020-12-01,...,,0.0,0.0,1,0,0.0,0.0,0.0,10,HO
17,brand_366,9205,SHAKER SPORT,SHAKER SPORT,131199.0,FITNESS,PETIT ACCESSOIRE,UNISEXE ADULTE,,2019-11-30,...,20191209.0,0.0,0.0,1,1,0.14,0.15,0.0,22CM,UN
18,brand_17,2111A967,SB GLOBAL HOME TOP REPLICA,,236154.0,FOOTBALL,HOMME,SB GLOBAL HOME TOP,,2021-02-01,...,,0.0,0.0,1,0,0.0,0.0,0.0,2XL,HO
