## Imports

In [1]:
import pandas as pd
import json
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

## Load data

In [2]:
test_data = json.load(open("test_products.json", "r"))
train_data = json.load(open("train_products.json", "r"))

In [3]:
df_train = pd.DataFrame.from_dict(train_data, orient="index")
df_test = pd.DataFrame.from_dict(test_data, orient="index")

## Preprocessing

In [4]:
#to remove
col_to_remove = ["name", "generic_name", "brand"]

#categorical
col_categorical = ["nutrition_grade", "is_beverage"]

#int
col_int = ["additives_count", "non_recyclable_and_non_biodegradable_materials_count"]

#float
col_float = ['carbohydrates_100g', 'energy_kcal_100g', 'fat_100g', 'fiber_100g', 'proteins_100g', 'salt_100g', 'sodium_100g', 'sugars_100g', "calcium_100g",
"est_co2_agriculture", "est_co2_consumption", "est_co2_distribution", "est_co2_packaging", "est_co2_processing", "est_co2_transportation"]

#to build one hot encoder from list
col_to_transform_from_list = ["categories_hierarchy", "selling_countries", "packaging_materials", "ingredient_origins"]

#to build one hot encoder from dictionary
col_to_transform_from_dict = ["ingredients"]

target = "ecoscore_grade"

In [5]:
mlb = MultiLabelBinarizer()
import category_encoders as ce


def preprocessing(df):
    df = df.drop(columns = col_to_remove)
    df = df.drop(columns = col_to_transform_from_dict)

    for col in col_int+col_float:
        df.loc[df[df[col]=="unknown"].index, col] = np.NaN

    df["nutrition_grade_value"] = np.zeros(len(df))
    for col in ["nutrition_grade"]:
        df.loc[df[df[col]=="unknown"].index, "nutrition_grade_value"] = np.NaN 
        df.loc[df[df[col]=="a"].index, "nutrition_grade_value"] = 5
        df.loc[df[df[col]=="b"].index, "nutrition_grade_value"] = 4 
        df.loc[df[df[col]=="c"].index, "nutrition_grade_value"] = 3 
        df.loc[df[df[col]=="d"].index, "nutrition_grade_value"] = 2 
        df.loc[df[df[col]=="e"].index, "nutrition_grade_value"] = 1   

    df[col_int] = df[col_int].astype(float)
    df[col_float] = df[col_float].astype(float)

    for i in df.index:
        llista = list(df.loc[i, ["ingredient_origins"]].values[0].keys())
        llista = ["io"+ c for c in llista]
        df.at[i, "ingredient_origins"] = llista
    
    for col in col_to_transform_from_list:
        df = df.join(
                pd.DataFrame(
                    mlb.fit_transform(df.pop(col)),
                    index=df.index,
                    columns=mlb.classes_))
    
    return df

df_train[target] = df_train[target].astype(int)

encoder = ce.OrdinalEncoder(cols=col_categorical)   
df_train = encoder.fit_transform(df_train)
df_test["ecoscore_grade"] = 0
df_test = encoder.transform(df_test)
df_test = df_test.drop(columns="ecoscore_grade")

df_train = preprocessing(df_train)
df_test = preprocessing(df_test)

In [6]:
df_train = df_train.drop_duplicates()

In [7]:
cols = set(df_train.columns) - set(df_test.columns)
cols.remove("ecoscore_grade")
#cols.add("en:unknown")
df_train = df_train.drop(columns=cols)
cols = set(df_test.columns) - set(df_train.columns)
df_test = df_test.drop(columns=cols)

## Train xgboost

In [8]:
X = df_train.drop(columns=["ecoscore_grade"])
Y = df_train["ecoscore_grade"]

In [65]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

def grid_xgboost(X,y):
    gsc = GridSearchCV(
                estimator=xgb.XGBClassifier(tree_method='gpu_hist'),
                param_grid={
                            "n_estimators": [110],
                            "learning_rate": [0.15],
                            "max_depth": [8]
                            #"min_child_weight": [0.1, 0.5,1],
                            #"colsample_bytree": [1],
                            #"subsample": [1],
                            #"gamma": [0.2],
                            #"reg_alpha": [0,0.1,0.001],
                            #"reg_lambda": [0,0.1,0.001]
                            },
                cv=5, scoring="f1_macro", verbose=0, n_jobs=-1)
    gsc.fit(X,y)
    return gsc

In [66]:
grid_search_xgboost = grid_xgboost(X,Y)

In [67]:
df_grid = pd.DataFrame(grid_search_xgboost.cv_results_)

In [68]:
df_grid

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_booster,param_learning_rate,param_max_delta_step,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,32.542704,0.046872,0.16963,0.01404,gbtree,0.15,1,8,110,"{'booster': 'gbtree', 'learning_rate': 0.15, '...",0.887029,0.870334,0.879669,0.87634,0.879593,0.878593,0.005415,3
1,32.007077,0.293414,0.15669,0.006012,gbtree,0.15,5,8,110,"{'booster': 'gbtree', 'learning_rate': 0.15, '...",0.886317,0.87168,0.882841,0.874876,0.879151,0.878973,0.005271,1
2,22.326087,7.834918,0.166045,0.02228,gbtree,0.15,10,8,110,"{'booster': 'gbtree', 'learning_rate': 0.15, '...",0.886317,0.87168,0.882841,0.874876,0.879151,0.878973,0.005271,1


## Use prections with high probabilites to train a new xgboost

In [69]:
probas = grid_search_xgboost.predict_proba(df_test)

In [70]:
indexos = []
targets = []
for i, row in enumerate(probas):
    for j, el in enumerate(row):
        if el > 0.65:
            indexos.append(i)
            targets.append(j)


In [71]:
Pred = df_test.copy()
Pred = Pred.iloc[[str(c) for c in indexos]]
Pred["ecoscore_grade"] = np.array(targets)
#Pred["Danone"] = 1

In [72]:
df_train2 = df_train.copy()
#df_train2["Danone"] = 0

In [73]:
new_train = pd.concat((df_train2, Pred))

In [74]:
new_train.shape

(21014, 395)

In [75]:
X2 = new_train.drop(columns=["ecoscore_grade"])
Y2 = new_train["ecoscore_grade"]

In [76]:
grid_search_xgboost2 = grid_xgboost(X2,Y2)

In [77]:
df_grid = pd.DataFrame(grid_search_xgboost2.cv_results_)
df_grid

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_booster,param_learning_rate,param_max_delta_step,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,32.909557,0.122644,0.17756,0.017237,gbtree,0.15,1,8,110,"{'booster': 'gbtree', 'learning_rate': 0.15, '...",0.882123,0.880435,0.880769,0.877801,0.903631,0.884952,0.009444,1
1,32.660294,0.19674,0.155942,0.007205,gbtree,0.15,5,8,110,"{'booster': 'gbtree', 'learning_rate': 0.15, '...",0.878759,0.877617,0.877478,0.878515,0.902262,0.882926,0.00968,2
2,22.869799,8.030239,0.156233,0.012387,gbtree,0.15,10,8,110,"{'booster': 'gbtree', 'learning_rate': 0.15, '...",0.878759,0.877617,0.877478,0.878515,0.902262,0.882926,0.00968,2


In [78]:
Pred_concat = df_test.copy()
#Pred_concat["Danone"] = 1
Pred_concat["ecoscore_grade"] =  grid_search_xgboost2.predict(Pred_concat)
df_pred = pd.DataFrame(Pred_concat["ecoscore_grade"].values, columns = ["target"])
df_pred.to_json("7 submit.json")

In [79]:
Pred_concat["ecoscore_grade"].value_counts()

1    861
3    170
4    114
2     78
0     49
Name: ecoscore_grade, dtype: int64