## Imports

In [247]:
import pandas as pd
import json
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

## Load data

In [248]:
test_data = json.load(open("test_products.json", "r"))
train_data = json.load(open("train_products.json", "r"))

In [249]:
df_train = pd.DataFrame.from_dict(train_data, orient="index")
df_test = pd.DataFrame.from_dict(test_data, orient="index")

## Preprocessing

In [250]:
#to remove
col_to_remove = ["name", "generic_name", "brand"]

#categorical
col_categorical = ["nutrition_grade", "is_beverage"]

#int
col_int = ["additives_count", "non_recyclable_and_non_biodegradable_materials_count"]

#float
col_float = ['carbohydrates_100g', 'energy_kcal_100g', 'fat_100g', 'fiber_100g', 'proteins_100g', 'salt_100g', 'sodium_100g', 'sugars_100g', "calcium_100g",
"est_co2_agriculture", "est_co2_consumption", "est_co2_distribution", "est_co2_packaging", "est_co2_processing", "est_co2_transportation"]

#to build one hot encoder from list
col_to_transform_from_list = ["categories_hierarchy", "selling_countries", "packaging_materials", "ingredient_origins"]

#to build one hot encoder from dictionary
col_to_transform_from_dict = ["ingredients"]

target = "ecoscore_grade"

In [251]:
mlb = MultiLabelBinarizer()
import category_encoders as ce


def preprocessing(df):
    df = df.drop(columns = col_to_remove)
    df = df.drop(columns = col_to_transform_from_dict)

    for col in col_int+col_float:
        df.loc[df[df[col]=="unknown"].index, col] = np.NaN

    df["nutrition_grade_value"] = np.zeros(len(df))
    for col in ["nutrition_grade"]:
        df.loc[df[df[col]=="unknown"].index, "nutrition_grade_value"] = np.NaN 
        df.loc[df[df[col]=="a"].index, "nutrition_grade_value"] = 5
        df.loc[df[df[col]=="b"].index, "nutrition_grade_value"] = 4 
        df.loc[df[df[col]=="c"].index, "nutrition_grade_value"] = 3 
        df.loc[df[df[col]=="d"].index, "nutrition_grade_value"] = 2 
        df.loc[df[df[col]=="e"].index, "nutrition_grade_value"] = 1   

    df[col_int] = df[col_int].astype(float)
    df[col_float] = df[col_float].astype(float)
    
    df["ingredient_origins2"] = df["ingredient_origins"].copy()

    for i in df.index:
        llista = list(df.loc[i, ["ingredient_origins"]].values[0].keys())
        llista = ["io"+ c for c in llista]
        df.at[i, "ingredient_origins"] = llista
    
    for col in col_to_transform_from_list:
        df = df.join(
                pd.DataFrame(
                    mlb.fit_transform(df.pop(col)),
                    index=df.index,
                    columns=mlb.classes_))
    
    cols_ing = ["per_"+col for col in df.columns if col[:2] == "io"]
    for col in cols_ing:
        df[col] = np.zeros(len(df))
    """zeros_array = np.zeros((df.shape[0], len(cols_ing)))

    zeros_df = pd.DataFrame(zeros_array, columns=cols_ing)

    df2 = pd.concat([df,zeros_df],axis=1)"""

    for i in df.index:
        for key, value in df.loc[[i],"ingredient_origins2"].values[0].items():
            df.loc[[i],"per_"+key] = float(value)
    
    df = df.drop(columns="ingredient_origins2")

    
    return df

df_train[target] = df_train[target].astype(int)

encoder = ce.OrdinalEncoder(cols=col_categorical)   
df_train = encoder.fit_transform(df_train)
df_test["ecoscore_grade"] = 0
df_test = encoder.transform(df_test)
df_test = df_test.drop(columns="ecoscore_grade")

df_train = preprocessing(df_train)
df_test = preprocessing(df_test)

  df[col] = np.zeros(len(df))
  df[col] = np.zeros(len(df))
  df[col] = np.zeros(len(df))
  df[col] = np.zeros(len(df))
  df[col] = np.zeros(len(df))
  df[col] = np.zeros(len(df))
  df[col] = np.zeros(len(df))
  df[col] = np.zeros(len(df))
  df[col] = np.zeros(len(df))
  df[col] = np.zeros(len(df))
  df[col] = np.zeros(len(df))
  df[col] = np.zeros(len(df))
  df[col] = np.zeros(len(df))
  df[col] = np.zeros(len(df))
  df[col] = np.zeros(len(df))
  df[col] = np.zeros(len(df))
  df[col] = np.zeros(len(df))
  df[col] = np.zeros(len(df))
  df[col] = np.zeros(len(df))
  df[col] = np.zeros(len(df))
  df.loc[[i],"per_"+key] = float(value)
  df.loc[[i],"per_"+key] = float(value)
  df.loc[[i],"per_"+key] = float(value)
  df.loc[[i],"per_"+key] = float(value)
  df.loc[[i],"per_"+key] = float(value)
  df.loc[[i],"per_"+key] = float(value)
  df.loc[[i],"per_"+key] = float(value)
  df.loc[[i],"per_"+key] = float(value)
  df.loc[[i],"per_"+key] = float(value)
  df.loc[[i],"per_"+key] = float(value)


In [252]:
cols = set(df_train.columns) - set(df_test.columns)
cols.remove("ecoscore_grade")
#cols.add("en:unknown")
df_train = df_train.drop(columns=cols)
cols = set(df_test.columns) - set(df_train.columns)
df_test = df_test.drop(columns=cols)

cols_train = list(df_train.columns)
cols_train.remove("ecoscore_grade")
df_test = df_test[cols_train]
cols_train.append("ecoscore_grade")
df_train = df_train[cols_train]

In [253]:
df_train = df_train.drop_duplicates()

## Train xgboost

In [254]:
X = df_train.drop(columns=["ecoscore_grade"])
Y = df_train["ecoscore_grade"]

In [271]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

def grid_xgboost(X,y):
    gsc = GridSearchCV(
                estimator=xgb.XGBClassifier(tree_method='gpu_hist'),
                param_grid={
                            "n_estimators": [110],
                            "learning_rate": [0.15],
                            "max_depth": [6,8,10],
                            #"min_child_weight": ],
                            #"colsample_bytree": [1],
                            #"subsample": [1],
                            #"gamma": [0.2],
                            #"reg_alpha": [0,0.1,0.001],
                            #"reg_lambda": [0,0.1,0.001]
                            },
                cv=5, scoring="f1_macro", verbose=0, n_jobs=-1)
    gsc.fit(X,y)
    return gsc

In [272]:
grid_search_xgboost = grid_xgboost(X,Y)

In [273]:
df_grid = pd.DataFrame(grid_search_xgboost.cv_results_)

In [274]:
df_grid

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,26.69581,0.2688,0.136144,0.003284,0.15,6,110,"{'learning_rate': 0.15, 'max_depth': 6, 'n_est...",0.884027,0.866882,0.875416,0.867744,0.872892,0.873392,0.006192,3
1,36.589871,3.715131,0.172907,0.005957,0.15,8,110,"{'learning_rate': 0.15, 'max_depth': 8, 'n_est...",0.889806,0.869339,0.88449,0.873815,0.876181,0.878726,0.007412,2
2,35.413295,8.07918,0.201469,0.005409,0.15,10,110,"{'learning_rate': 0.15, 'max_depth': 10, 'n_es...",0.884086,0.871445,0.883297,0.876574,0.881587,0.879398,0.004758,1


In [275]:
probas = grid_search_xgboost.predict_proba(df_test)

In [276]:
indexos = []
targets = []
for i, row in enumerate(probas):
    for j, el in enumerate(row):
        if el > 0.65:
            indexos.append(i)
            targets.append(j)


In [277]:
Pred = df_test.copy()
Pred = Pred.iloc[[str(c) for c in indexos]]
Pred["ecoscore_grade"] = np.array(targets)
Pred["Danone"] = 1

In [278]:
df_train2 = df_train.copy()
df_train2["Danone"] = 0

In [279]:
new_train = pd.concat((df_train2, Pred))

In [280]:
new_train.shape

(20582, 442)

In [281]:
X2 = new_train.drop(columns=["ecoscore_grade"])
Y2 = new_train["ecoscore_grade"]

In [282]:
grid_search_xgboost2 = grid_xgboost(X2,Y2)

In [283]:
df_grid = pd.DataFrame(grid_search_xgboost2.cv_results_)
df_grid

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,27.231481,0.13092,0.145946,0.006952,0.15,6,110,"{'learning_rate': 0.15, 'max_depth': 6, 'n_est...",0.877168,0.873728,0.876316,0.866,0.899709,0.878584,0.011273,3
1,37.6025,4.496663,0.190355,0.005505,0.15,8,110,"{'learning_rate': 0.15, 'max_depth': 8, 'n_est...",0.878792,0.878381,0.881037,0.875039,0.904875,0.883625,0.010797,2
2,36.45314,8.697238,0.219864,0.00763,0.15,10,110,"{'learning_rate': 0.15, 'max_depth': 10, 'n_es...",0.881954,0.877089,0.881087,0.878607,0.905299,0.884807,0.010392,1


In [284]:
Pred_concat = df_test.copy()
Pred_concat["Danone"] = 1
Pred_concat["ecoscore_grade"] =  grid_search_xgboost2.predict(Pred_concat)
df_pred = pd.DataFrame(Pred_concat["ecoscore_grade"].values, columns = ["target"])
df_pred.to_json("7 submit.json")

In [285]:
Pred_concat["ecoscore_grade"].value_counts()

1    867
3    173
4    112
2     77
0     43
Name: ecoscore_grade, dtype: int64

In [286]:
from collections import Counter

value_counts = Counter(targets)

# Print the value counts
for value, count in value_counts.items():
    print(f"{value}: {count}")

1: 855
2: 71
3: 166
0: 40
4: 111


## Predict

In [138]:
Pred = df_test.copy()
Pred["ecoscore_grade"] =  grid_search_xgboost.predict(df_test)


In [139]:
df_pred = pd.DataFrame(Pred["ecoscore_grade"].values, columns = ["target"])

In [140]:
#df_pred.to_json("1st submit.json") 0.72
#df_pred.to_json("2nd submit.json") 0.84303
df_pred.to_json("6 submit.json")