## Imports

In [16]:
import json

import category_encoders as ce
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MultiLabelBinarizer

## Load data

In [17]:
# Read the raw product records. The context managers close the file handles
# deterministically (the bare open() calls leaked them), and the encoding is
# pinned because JSON text is UTF-8 by specification, whereas open() otherwise
# falls back to the platform's default encoding.
with open("test_products.json", "r", encoding="utf-8") as test_file:
    test_data = json.load(test_file)
with open("train_products.json", "r", encoding="utf-8") as train_file:
    train_data = json.load(train_file)

In [18]:
# orient="index": every top-level key of the loaded dict becomes a row label
# and the nested keys become the columns.
df_train, df_test = (
    pd.DataFrame.from_dict(records, orient="index")
    for records in (train_data, test_data)
)

## Preprocessing

In [4]:
# Columns dropped outright by preprocessing().
col_to_remove = [
    "name",
    "generic_name",
    "ingredient_origins",
    "calcium_100g",
    "brand",
]

# Categorical columns (ordinal-encoded before preprocessing()).
col_categorical = [
    "nutrition_grade",
    "is_beverage",
]

# Integer-like count columns (cast to float in preprocessing()).
col_int = [
    "additives_count",
    "non_recyclable_and_non_biodegradable_materials_count",
]

# Float columns.
col_float = [
    "carbohydrates_100g",
    "energy_kcal_100g",
    "fat_100g",
    "fiber_100g",
    "proteins_100g",
    "salt_100g",
    "sodium_100g",
    "sugars_100g",
    "est_co2_agriculture",
    "est_co2_consumption",
    "est_co2_distribution",
    "est_co2_packaging",
    "est_co2_processing",
    "est_co2_transportation",
]

# List-valued columns, expanded into one indicator column per label.
col_to_transform_from_list = [
    "categories_hierarchy",
    "selling_countries",
    "packaging_materials",
]

# Dict-valued column intended for one-hot encoding; preprocessing() currently
# just drops it.
col_to_transform_from_dict = [
    "ingredients",
]

# Name of the label column.
target = "ecoscore_grade"

In [5]:
# Module-level binarizer shared by preprocessing(), which re-fits it for every
# list-valued column it expands. (The `category_encoders` import that used to
# sit here lives in the imports cell at the top of the notebook.)
mlb = MultiLabelBinarizer()


def preprocessing(df):
    """Turn a product frame into a purely numeric feature frame.

    Steps:

    1. Drop the columns listed in ``col_to_remove`` and
       ``col_to_transform_from_dict``.
    2. In every ``col_int`` / ``col_float`` column, replace the string
       ``"unknown"`` with NaN and cast the column to float.
    3. Replace each column of ``col_to_transform_from_list`` by one 0/1
       indicator column per distinct label, using the module-level ``mlb``
       (re-fitted on each column of *this* frame).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in the module-level column lists.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left untouched because the first
        ``drop`` already returns a new object.

    Notes
    -----
    The binarizer is fitted on whatever frame is passed in, so two frames can
    come back with different indicator columns.
    """
    df = df.drop(columns=col_to_remove + col_to_transform_from_dict)

    numeric_cols = col_int + col_float
    for col in numeric_cols:
        # A boolean mask selects exactly the matching rows; going through the
        # index labels would over-select if the index had duplicates.
        # np.nan (not np.NaN): the capitalised alias was removed in NumPy 2.0.
        df.loc[df[col] == "unknown", col] = np.nan
    df[numeric_cols] = df[numeric_cols].astype(float)

    for col in col_to_transform_from_list:
        # pop() removes the list column from df and hands it to the binarizer.
        indicators = pd.DataFrame(
            mlb.fit_transform(df.pop(col)),
            index=df.index,
            columns=mlb.classes_,
        )
        # NOTE(review): join() raises on overlapping column names, so labels
        # are assumed to be unique across the list columns — confirm.
        df = df.join(indicators)

    return df

# NOTE: this cell rebinds df_train / df_test, so it cannot be re-run on its
# own; preprocessing() would fail on the already-dropped columns. Re-run the
# load cells first.

# Cast the label column to int.
df_train[target] = df_train[target].astype(int)

# Ordinal-encode the categorical columns; the mapping is learned on train only.
encoder = ce.OrdinalEncoder(cols=col_categorical)
df_train = encoder.fit_transform(df_train)

# The encoder was fitted on a frame that includes the target column, so the
# test frame gets a placeholder target for transform(), removed right after.
# NOTE(review): presumably required because the encoder validates the input
# width against what it saw during fit — confirm.
df_test[target] = 0
df_test = encoder.transform(df_test).drop(columns=target)

df_train = preprocessing(df_train)
df_test = preprocessing(df_test)

In [6]:
# Drop training rows that are exact duplicates across every column (features
# and target alike); the first occurrence is kept.
df_train = df_train.drop_duplicates()

In [7]:
# preprocessing() binarises each frame independently, so train and test end up
# with different indicator columns. Keep only the features present in both
# frames (plus the target on the train side), and make the test frame follow
# the train column order so the fitted model sees features in the same order
# at predict time.
test_columns = set(df_test.columns)
df_train = df_train[
    [col for col in df_train.columns if col == target or col in test_columns]
]
df_test = df_test[[col for col in df_train.columns if col != target]]

## Train XGBoost

In [8]:
# Split the training frame into the feature matrix and the label vector,
# using the `target` constant rather than repeating the column name.
X = df_train.drop(columns=target)
Y = df_train[target]

In [9]:
def grid_xgboost(X, y, param_grid=None, cv=5, scoring="f1_macro", n_jobs=-1):
    """Cross-validate an ``XGBClassifier`` over a parameter grid.

    Parameters
    ----------
    X : array-like or pandas.DataFrame
        Feature matrix.
    y : array-like
        Class labels.
    param_grid : dict, optional
        Grid handed to ``GridSearchCV``. Defaults to the single configuration
        used so far in this notebook (100 trees, learning rate 0.15,
        depth 8, min_child_weight 1).
    cv : int, default 5
        Number of cross-validation folds.
    scoring : str, default "f1_macro"
        Scoring name passed to ``GridSearchCV``.
    n_jobs : int, default -1
        Parallelism passed to ``GridSearchCV`` (-1 = all cores).

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        The fitted search object.
    """
    if param_grid is None:
        param_grid = {
            "n_estimators": [100],
            "learning_rate": [0.15],
            "max_depth": [8],
            "min_child_weight": [1],
        }

    gsc = GridSearchCV(
        estimator=xgb.XGBClassifier(),
        param_grid=param_grid,
        cv=cv,
        scoring=scoring,
        verbose=0,
        n_jobs=n_jobs,
    )
    gsc.fit(X, y)
    return gsc

In [10]:
# Run the cross-validated search. This is the slow cell: the recorded results
# table further down shows a mean fit time of roughly 356 s per fold.
grid_search_xgboost = grid_xgboost(X,Y)

In [11]:
# Tabulate the per-fold timings and scores collected by the grid search.
df_grid = pd.DataFrame(grid_search_xgboost.cv_results_)

In [12]:
# Display the cross-validation results (one row per parameter combination).
df_grid

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_min_child_weight,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,356.467201,0.836915,0.479699,0.020991,0.15,8,1,100,"{'learning_rate': 0.15, 'max_depth': 8, 'min_c...",0.861013,0.855419,0.860996,0.852249,0.860961,0.858128,0.003646,1


## Predict

In [13]:
# assign() works on a copy of df_test, so the feature frame itself never
# gains the prediction column.
Pred = df_test.assign(ecoscore_grade=grid_search_xgboost.predict(df_test))


In [14]:
# Single-column submission frame. Only the raw prediction values are taken,
# so the result has a fresh 0..n-1 index rather than the index of Pred.
df_pred = pd.DataFrame({"target": Pred["ecoscore_grade"].to_numpy()})

In [15]:
# Earlier submissions are kept below for reference; the trailing number is
# presumably the score each one obtained — TODO confirm.
#df_pred.to_json("1st submit.json") 0.72
#df_pred.to_json("2nd submit.json") 0.84303
# Write the current predictions to disk.
df_pred.to_json("4 submit.json")

## Create constant-prediction submissions

In [47]:
# NOTE(review): these five numbers equal 5*f / (2 - 5*f) evaluated on the
# `f1s` list defined in a later cell (i.e. that cell's `results`), presumably
# pasted back here by hand. Nothing visible in this notebook reads this
# variable — confirm whether it is still needed.
scale_pos_weight = [0.04323695562591608, 0.668195846192343, 0.06837606837606838, 0.12549240292628025, 0.09409190371991247]

In [None]:
# Write one constant submission per class: all 1272 rows predict class i.
# NOTE(review): 1272 is presumably the number of test products — confirm it
# still matches len(df_test).
for i in range(5):
    df_pred_i = pd.DataFrame({"target": np.full(1272, i)})
    df_pred_i.to_json(f"submit{i}.json")

In [None]:
# NOTE(review): f1s are presumably the macro-F1 scores returned for the five
# constant submissions (submit0..submit4) — confirm.
f1s = [0.016578, 0.16022, 0.0256, 0.0446, 0.0344]

# Invert the macro-F1 of a constant prediction. If class i has prevalence p,
# predicting i everywhere gives F1 = 2p / (1 + p) for class i and 0 for the
# other four classes, so macro-F1 f = 2p / (5 * (1 + p)) and therefore
# p = 5f / (2 - 5f).
results = [score*5/(2 - score*5) for score in f1s]

In [None]:
# Display the values computed from `f1s`.
results