## Imports

In [1]:
import pandas as pd
import json
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from google.colab import drive
# Mount Google Drive under /content/drive; the JSON datasets are read from
# /content/drive/MyDrive further down.
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Pinned to the version recorded in this cell's output so re-runs are reproducible;
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install category_encoders==2.6.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting category_encoders
  Downloading category_encoders-2.6.1-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.1


## Load data

In [2]:
# Read both product dumps; the context managers close each file handle as soon
# as parsing ends (the bare open() calls used before were never closed).
with open("/content/drive/MyDrive/test_products.json", "r", encoding="utf-8") as test_file:
    test_data = json.load(test_file)
with open("/content/drive/MyDrive/train_products.json", "r", encoding="utf-8") as train_file:
    train_data = json.load(train_file)

In [3]:
# orient="index": every top-level JSON key becomes a row label and the keys of
# its nested dict become the columns.
df_train = pd.DataFrame.from_dict(train_data, orient="index")
df_test = pd.DataFrame.from_dict(test_data, orient="index")

## Preprocessing

In [4]:
# Columns discarded at the start of preprocessing().
col_to_remove = ["name", "generic_name", "ingredient_origins", "calcium_100g", "brand"]

# Categorical columns, ordinal-encoded with category_encoders.OrdinalEncoder.
col_categorical = ["nutrition_grade", "is_beverage"]

# Count columns; preprocessing() maps "unknown" to NaN and casts them to float.
col_int = ["additives_count", "non_recyclable_and_non_biodegradable_materials_count"]

# Continuous columns; same "unknown" -> NaN handling and float cast as col_int.
col_float = ['carbohydrates_100g', 'energy_kcal_100g', 'fat_100g', 'fiber_100g', 'proteins_100g', 'salt_100g', 'sodium_100g', 'sugars_100g',
"est_co2_agriculture", "est_co2_consumption", "est_co2_distribution", "est_co2_packaging", "est_co2_processing", "est_co2_transportation"]

# List-valued columns, expanded by preprocessing() into one indicator column per label.
col_to_transform_from_list = ["categories_hierarchy", "selling_countries", "packaging_materials"]

# Column holding a list of ingredient dicts (read through their "id" and
# "percent_estimate" keys); expanded into per-ingredient columns, then dropped.
col_to_transform_from_dict = ["ingredients"]

# Name of the label column.
target = "ecoscore_grade"

In [6]:
# Module-level binarizer used by preprocessing(); it is refitted for every
# list-valued column, so it only ever reflects the most recent fit.
mlb = MultiLabelBinarizer()
import category_encoders as ce


def preprocessing(df):
    """Clean the numeric columns and expand the list-valued columns.

    Works on a copy, so the caller's frame is left untouched. Steps:

    1. drop the columns named in ``col_to_remove``;
    2. in every ``col_int`` / ``col_float`` column, replace the ``"unknown"``
       placeholder with NaN and cast the column to float;
    3. replace each column in ``col_to_transform_from_list`` with one
       indicator column per distinct label, using the module-level ``mlb``.

    ``mlb`` is refitted on every call, so two frames processed separately
    (train and test) generally end up with different indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw product frame containing all configured columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the cleaned and expanded columns.
    """
    df = df.drop(columns=col_to_remove)

    numeric_cols = col_int + col_float
    for col in numeric_cols:
        # Boolean-mask assignment; np.nan is used because the np.NaN alias
        # no longer exists in NumPy 2.0.
        df.loc[df[col] == "unknown", col] = np.nan

    df = df.astype({col: float for col in numeric_cols})

    for col in col_to_transform_from_list:
        # NOTE(review): assumes every cell in these columns is a list of labels.
        # A bare string such as "unknown" would be split into characters by the
        # binarizer -- confirm against the raw data.
        indicators = pd.DataFrame(
            mlb.fit_transform(df.pop(col)),
            index=df.index,
            columns=mlb.classes_,
        )
        # join() raises if a label name already exists as a column.
        df = df.join(indicators)

    return df

# Cast the label column to integer class ids.
df_train = df_train.astype({target: int})

# Fit the ordinal encoding of the categorical columns on the training frame.
encoder = ce.OrdinalEncoder(cols=col_categorical)
df_train = encoder.fit_transform(df_train)

# The encoder was fitted on a frame that contains the label column, so the test
# frame gets a placeholder label for the transform, which is removed right after.
df_test = encoder.transform(df_test.assign(ecoscore_grade=0)).drop(columns="ecoscore_grade")

# Apply the shared cleaning / expansion step to both frames.
df_train, df_test = preprocessing(df_train), preprocessing(df_test)

In [7]:
def _count_ingredient_ids(rows):
    """Count how many times each ingredient id occurs across ``rows``.

    Every row is expected to be a list of dicts with an "id" key. Any other
    value (such as the "unknown" placeholder) contributes nothing.
    """
    counts = {}
    for row in rows:
        if not isinstance(row, (list, tuple)):
            continue
        for ing in row:
            word = ing["id"]
            counts[word] = counts.get(word, 0) + 1
    return counts


# How many of the most frequent shared ingredients become feature columns.
N_TOP_INGREDIENTS = 50

words = _count_ingredient_ids(df_train["ingredients"].values)
words_test = _count_ingredient_ids(df_test["ingredients"].values)

# Ingredients seen in both sets, in order of first appearance in the test set.
list_of_ing = [key_test for key_test in words_test if key_test in words]

# Training-set frequency of every shared ingredient.
ingredients = {key: words[key] for key in list_of_ing}

# Most frequent first; sorted() is stable, so ties keep the order of list_of_ing.
sorted_ingredients = sorted(ingredients.items(), key=lambda x: x[1], reverse=True)
sorted_ingredients = [name for name, _ in sorted_ingredients[:N_TOP_INGREDIENTS]]

def _fill_ingredient_features(df, names):
    """Add one float column per ingredient id in ``names`` to ``df`` (in place).

    For each row of ``df["ingredients"]``:

    * a list of ingredient dicts -> each column holds the ingredient's
      "percent_estimate" (0.0 when the ingredient is absent from the row);
    * anything else (e.g. the "unknown" placeholder) -> NaN in every column.

    Entries lacking "id" / "percent_estimate", or whose estimate cannot be
    converted to float, are skipped and leave the 0.0 default in place.

    NOTE(review): a column that already carries one of these names is
    overwritten -- confirm that no indicator column shares a name with an
    ingredient id.
    """
    column_of = {name: pos for pos, name in enumerate(names)}
    # Sized from the frame itself rather than a hard-coded row count.
    values = np.zeros((len(df), len(names)))

    for row_pos, row in enumerate(df["ingredients"].values):
        if not isinstance(row, (list, tuple)):
            values[row_pos, :] = np.nan
            continue
        for ing in row:
            try:
                pos = column_of.get(ing["id"])
                if pos is not None:
                    values[row_pos, pos] = float(ing["percent_estimate"])
            except (KeyError, TypeError, ValueError):
                # Best effort: ignore malformed ingredient entries.
                continue

    for name, pos in column_of.items():
        df[name] = values[:, pos]


_fill_ingredient_features(df_test, sorted_ingredients)
_fill_ingredient_features(df_train, sorted_ingredients)

In [8]:
# Remove the raw ingredient column from both frames; identical training rows
# are then collapsed into one.
df_test = df_test.drop(columns=col_to_transform_from_dict)
df_train = df_train.drop(columns=col_to_transform_from_dict).drop_duplicates()

In [9]:
# Keep only the features present in both frames: first discard train-only
# columns (the label is exempt), then test-only columns.
cols = set(df_train.columns).difference(df_test.columns)
cols.remove("ecoscore_grade")
df_train = df_train.drop(columns=cols)
cols = set(df_test.columns).difference(df_train.columns)
df_test = df_test.drop(columns=cols)

# Give both frames the same feature order, with the label last in the training frame.
cols_train = [name for name in df_train.columns if name != "ecoscore_grade"]
df_test = df_test[cols_train]
cols_train = cols_train + ["ecoscore_grade"]
df_train = df_train[cols_train]

In [None]:
# Sanity check: print every position where the train and test column names differ.
# zip() stops at the shorter frame, so a trailing extra column is never compared.
for train_col, test_col in zip(df_train.columns, df_test.columns):
    if train_col != test_col:
        print(train_col, test_col)

## Train xgboost

In [10]:
# Separate the label vector from the feature matrix.
Y = df_train["ecoscore_grade"]
X = df_train.drop(columns="ecoscore_grade")

In [13]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

def grid_xgboost(X, y, param_grid=None, cv=5, scoring="f1_macro"):
    """Run a cross-validated grid search over an XGBoost classifier.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Class labels.
    param_grid : dict, optional
        Search space passed to ``GridSearchCV``. Defaults to the grid this
        notebook originally hard-coded (n_estimators 80/100/120, learning
        rate 0.15, max_depth 6/10, min_child_weight 1).
    cv : int, default 5
        Number of cross-validation folds.
    scoring : str, default "f1_macro"
        Metric used to rank the candidates.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    if param_grid is None:
        # Built per call so callers can never mutate a shared default.
        param_grid = {
            "n_estimators": [80, 100, 120],
            "learning_rate": [0.15],
            "max_depth": [6, 10],
            "min_child_weight": [1],
        }

    # NOTE(review): tree_method='gpu_hist' needs a GPU runtime and is deprecated
    # from XGBoost 2.0 on (tree_method="hist" with device="cuda") -- confirm the
    # installed version before changing it.
    gsc = GridSearchCV(
        estimator=xgb.XGBClassifier(tree_method='gpu_hist'),
        param_grid=param_grid,
        cv=cv,
        scoring=scoring,
        verbose=0,
        n_jobs=-1,
    )
    gsc.fit(X, y)
    return gsc

In [None]:
# NOTE(review): despite the name, these five values equal 5*f / (2 - 5*f) for the
# scores listed in `f1s` in the last cells of this notebook, and they sum to
# roughly 1 -- so they look like estimated class shares rather than XGBoost
# `scale_pos_weight` settings. No cell visible in this notebook reads the list.
scale_pos_weight = [0.04323695562591608, 0.668195846192343, 0.06837606837606838, 0.12549240292628025, 0.09409190371991247]

In [14]:
# Run the cross-validated grid search on the full training set.
grid_search_xgboost = grid_xgboost(X,Y)

In [15]:
# One row per parameter combination, with fit times and per-fold test scores.
df_grid = pd.DataFrame(grid_search_xgboost.cv_results_)

In [16]:
# In the recorded run the top-ranked candidate (rank_test_score == 1) is
# max_depth=10, n_estimators=100, with a mean test score of about 0.861.
df_grid

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_min_child_weight,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,6.648514,0.268611,0.175753,0.057214,0.15,6,1,80,"{'learning_rate': 0.15, 'max_depth': 6, 'min_c...",0.856702,0.847351,0.859856,0.846955,0.851315,0.852436,0.005107,6
1,6.34106,0.399089,0.193161,0.05811,0.15,6,1,100,"{'learning_rate': 0.15, 'max_depth': 6, 'min_c...",0.857311,0.850804,0.865744,0.848241,0.854963,0.855413,0.006053,5
2,7.579941,0.586858,0.254416,0.086288,0.15,6,1,120,"{'learning_rate': 0.15, 'max_depth': 6, 'min_c...",0.852931,0.85389,0.865982,0.850895,0.853468,0.855433,0.005373,4
3,9.814767,1.121503,0.223461,0.072177,0.15,10,1,80,"{'learning_rate': 0.15, 'max_depth': 10, 'min_...",0.86372,0.854565,0.863738,0.856941,0.865137,0.86082,0.004236,2
4,11.310625,0.099328,0.22196,0.015126,0.15,10,1,100,"{'learning_rate': 0.15, 'max_depth': 10, 'min_...",0.865691,0.855922,0.863543,0.85848,0.863504,0.861428,0.003632,1
5,12.740221,0.240158,0.352613,0.115985,0.15,10,1,120,"{'learning_rate': 0.15, 'max_depth': 10, 'min_...",0.865192,0.855307,0.861942,0.859146,0.862255,0.860768,0.003335,3


## Predict

In [17]:
# Copy of the test features with the class predicted by the refitted best
# estimator appended as a new column.
Pred = df_test.assign(ecoscore_grade=grid_search_xgboost.predict(df_test))


In [18]:
# Single-column frame named "target", re-indexed 0..n-1 in test-row order.
df_pred = Pred["ecoscore_grade"].reset_index(drop=True).to_frame(name="target")

In [21]:
# Earlier submissions, kept as a log; the trailing number is presumably the
# score each file obtained -- TODO confirm.
#df_pred.to_json("1st submit.json") 0.72
#df_pred.to_json("2nd submit.json") 0.84303
# Write the current predictions to the Colab working directory.
df_pred.to_json("/content/4 submit.json")

## Create submissions

In [None]:
# Write one constant-prediction file per class id (0..4): every test row gets
# the same label. The row count comes from the test frame instead of a
# hard-coded 1272, so the files stay valid if the test set changes.
n_test_rows = len(df_test)
for i in range(5):
    df_pred_i = pd.DataFrame({"target": np.full(n_test_rows, i)})
    df_pred_i.to_json(f"submit{i}.json")

In [None]:
# NOTE(review): presumably the macro-F1 scores obtained by the five
# constant-prediction submissions (submit0..submit4) -- confirm.
f1s = [0.016578, 0.16022, 0.0256, 0.0446, 0.0344]

# If f is the 5-class macro-F1 of always predicting a class whose share is p,
# then f = 2p / (5 * (1 + p)); solving for p gives p = 5f / (2 - 5f).
results = [score*5/(2 - score*5) for score in f1s]

In [None]:
# Display the values derived from f1s.
results