In [None]:
import pandas as pd
import json
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only); the JSON datasets loaded
# further down are read from paths under this mount point.
drive.mount('/content/drive')

In [None]:
# Read the raw product records from the mounted Drive. The context managers
# close each file handle as soon as it has been parsed (the bare
# json.load(open(...)) form left both handles open), and the explicit encoding
# avoids depending on the locale default (JSON interchange text is UTF-8).
with open("/content/drive/MyDrive/test_products.json", "r", encoding="utf-8") as test_file:
    test_data = json.load(test_file)
with open("/content/drive/MyDrive/train_products.json", "r", encoding="utf-8") as train_file:
    train_data = json.load(train_file)

In [None]:
# Build one DataFrame per dataset; orient="index" turns every top-level key of
# the loaded dict into a row label.
df_train, df_test = (
    pd.DataFrame.from_dict(records, orient="index")
    for records in (train_data, test_data)
)

In [None]:
# Columns dropped outright before modelling.
col_to_remove = [
    "name",
    "generic_name",
    "ingredient_origins",
    "calcium_100g",
    "brand",
]

# Categorical columns (ordinal-encoded later in the notebook).
col_categorical = [
    "nutrition_grade",
    "is_beverage",
]

# Count-like columns; cast to float during preprocessing.
col_int = [
    "additives_count",
    "non_recyclable_and_non_biodegradable_materials_count",
]

# Continuous columns: per-100g nutrition values followed by the CO2 estimates.
col_float = [
    "carbohydrates_100g",
    "energy_kcal_100g",
    "fat_100g",
    "fiber_100g",
    "proteins_100g",
    "salt_100g",
    "sodium_100g",
    "sugars_100g",
    "est_co2_agriculture",
    "est_co2_consumption",
    "est_co2_distribution",
    "est_co2_packaging",
    "est_co2_processing",
    "est_co2_transportation",
]

# List-valued columns that get expanded into one-hot indicator columns.
col_to_transform_from_list = [
    "categories_hierarchy",
    "selling_countries",
    "packaging_materials",
]

# Dict-valued columns (currently dropped by preprocessing rather than encoded).
col_to_transform_from_dict = [
    "ingredients",
]

# Name of the label column.
target = "ecoscore_grade"

In [None]:
!pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Shared binarizer: preprocessing() re-fits it for every list-valued column it
# one-hot encodes.
mlb = MultiLabelBinarizer()
# Provides the OrdinalEncoder applied to the categorical columns later in this cell.
import category_encoders as ce


def preprocessing(df):
    """Drop unused columns, make numeric columns float and one-hot encode list columns.

    Relies on the module-level column lists (``col_to_remove``,
    ``col_to_transform_from_dict``, ``col_int``, ``col_float``,
    ``col_to_transform_from_list``) and on the shared ``mlb`` binarizer.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in the lists above. The caller's
        object is left untouched: ``drop`` returns a new frame before any
        assignment happens.

    Returns
    -------
    pandas.DataFrame
        The remaining columns, with ``col_int``/``col_float`` as floats
        (``"unknown"`` replaced by NaN) and each list column replaced by one
        indicator column per distinct label found in *this* frame.

    Notes
    -----
    The binarizer is re-fitted on every call, so two frames generally come back
    with different indicator columns and must be aligned by the caller.
    ``DataFrame.join`` raises ``ValueError`` if an indicator label collides
    with an existing column name.
    """
    df = df.drop(columns=col_to_remove + col_to_transform_from_dict)

    # Replace the "unknown" sentinel with NaN in one vectorised pass, then cast.
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    numeric_cols = col_int + col_float
    numeric = df[numeric_cols]
    df[numeric_cols] = numeric.mask(numeric == "unknown", np.nan).astype(float)

    for col in col_to_transform_from_list:
        # pop() removes the list column; its indicator matrix is joined back
        # on the same row index.
        indicators = pd.DataFrame(
            mlb.fit_transform(df.pop(col)),
            index=df.index,
            columns=mlb.classes_,
        )
        df = df.join(indicators)

    return df

# Labels are cast to integers before training.
df_train[target] = df_train[target].astype(int)

# Fit the ordinal encoding of the categorical columns on the training frame.
encoder = ce.OrdinalEncoder(cols=col_categorical)
df_train = encoder.fit_transform(df_train)

# The test frame carries a placeholder target column only for the duration of
# the transform call (presumably so its columns match what the encoder was
# fitted on -- TODO confirm), and loses it again immediately afterwards.
df_test = (
    encoder.transform(df_test.assign(ecoscore_grade=0))
    .drop(columns="ecoscore_grade")
)

# Same cleaning / one-hot expansion for both frames.
df_train = preprocessing(df_train)
df_test = preprocessing(df_test)

In [None]:
# Keep only the features present in BOTH frames and force an identical column
# order. preprocessing() fits its one-hot encoding per frame, so each side can
# carry labels the other lacks. Dropping set differences removed those, but it
# did not guarantee matching order, which XGBoost may check at predict time.
# NOTE: the previous version of this cell carried a disabled
# `cols.add("en:unknown")`; to drop that feature too, filter it out of
# shared_features here.
test_columns = set(df_test.columns)
shared_features = [
    col for col in df_train.columns
    if col in test_columns and col != "ecoscore_grade"
]
df_train = df_train[shared_features + ["ecoscore_grade"]]
df_test = df_test[shared_features]

In [None]:
# Label vector first, then the feature matrix (everything except the label).
Y = df_train["ecoscore_grade"]
X = df_train.drop("ecoscore_grade", axis="columns")

In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

def grid_xgboost(X, y, param_grid=None, cv=5, scoring="f1_macro", n_jobs=-1):
    """Run a cross-validated grid search over an XGBoost classifier.

    Parameters
    ----------
    X : array-like or pandas.DataFrame
        Feature matrix.
    y : array-like
        Class labels.
    param_grid : dict, optional
        Grid passed to ``GridSearchCV``. Defaults to the small grid this
        notebook has always used (100 trees, learning rate 0.15,
        max depth 8 or 10, min child weight 1).
    cv : int, default 5
        Number of cross-validation folds.
    scoring : str, default "f1_macro"
        Metric used to rank the candidates.
    n_jobs : int, default -1
        Parallelism of the search (``-1`` uses every core).

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    if param_grid is None:
        param_grid = {
            "n_estimators": [100],
            "learning_rate": [0.15],
            "max_depth": [8, 10],
            "min_child_weight": [1],
        }

    # NOTE(review): 'gpu_hist' is deprecated since XGBoost 2.0 in favour of
    # tree_method="hist" with device="cuda"; it is kept so older installs keep
    # working. Parallel workers (n_jobs) all share the same GPU -- lower
    # n_jobs if that causes memory pressure.
    gsc = GridSearchCV(
        estimator=xgb.XGBClassifier(tree_method='gpu_hist'),
        param_grid=param_grid,
        cv=cv,
        scoring=scoring,
        verbose=0,
        n_jobs=n_jobs,
    )
    gsc.fit(X, y)
    return gsc
# Alternative, larger parameter grid kept as an inert string literal; nothing
# in this cell reads it.
"""{
                            "n_estimators": [100],
                            "learning_rate": [0.1],
                            "max_depth": [10],
                            "colsample_bytree": [0.8],
                            "subsample": [0.7],
                            "min_child_weight": [1],
                            "gamma": [0,0.1,1],
                            "reg_alpha": [0,0.1,0.001],
                            "reg_lambda": [0,0.1,0.001]
                            }"""

In [None]:
# Run the cross-validated grid search on the training features and labels;
# grid_xgboost returns the fitted search object.
grid_search_xgboost = grid_xgboost(X,Y)

In [None]:
# Tabulate the search's cv_results_ (fit/score times, per-split and mean test scores, rank).
df_grid = pd.DataFrame(grid_search_xgboost.cv_results_)

In [None]:
# Display the grid-search results table.
df_grid

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_min_child_weight,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,7.372704,1.818463,0.167187,0.022087,0.15,8,1,100,"{'learning_rate': 0.15, 'max_depth': 8, 'min_c...",0.862766,0.85138,0.858485,0.853064,0.855851,0.856309,0.004036,1


In [None]:
# Predict a grade for every test row with the refitted best estimator.
predicted_grades = grid_search_xgboost.predict(df_test)

# Test features plus the predicted label, kept under the name `Pred`.
Pred = df_test.assign(ecoscore_grade=predicted_grades)

# The submission keeps only the predictions, under a fresh 0..n-1 index.
df_pred = pd.DataFrame({"target": Pred["ecoscore_grade"].to_numpy()})
df_pred.to_json("/content/5 submit.json")