In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import resample
from sklearn.cluster import KMeans 
import datetime as dt
from math import ceil, sqrt
from Utils import *
pd.set_option('max_colwidth', None)
from EvaluationMetric import *
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

In [64]:
def comparemodels(train_X, train_y, val_X, test_X, generaldescription, unbalanced = True):
    res = []

    model1 = DecisionTreeClassifier(max_leaf_nodes= 6, random_state=1)
    model1.fit(train_X, train_y)
    preds_val1 = model1.predict(val_X)
    model_test1 = model1.predict(test_X)
    description = generaldescription+"max_leaf_nodes : 6 + with date columns"
    metrics1 = listmetrics(val_y, preds_val1, "DecisionTreeClassifier", description)
    m1Public = [0.666666666, 0.666666666, 0.006575486]
    m1Private = [0.661016949, 0.661016949, 0.006863327]
    res.append(metrics1)
    df1 = getscoreforcsv(index_val, model_test1, name_file = "TreeClass5leaf.csv")


    model2 = RandomForestClassifier(n_estimators=36, criterion = 'entropy', random_state=1)
    model2.fit(train_X, train_y)
    preds_val2 = model2.predict(val_X)
    model_test2 = model2.predict(test_X)
    description = generaldescription+"n_estimators : 36 + Entropy"
    metrics2 = listmetrics(val_y, preds_val2, "Random Forest Classifier", description)
    m2Public = [0.690909090, 0.666666666, 0.007513148]
    m2Private = [0.649122807, 0.649122807, 0.007794933]
    res.append(metrics2)
    df2 = getscoreforcsv(index_val, model_test2, name_file = "RandomForestN36Entropy.csv")


    ##################ICi donne un 0.71 de scrore !!! mtn comprendre pq en diminuant les feature si bon score mais que pour lui !############
    #Ne fonctionne pas bien quadn le training set est entier. 
    if (unbalanced) : 
        model2 = XGBClassifier(scale_pos_weight=22, random_state=1)
        description = generaldescription+"scale_pos_weight = 22"
    else : 
        model2 = XGBClassifier(random_state=1)
        description = generaldescription+"on balanced data"
    model2.fit(train_X, train_y)
    preds_val2 = model2.predict(val_X)
    model_test2 = model2.predict(test_X)
    metrics2 = listmetrics(val_y, preds_val2, "XGBClassifier", description)
    m3Public = [0.677419354, 0.711864406, 0.092764378]
    m3Private = [0.676923076,  0.715447154, 0.096564531]
    res.append(metrics2)
    df2 = getscoreforcsv(index_val, model_test2, name_file = "XGBClassifierW22.csv")
    
    if (unbalanced) :
        
        model2 = RandomForestClassifier(n_estimators=36, criterion = 'entropy', class_weight = {0: 95469, 1 : 193}, random_state=1)
        model2.fit(train_X, train_y)
        preds_val2 = model2.predict(val_X)
        model_test2 = model2.predict(test_X)
        description = generaldescription+"n_estimators : 36 + Entropy"
        metrics2 = listmetrics(val_y, preds_val2, "Random Forest ClassifierWeighted500", description)
        m500Public = [0.678571428]
        m500Private = [0.666666666]
        res.append(metrics2)
        df2 = getscoreforcsv(index_val, model_test2, name_file = "RandomForestWeigthed500.csv")

        model2 = RandomForestClassifier(n_estimators=36, criterion = 'entropy', class_weight = {0: 22, 1 : 1}, random_state=1)
        model2.fit(train_X, train_y)
        preds_val2 = model2.predict(val_X)
        model_test2 = model2.predict(test_X)
        description = generaldescription+"n_estimators : 36 + Entropy"
        metrics2 = listmetrics(val_y, preds_val2, "Random Forest ClassifierWeighted22", description)
        m22Public = [0.703703703]
        m22Private = [0.654867256]
        res.append(metrics2)
        df2 = getscoreforcsv(index_val, model_test2, name_file = "RandomForestWeigthed22.csv")

        #oversamplesimple
        X_con = pd.concat([train_X, train_y], axis=1) 
        not_fraud = X_con[X_con.FraudResult==0]
        fraud = X_con[X_con.FraudResult==1]
        fraud_upsampled = resample(fraud, replace=True, n_samples=len(not_fraud), # match number in majority class
                                random_state=1) # reproducible results
        upsampled = pd.concat([not_fraud, fraud_upsampled])
        train_y_over_sampled = upsampled.FraudResult
        train_X_over_sampled = upsampled.drop('FraudResult', axis=1)
        upsampledmodel = RandomForestClassifier(n_estimators=36, criterion = 'entropy', random_state=1)
        upsampledmodel.fit(train_X_over_sampled, train_y_over_sampled)
        upsampled_pred = upsampledmodel.predict(val_X)
        model_test = upsampledmodel.predict(OH_X_test)
        description = generaldescription+"upsampled, n_estimators=36, criterion = entropy"
        metrics = listmetrics(val_y, upsampled_pred, "RandomForestClassifierUpperSampling", description)
        m4Public = [0.678571428, 0.642857142]
        m4Private = [0.637168141, 0.620689655]
        res.append(metrics)
        df = getscoreforcsv(index_val, model_test, name_file = "RandomForestClassifierUpSample.csv")

        #Undersampling
        not_fraud_downsampled  = resample(not_fraud, replace=False, n_samples=len(fraud), # match number in minority  class
                                random_state=1) # reproducible results
        downsampled  = pd.concat([not_fraud_downsampled , fraud])
        train_y_undersampled = downsampled.FraudResult
        train_X_undersampled = downsampled.drop('FraudResult', axis=1)
        undersampled = RandomForestClassifier(n_estimators=36, criterion = 'entropy', random_state=1)
        undersampled.fit(train_X_over_sampled, train_y_over_sampled)
        undersampled_pred = undersampled.predict(val_X)
        model_test = undersampled.predict(OH_X_test)
        description = generaldescription+"undersampled, n_estimators=36, criterion = entropy"
        metrics = listmetrics(val_y, undersampled_pred, "RandomForestClassifierUnderSampling", description)
        m5Public = [0.549019607, 0.642857142]
        m5Private = [0.480769230, 0.620689655]
        res.append(metrics)
        df = getscoreforcsv(index_val, model_test, name_file = "RandomForestClassifierUndersampled.csv")
        
        loldPublics = [m1Public, m2Public, m3Public, m500Public,m22Public, m4Public, m5Public]
        loldPrivate = [m1Private, m2Private, m3Private, m500Private, m22Private, m4Private, m5Private]
        
    else:
        loldPublics = [m1Public, m2Public, m3Public]
        loldPrivate = [m1Private, m2Private,  m3Private]
        


    dfres = listmetricsintodf(res)
    dfres["OldPublicScore"] = loldPublics
    dfres["OldPrivateScore"] = loldPrivate
    dfres["MeanOurMetrics"] = dfres[['Precision', 'Recall','F1-score','Mcc']].mean(axis=1)
    dfres.to_csv('output/resultatsTestEverything.csv', mode='a',index=False,header = False) 
    return dfres

# Test de different model with different Data Here
## 1. The reference model

In [65]:
def_feature = pd.read_csv("input/Xente_Variable_Definitions.csv")
raw_data = pd.read_csv("input/training.csv")
X_test = pd.read_csv("input/test.csv")
sample_submission = pd.read_csv("input/sample_submission.csv")

#attribut initialization
cols_unique_value = [] #Will be droped
for col in raw_data.columns : 
    if len(raw_data[col].unique()) == 1 :
        cols_unique_value.append(col)

medium_cardianlity_cols = ["ProductId"]

#Data transformation
raw_data['TransactionStartTime'] = pd.to_datetime(raw_data['TransactionStartTime'])
X_test['TransactionStartTime'] = pd.to_datetime(X_test['TransactionStartTime'])

#Data cleaning
data = raw_data.copy()
data = data.dropna(axis=0) #Drop observations/rows with missing values
X_test=X_test.dropna(axis=0)
data.drop(cols_unique_value, axis=1, inplace=True)
X_test.drop(cols_unique_value, axis=1, inplace=True)

#Prepare index for the submission
index_val = list(X_test.TransactionId.values.tolist())

#Set the df index to the Transction Id
data = transactioId_to_index(data)
X_test = transactioId_to_index(X_test)

#Adding data
data = adding_date_col(data, 'TransactionStartTime')
X_test = adding_date_col(X_test, 'TransactionStartTime')


# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
low_cardinality_cols = [cname for cname in data.columns if data[cname].nunique() < 15 and 
                        data[cname].dtype == "object"]
low_cardinality_cols.append('PricingStrategy')

#transform columns BatchId_54 to number
    #transform columns BatchId_54 to number
id_cols = data.filter(like="Id").columns.tolist()
data[id_cols] = data[id_cols].astype(str).apply(lambda x: x.str.replace(x.name + "_", "")).astype(int)
X_test[id_cols] = X_test[id_cols].astype(str).apply(lambda x: x.str.replace(x.name + "_", "")).astype(int)

data['PricingStrategy'] = data['PricingStrategy'].astype('int')
X_test['PricingStrategy'] = X_test['PricingStrategy'].astype('int')
data['Value'] = data['Value'].astype('float')
X_test['Value'] = X_test['Value'].astype('float')

X_test['PricingStrategy'] = X_test['PricingStrategy'].astype('str')
data['PricingStrategy'] = data['PricingStrategy'].astype('str')

#Data splitting
y = data.FraudResult #The target label
X = data.copy()
X.drop(['FraudResult'], axis=1, inplace=True) #Only the features data
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)

cat_cols = [col for col in X.columns if X[col].dtype == "object"]#liste of obejct columns
cat_cols.append("PricingStrategy")#pcq mm si c'est un chiffre il faut le considérer comme une catégorie


OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(train_X[low_cardinality_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(val_X[low_cardinality_cols]))
OH_cols_test = pd.DataFrame(OH_encoder.transform(X_test[low_cardinality_cols]))
OH_cols_train.columns = OH_encoder.get_feature_names_out()
OH_cols_valid.columns = OH_encoder.get_feature_names_out()
OH_cols_test.columns = OH_encoder.get_feature_names_out()

# One-hot encoding removed index; put it back
OH_cols_train.index = train_X.index
OH_cols_valid.index = val_X.index
OH_cols_test.index = X_test.index

OH_cols_train[list(OH_cols_train.columns)] = OH_cols_train[list(OH_cols_train.columns)].astype(int)
OH_cols_valid[list(OH_cols_valid.columns)] = OH_cols_valid[list(OH_cols_valid.columns)].astype(int)
OH_cols_test[list(OH_cols_test.columns)] = OH_cols_test[list(OH_cols_test.columns)].astype(int)

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = train_X.drop(low_cardinality_cols, axis=1)
num_X_valid = val_X.drop(low_cardinality_cols, axis=1)
num_X_test = X_test.drop(low_cardinality_cols, axis=1)

OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)

# Problème de string
OH_X_train.drop(['TransactionStartTime'], inplace=True, axis=1)
OH_X_valid.drop(['TransactionStartTime'], inplace=True, axis=1)
OH_X_test.drop(['TransactionStartTime'], inplace=True, axis=1)
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)
OH_X_test.columns = OH_X_test.columns.astype(str)

train_X = OH_X_train
val_X = OH_X_valid
test_X = OH_X_test
X_entier = pd.concat([train_X, val_X])

referenceresult = comparemodels(train_X, train_y, val_X, test_X, "The reference")
referenceresult["PublicScore"] = [0.666666666, 0.690909090, 0.677419354, 0.678571428, 0.703703703, 0.678571428, 0.549019607]
referenceresult["PrivateScore"] = [0.661016949, 0.649122807,  0.676923076,0.666666666, 0.654867256, 0.637168141, 0.480769230]
referenceresult

done
done
done
done
done
done
done


Unnamed: 0,Model,Description,Date,Precision,Recall,F1-score,LogLoss,Mcc,PublicScore,PrivateScore,OldPublicScore,OldPrivateScore,MeanOurMetrics
0,DecisionTreeClassifier,The referencemax_leaf_nodes : 6 + with date columns,2023-05-07 16:29:46.815175,0.891892,0.825,0.857143,0.016578,0.857566,0.666667,0.661017,"[0.666666666, 0.666666666, 0.006575486]","[0.661016949, 0.661016949, 0.006863327]",0.8579
1,Random Forest Classifier,The referencen_estimators : 36 + Entropy,2023-05-07 16:29:47.996356,0.813953,0.875,0.843373,0.019592,0.843655,0.690909,0.649123,"[0.69090909, 0.666666666, 0.007513148]","[0.649122807, 0.649122807, 0.007794933]",0.843995
2,XGBClassifier,The referencescale_pos_weight = 22,2023-05-07 16:29:49.487206,0.822222,0.925,0.870588,0.016578,0.871874,0.677419,0.676923,"[0.677419354, 0.711864406, 0.092764378]","[0.676923076, 0.715447154, 0.096564531]",0.872421
3,Random Forest ClassifierWeighted500,The referencen_estimators : 36 + Entropy,2023-05-07 16:29:50.689128,0.804348,0.925,0.860465,0.018085,0.862324,0.678571,0.666667,[0.678571428],[0.666666666],0.863034
4,Random Forest ClassifierWeighted22,The referencen_estimators : 36 + Entropy,2023-05-07 16:29:51.854001,0.804348,0.925,0.860465,0.018085,0.862324,0.703704,0.654867,[0.703703703],[0.654867256],0.863034
5,RandomForestClassifierUpperSampling,"The referenceupsampled, n_estimators=36, criterion = entropy",2023-05-07 16:29:54.212174,0.777778,0.875,0.823529,0.022606,0.824649,0.678571,0.637168,"[0.678571428, 0.642857142]","[0.637168141, 0.620689655]",0.825239
6,RandomForestClassifierUnderSampling,"The referenceundersampled, n_estimators=36, criterion = entropy",2023-05-07 16:29:56.454271,0.777778,0.875,0.823529,0.022606,0.824649,0.54902,0.480769,"[0.549019607, 0.642857142]","[0.48076923, 0.620689655]",0.825239


## 2. Play with deleting the low MI columns

In [66]:
# Supprimer les MI trop faible
def_feature = pd.read_csv("input/Xente_Variable_Definitions.csv")
raw_data = pd.read_csv("input/training.csv")
X_test = pd.read_csv("input/test.csv")
sample_submission = pd.read_csv("input/sample_submission.csv")

#attribut initialization
cols_unique_value = [] #Will be droped
for col in raw_data.columns : 
    if len(raw_data[col].unique()) == 1 :
        cols_unique_value.append(col)

medium_cardianlity_cols = ["ProductId"]

#Data transformation
raw_data['TransactionStartTime'] = pd.to_datetime(raw_data['TransactionStartTime'])
X_test['TransactionStartTime'] = pd.to_datetime(X_test['TransactionStartTime'])

#Data cleaning
data = raw_data.copy()
data = data.dropna(axis=0) #Drop observations/rows with missing values
X_test=X_test.dropna(axis=0)
data.drop(cols_unique_value, axis=1, inplace=True)
X_test.drop(cols_unique_value, axis=1, inplace=True)

#Prepare index for the submission
index_val = list(X_test.TransactionId.values.tolist())

#Set the df index to the Transction Id
data = transactioId_to_index(data)
X_test = transactioId_to_index(X_test)

#Adding data
data = adding_date_col(data, 'TransactionStartTime')
X_test = adding_date_col(X_test, 'TransactionStartTime')


# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
low_cardinality_cols = [cname for cname in data.columns if data[cname].nunique() < 15 and 
                        data[cname].dtype == "object"]
low_cardinality_cols.append('PricingStrategy')

#transform columns BatchId_54 to number
    #transform columns BatchId_54 to number
id_cols = data.filter(like="Id").columns.tolist()
data[id_cols] = data[id_cols].astype(str).apply(lambda x: x.str.replace(x.name + "_", "")).astype(int)
X_test[id_cols] = X_test[id_cols].astype(str).apply(lambda x: x.str.replace(x.name + "_", "")).astype(int)

data['PricingStrategy'] = data['PricingStrategy'].astype('int')
X_test['PricingStrategy'] = X_test['PricingStrategy'].astype('int')
data['Value'] = data['Value'].astype('float')
X_test['Value'] = X_test['Value'].astype('float')

X_test['PricingStrategy'] = X_test['PricingStrategy'].astype('str')
data['PricingStrategy'] = data['PricingStrategy'].astype('str')

#Data splitting
y = data.FraudResult #The target label
X = data.copy()
X.drop(['FraudResult'], axis=1, inplace=True) #Only the features data
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)

cat_cols = [col for col in X.columns if X[col].dtype == "object"]#liste of obejct columns
cat_cols.append("PricingStrategy")#pcq mm si c'est un chiffre il faut le considérer comme une catégorie


OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(train_X[low_cardinality_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(val_X[low_cardinality_cols]))
OH_cols_test = pd.DataFrame(OH_encoder.transform(X_test[low_cardinality_cols]))
OH_cols_train.columns = OH_encoder.get_feature_names_out()
OH_cols_valid.columns = OH_encoder.get_feature_names_out()
OH_cols_test.columns = OH_encoder.get_feature_names_out()

# One-hot encoding removed index; put it back
OH_cols_train.index = train_X.index
OH_cols_valid.index = val_X.index
OH_cols_test.index = X_test.index

OH_cols_train[list(OH_cols_train.columns)] = OH_cols_train[list(OH_cols_train.columns)].astype(int)
OH_cols_valid[list(OH_cols_valid.columns)] = OH_cols_valid[list(OH_cols_valid.columns)].astype(int)
OH_cols_test[list(OH_cols_test.columns)] = OH_cols_test[list(OH_cols_test.columns)].astype(int)

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = train_X.drop(low_cardinality_cols, axis=1)
num_X_valid = val_X.drop(low_cardinality_cols, axis=1)
num_X_test = X_test.drop(low_cardinality_cols, axis=1)

OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)

# Problème de string
OH_X_train.drop(['TransactionStartTime'], inplace=True, axis=1)
OH_X_valid.drop(['TransactionStartTime'], inplace=True, axis=1)
OH_X_test.drop(['TransactionStartTime'], inplace=True, axis=1)
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)
OH_X_test.columns = OH_X_test.columns.astype(str)

train_X = OH_X_train
val_X = OH_X_valid
test_X = OH_X_test
X_entier = pd.concat([train_X, val_X])

mi_scores = make_mi_scores(X_entier , y)

verylowMI = list(mi_scores[mi_scores<0.0001].index)
lowMI = list(mi_scores[(mi_scores>=0.0001)&(mi_scores<0.001)].index)
mediumMI = list(mi_scores[(mi_scores>=0.001)&(mi_scores<0.01)].index)
highMI = list(mi_scores[mi_scores>=0.01].index)

### 2.1 Delecte the very low MI columns

In [67]:
train_X.drop(verylowMI, inplace=True, axis=1, errors='ignore')
val_X.drop(verylowMI, inplace=True, axis=1, errors='ignore')
test_X.drop(verylowMI, inplace=True, axis=1, errors='ignore')
X_entier.drop(verylowMI, inplace=True, axis=1, errors='ignore')

withoutverylowMI = comparemodels(train_X, train_y, val_X, test_X, "Delete very low MI")
withoutverylowMI["PublicScore"] = [0.666666666, 0.666666666, 0.711864406, 0.654545454, 0.595744680, 0.642857142, 0.642857142]
withoutverylowMI["PrivateScore"] = [0.661016949, 0.649122807, 0.715447154, 0.666666666, 0.647619047, 0.620689655, 0.620689655]
withoutverylowMI

done
done
done
done
done
done
done


Unnamed: 0,Model,Description,Date,Precision,Recall,F1-score,LogLoss,Mcc,PublicScore,PrivateScore,OldPublicScore,OldPrivateScore,MeanOurMetrics
0,DecisionTreeClassifier,Delete very low MImax_leaf_nodes : 6 + with date columns,2023-05-07 16:30:13.941221,0.942857,0.825,0.88,0.013564,0.88178,0.666667,0.661017,"[0.666666666, 0.666666666, 0.006575486]","[0.661016949, 0.661016949, 0.006863327]",0.882409
1,Random Forest Classifier,Delete very low MIn_estimators : 36 + Entropy,2023-05-07 16:30:15.122144,0.782609,0.9,0.837209,0.021099,0.838969,0.666667,0.649123,"[0.69090909, 0.666666666, 0.007513148]","[0.649122807, 0.649122807, 0.007794933]",0.839697
2,XGBClassifier,Delete very low MIscale_pos_weight = 22,2023-05-07 16:30:16.312527,0.860465,0.925,0.891566,0.013564,0.891963,0.711864,0.715447,"[0.677419354, 0.711864406, 0.092764378]","[0.676923076, 0.715447154, 0.096564531]",0.892249
3,Random Forest ClassifierWeighted500,Delete very low MIn_estimators : 36 + Entropy,2023-05-07 16:30:17.497220,0.76087,0.875,0.813953,0.024113,0.815615,0.654545,0.666667,[0.678571428],[0.666666666],0.816359
4,Random Forest ClassifierWeighted22,Delete very low MIn_estimators : 36 + Entropy,2023-05-07 16:30:18.625953,0.755556,0.85,0.8,0.025621,0.801037,0.595745,0.647619,[0.703703703],[0.654867256],0.801648
5,RandomForestClassifierUpperSampling,"Delete very low MIupsampled, n_estimators=36, criterion = entropy",2023-05-07 16:30:21.059364,0.708333,0.85,0.772727,0.030142,0.775535,0.642857,0.62069,"[0.678571428, 0.642857142]","[0.637168141, 0.620689655]",0.776649
6,RandomForestClassifierUnderSampling,"Delete very low MIundersampled, n_estimators=36, criterion = entropy",2023-05-07 16:30:23.314014,0.708333,0.85,0.772727,0.030142,0.775535,0.642857,0.62069,"[0.549019607, 0.642857142]","[0.48076923, 0.620689655]",0.776649


### 2.2 Delete low MI

In [68]:
train_X.drop(verylowMI, inplace=True, axis=1, errors='ignore')
val_X.drop(verylowMI, inplace=True, axis=1, errors='ignore')
test_X.drop(verylowMI, inplace=True, axis=1, errors='ignore')
X_entier.drop(verylowMI, inplace=True, axis=1, errors='ignore')
train_X.drop(lowMI, inplace=True, axis=1, errors='ignore')
val_X.drop(lowMI, inplace=True, axis=1, errors='ignore')
test_X.drop(lowMI, inplace=True, axis=1, errors='ignore')
X_entier.drop(lowMI, inplace=True, axis=1, errors='ignore')

withoutlowMI = comparemodels(train_X, train_y, val_X, test_X, "Delete low MI")
withoutlowMI["PublicScore"] = [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]
withoutlowMI["PrivateScore"] = [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]
withoutlowMI

done
done
done
done
done
done
done


Unnamed: 0,Model,Description,Date,Precision,Recall,F1-score,LogLoss,Mcc,PublicScore,PrivateScore,OldPublicScore,OldPrivateScore,MeanOurMetrics
0,DecisionTreeClassifier,Delete low MImax_leaf_nodes : 6 + with date columns,2023-05-07 16:30:23.482611,0.5,0.05,0.090909,0.060284,0.15773,,,"[0.666666666, 0.666666666, 0.006575486]","[0.661016949, 0.661016949, 0.006863327]",0.19966
1,Random Forest Classifier,Delete low MIn_estimators : 36 + Entropy,2023-05-07 16:30:24.429001,0.615385,0.4,0.484848,0.051241,0.495474,,,"[0.69090909, 0.666666666, 0.007513148]","[0.649122807, 0.649122807, 0.007794933]",0.498927
2,XGBClassifier,Delete low MIscale_pos_weight = 22,2023-05-07 16:30:25.364763,0.391304,0.675,0.495413,0.08289,0.512908,,,"[0.677419354, 0.711864406, 0.092764378]","[0.676923076, 0.715447154, 0.096564531]",0.518656
3,Random Forest ClassifierWeighted500,Delete low MIn_estimators : 36 + Entropy,2023-05-07 16:30:26.329033,0.62963,0.425,0.507463,0.049734,0.516643,,,[0.678571428],[0.666666666],0.519684
4,Random Forest ClassifierWeighted22,Delete low MIn_estimators : 36 + Entropy,2023-05-07 16:30:27.271725,0.653846,0.425,0.515152,0.048227,0.526526,,,[0.703703703],[0.654867256],0.530131
5,RandomForestClassifierUpperSampling,"Delete low MIupsampled, n_estimators=36, criterion = entropy",2023-05-07 16:30:29.422086,0.358209,0.6,0.448598,0.088919,0.46247,,,"[0.678571428, 0.642857142]","[0.637168141, 0.620689655]",0.467319
6,RandomForestClassifierUnderSampling,"Delete low MIundersampled, n_estimators=36, criterion = entropy",2023-05-07 16:30:31.531002,0.358209,0.6,0.448598,0.088919,0.46247,,,"[0.549019607, 0.642857142]","[0.48076923, 0.620689655]",0.467319


### 2.3 Delete medium MI

In [69]:
train_X.drop(verylowMI, inplace=True, axis=1, errors='ignore')
val_X.drop(verylowMI, inplace=True, axis=1, errors='ignore')
test_X.drop(verylowMI, inplace=True, axis=1, errors='ignore')
X_entier.drop(verylowMI, inplace=True, axis=1, errors='ignore')
train_X.drop(lowMI, inplace=True, axis=1, errors='ignore')
val_X.drop(lowMI, inplace=True, axis=1, errors='ignore')
test_X.drop(lowMI, inplace=True, axis=1, errors='ignore')
X_entier.drop(lowMI, inplace=True, axis=1, errors='ignore')

train_X.drop(mediumMI, inplace=True, axis=1, errors='ignore')
val_X.drop(mediumMI, inplace=True, axis=1, errors='ignore')
test_X.drop(mediumMI, inplace=True, axis=1, errors='ignore')
X_entier.drop(mediumMI, inplace=True, axis=1, errors='ignore')

withoutmediumMI = comparemodels(train_X, train_y, val_X, test_X, "Delete medium MI")
withoutmediumMI["PublicScore"] = [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]
withoutmediumMI["PrivateScore"] = [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]
withoutmediumMI

done


  metric = tp / (tp+fp)
  precision = tp / (tp+fp)
  metric = tp / (tp+fp)
  precision = tp / (tp+fp)


done
done


  metric = tp / (tp+fp)
  precision = tp / (tp+fp)


done


  metric = tp / (tp+fp)
  precision = tp / (tp+fp)


done
done
done


Unnamed: 0,Model,Description,Date,Precision,Recall,F1-score,LogLoss,Mcc,PublicScore,PrivateScore,OldPublicScore,OldPrivateScore,MeanOurMetrics
0,DecisionTreeClassifier,Delete medium MImax_leaf_nodes : 6 + with date columns,2023-05-07 16:30:31.663343,,0.0,,0.060284,0.0,,,"[0.666666666, 0.666666666, 0.006575486]","[0.661016949, 0.661016949, 0.006863327]",0.0
1,Random Forest Classifier,Delete medium MIn_estimators : 36 + Entropy,2023-05-07 16:30:32.095585,,0.0,,0.060284,0.0,,,"[0.69090909, 0.666666666, 0.007513148]","[0.649122807, 0.649122807, 0.007794933]",0.0
2,XGBClassifier,Delete medium MIscale_pos_weight = 22,2023-05-07 16:30:32.543177,0.152174,0.175,0.162791,0.108511,0.161685,,,"[0.677419354, 0.711864406, 0.092764378]","[0.676923076, 0.715447154, 0.096564531]",0.162913
3,Random Forest ClassifierWeighted500,Delete medium MIn_estimators : 36 + Entropy,2023-05-07 16:30:33.010695,,0.0,,0.060284,0.0,,,[0.678571428],[0.666666666],0.0
4,Random Forest ClassifierWeighted22,Delete medium MIn_estimators : 36 + Entropy,2023-05-07 16:30:33.443986,,0.0,,0.060284,0.0,,,[0.703703703],[0.654867256],0.0
5,RandomForestClassifierUpperSampling,"Delete medium MIupsampled, n_estimators=36, criterion = entropy",2023-05-07 16:30:34.208445,0.010398,0.85,0.020544,4.885998,0.084977,,,"[0.678571428, 0.642857142]","[0.637168141, 0.620689655]",0.24148
6,RandomForestClassifierUnderSampling,"Delete medium MIundersampled, n_estimators=36, criterion = entropy",2023-05-07 16:30:34.949979,0.010398,0.85,0.020544,4.885998,0.084977,,,"[0.549019607, 0.642857142]","[0.48076923, 0.620689655]",0.24148


## 3 Undersampling with K-means

In [70]:
from sklearn.cluster import KMeans 

# K-means fait sur training set entier (et ensuite sur seuelement le train_X)
def_feature = pd.read_csv("input/Xente_Variable_Definitions.csv")
raw_data = pd.read_csv("input/training.csv")
X_test = pd.read_csv("input/test.csv")
sample_submission = pd.read_csv("input/sample_submission.csv")

#attribut initialization
cols_unique_value = [] #Will be droped
for col in raw_data.columns : 
    if len(raw_data[col].unique()) == 1 :
        cols_unique_value.append(col)

medium_cardianlity_cols = ["ProductId"]

#Data transformation
raw_data['TransactionStartTime'] = pd.to_datetime(raw_data['TransactionStartTime'])
X_test['TransactionStartTime'] = pd.to_datetime(X_test['TransactionStartTime'])

#Data cleaning
data = raw_data.copy()
data = data.dropna(axis=0) #Drop observations/rows with missing values
X_test=X_test.dropna(axis=0)
data.drop(cols_unique_value, axis=1, inplace=True)
X_test.drop(cols_unique_value, axis=1, inplace=True)

#Prepare index for the submission
index_val = list(X_test.TransactionId.values.tolist())

#Set the df index to the Transction Id
data = transactioId_to_index(data)
X_test = transactioId_to_index(X_test)

#Adding data
data = adding_date_col(data, 'TransactionStartTime')
X_test = adding_date_col(X_test, 'TransactionStartTime')


# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
low_cardinality_cols = [cname for cname in data.columns if data[cname].nunique() < 15 and 
                        data[cname].dtype == "object"]
low_cardinality_cols.append('PricingStrategy')

#transform columns BatchId_54 to number
    #transform columns BatchId_54 to number
id_cols = data.filter(like="Id").columns.tolist()
data[id_cols] = data[id_cols].astype(str).apply(lambda x: x.str.replace(x.name + "_", "")).astype(int)
X_test[id_cols] = X_test[id_cols].astype(str).apply(lambda x: x.str.replace(x.name + "_", "")).astype(int)

data['PricingStrategy'] = data['PricingStrategy'].astype('int')
X_test['PricingStrategy'] = X_test['PricingStrategy'].astype('int')
data['Value'] = data['Value'].astype('float')
X_test['Value'] = X_test['Value'].astype('float')

X_test['PricingStrategy'] = X_test['PricingStrategy'].astype('str')
data['PricingStrategy'] = data['PricingStrategy'].astype('str')

#Data splitting
y = data.FraudResult #The target label
X = data.copy()
X.drop(['FraudResult'], axis=1, inplace=True) #Only the features data

OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
OH_cols_X = pd.DataFrame(OH_encoder.fit_transform(X[low_cardinality_cols]))
OH_cols_X.columns = OH_encoder.get_feature_names_out()
OH_cols_test = pd.DataFrame(OH_encoder.transform(X_test[low_cardinality_cols]))
OH_cols_test.columns = OH_encoder.get_feature_names_out()
# One-hot encoding removed index; put it back
OH_cols_X.index = X.index
OH_cols_test.index = X_test.index

OH_cols_X[list(OH_cols_X.columns)] = OH_cols_X[list(OH_cols_X.columns)].astype(int)
OH_cols_test[list(OH_cols_test.columns)] = OH_cols_test[list(OH_cols_test.columns)].astype(int)

# Remove categorical columns (will replace with one-hot encoding)
num_X = X.drop(low_cardinality_cols, axis=1)
num_X_test = X_test.drop(low_cardinality_cols, axis=1)

OH_X = pd.concat([num_X, OH_cols_X], axis=1)
OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)

# Problème de string
OH_X.drop(['TransactionStartTime'], inplace=True, axis=1)
OH_X_test.drop(['TransactionStartTime'], inplace=True, axis=1)
OH_X.columns = OH_X.columns.astype(str)
OH_X_test.columns = OH_X_test.columns.astype(str)

X = OH_X

#####K-Means
X_0 = X[y == 0] # instances avec FraudResult=0
X_1 = X[y == 1] # instances avec FraudResult=1

# Créer des "centres" pour KMeans
n_clusters = 193
kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(X_0)
centers = kmeans.cluster_centers_

# Trouver les indices des instances à garder dans la classe FraudResult=0
labels = kmeans.predict(X_0)
distances = np.sum((X_0 - centers[labels]) ** 2, axis=1)
keep_indices = np.argsort(distances)[:n_clusters]

df_filtré = X_0[X_0.index.isin(list(keep_indices.index))]

# Combiner les instances sous-échantillonnées
X_undersampled = pd.concat([df_filtré, X_1])
y_undersampled = np.concatenate((np.zeros(n_clusters), np.ones(len(X_1))), axis=0)


train_X, val_X, train_y, val_y = train_test_split(X_undersampled, y_undersampled, random_state = 0)


#train_X.drop(verylowMI, inplace=True, axis=1)
#val_X.drop(verylowMI, inplace=True, axis=1)
#test_X.drop(verylowMI, inplace=True, axis=1)
#X_entier.drop(verylowMI, inplace=True, axis=1)

kmeansundersampling = comparemodels(train_X, train_y, val_X, OH_X_test, "Kmeans UnderSampling", unbalanced = False)
kmeansundersampling["PublicScore"] = [0.006575486, 0.007513148, 0.092764378]
kmeansundersampling["PrivateScore"] = [0.006863327, 0.007794933, 0.096564531] ###Cest trop nulle 
kmeansundersampling



done
done
done


Unnamed: 0,Model,Description,Date,Precision,Recall,F1-score,LogLoss,Mcc,PublicScore,PrivateScore,OldPublicScore,OldPrivateScore,MeanOurMetrics
0,DecisionTreeClassifier,Kmeans UnderSamplingmax_leaf_nodes : 6 + with date columns,2023-05-07 16:31:00.117654,0.980392,1.0,0.990099,0.371584,0.979557,0.006575,0.006863,"[0.666666666, 0.666666666, 0.006575486]","[0.661016949, 0.661016949, 0.006863327]",0.987512
1,Random Forest Classifier,Kmeans UnderSamplingn_estimators : 36 + Entropy,2023-05-07 16:31:00.285578,0.979592,0.96,0.969697,1.114752,0.938324,0.007513,0.007795,"[0.69090909, 0.666666666, 0.007513148]","[0.649122807, 0.649122807, 0.007794933]",0.961903
2,XGBClassifier,Kmeans UnderSamplingon balanced data,2023-05-07 16:31:00.404309,0.979592,0.96,0.969697,1.114752,0.938324,0.092764,0.096565,"[0.677419354, 0.711864406, 0.092764378]","[0.676923076, 0.715447154, 0.096564531]",0.961903


## 4. SMOTE (Synthetic Minority Oversampling Technique)

In [71]:
# SMOTE
def_feature = pd.read_csv("input/Xente_Variable_Definitions.csv")
raw_data = pd.read_csv("input/training.csv")
X_test = pd.read_csv("input/test.csv")
sample_submission = pd.read_csv("input/sample_submission.csv")

#attribut initialization
cols_unique_value = [] #Will be droped
for col in raw_data.columns : 
    if len(raw_data[col].unique()) == 1 :
        cols_unique_value.append(col)

medium_cardianlity_cols = ["ProductId"]

#Data transformation
raw_data['TransactionStartTime'] = pd.to_datetime(raw_data['TransactionStartTime'])
X_test['TransactionStartTime'] = pd.to_datetime(X_test['TransactionStartTime'])

#Data cleaning
data = raw_data.copy()
data = data.dropna(axis=0) #Drop observations/rows with missing values
X_test=X_test.dropna(axis=0)
data.drop(cols_unique_value, axis=1, inplace=True)
X_test.drop(cols_unique_value, axis=1, inplace=True)

#Prepare index for the submission
index_val = list(X_test.TransactionId.values.tolist())

#Set the df index to the Transction Id
data = transactioId_to_index(data)
X_test = transactioId_to_index(X_test)

#Adding data
data = adding_date_col(data, 'TransactionStartTime')
X_test = adding_date_col(X_test, 'TransactionStartTime')


# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
low_cardinality_cols = [cname for cname in data.columns if data[cname].nunique() < 15 and 
                        data[cname].dtype == "object"]
low_cardinality_cols.append('PricingStrategy')

#transform columns BatchId_54 to number
    #transform columns BatchId_54 to number
id_cols = data.filter(like="Id").columns.tolist()
data[id_cols] = data[id_cols].astype(str).apply(lambda x: x.str.replace(x.name + "_", "")).astype(int)
X_test[id_cols] = X_test[id_cols].astype(str).apply(lambda x: x.str.replace(x.name + "_", "")).astype(int)

data['PricingStrategy'] = data['PricingStrategy'].astype('int')
X_test['PricingStrategy'] = X_test['PricingStrategy'].astype('int')
data['Value'] = data['Value'].astype('float')
X_test['Value'] = X_test['Value'].astype('float')

X_test['PricingStrategy'] = X_test['PricingStrategy'].astype('str')
data['PricingStrategy'] = data['PricingStrategy'].astype('str')

#Data splitting
y = data.FraudResult #The target label
X = data.copy()
X.drop(['FraudResult'], axis=1, inplace=True) #Only the features data
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)

cat_cols = [col for col in X.columns if X[col].dtype == "object"]#liste of obejct columns
cat_cols.append("PricingStrategy")#pcq mm si c'est un chiffre il faut le considérer comme une catégorie


OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(train_X[low_cardinality_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(val_X[low_cardinality_cols]))
OH_cols_test = pd.DataFrame(OH_encoder.transform(X_test[low_cardinality_cols]))
OH_cols_train.columns = OH_encoder.get_feature_names_out()
OH_cols_valid.columns = OH_encoder.get_feature_names_out()
OH_cols_test.columns = OH_encoder.get_feature_names_out()

# One-hot encoding removed index; put it back
OH_cols_train.index = train_X.index
OH_cols_valid.index = val_X.index
OH_cols_test.index = X_test.index

OH_cols_train[list(OH_cols_train.columns)] = OH_cols_train[list(OH_cols_train.columns)].astype(int)
OH_cols_valid[list(OH_cols_valid.columns)] = OH_cols_valid[list(OH_cols_valid.columns)].astype(int)
OH_cols_test[list(OH_cols_test.columns)] = OH_cols_test[list(OH_cols_test.columns)].astype(int)

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = train_X.drop(low_cardinality_cols, axis=1)
num_X_valid = val_X.drop(low_cardinality_cols, axis=1)
num_X_test = X_test.drop(low_cardinality_cols, axis=1)

OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)

# Problème de string
OH_X_train.drop(['TransactionStartTime'], inplace=True, axis=1)
OH_X_valid.drop(['TransactionStartTime'], inplace=True, axis=1)
OH_X_test.drop(['TransactionStartTime'], inplace=True, axis=1)
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)
OH_X_test.columns = OH_X_test.columns.astype(str)

train_X = OH_X_train
val_X = OH_X_valid
test_X = OH_X_test
X_entier = pd.concat([train_X, val_X])

sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(train_X, train_y)

mi_scores = make_mi_scores(X_entier, y)

verylowMI = list(mi_scores[mi_scores<0.0001].index)
lowMI = list(mi_scores[(mi_scores>=0.0001)&(mi_scores<0.001)].index)
mediumMI = list(mi_scores[(mi_scores>=0.001)&(mi_scores<0.01)].index)
highMI = list(mi_scores[mi_scores>=0.01].index)


#train_X.drop(verylowMI, inplace=True, axis=1)
#val_X.drop(verylowMI, inplace=True, axis=1)
#test_X.drop(verylowMI, inplace=True, axis=1)
#X_entier.drop(verylowMI, inplace=True, axis=1)

SMOTERes = comparemodels(X_res, y_res, val_X, test_X, "SMOTE", unbalanced = False)
SMOTERes["PublicScore"] = [np.nan, 0.709677419, 0.358974358] #ici voit que le RandomForest est mieux
SMOTERes["PrivateScore"] = [np.nan, 0.686567164, 0.376068376]  
SMOTERes

done
done
done


Unnamed: 0,Model,Description,Date,Precision,Recall,F1-score,LogLoss,Mcc,PublicScore,PrivateScore,OldPublicScore,OldPrivateScore,MeanOurMetrics
0,DecisionTreeClassifier,SMOTEmax_leaf_nodes : 6 + with date columns,2023-05-07 16:31:18.853308,0.113372,0.975,0.203125,0.461171,0.330225,,,"[0.666666666, 0.666666666, 0.006575486]","[0.661016949, 0.661016949, 0.006863327]",0.405431
1,Random Forest Classifier,SMOTEn_estimators : 36 + Entropy,2023-05-07 16:31:22.707671,0.72,0.9,0.8,0.027128,0.804629,0.709677,0.686567,"[0.69090909, 0.666666666, 0.007513148]","[0.649122807, 0.649122807, 0.007794933]",0.806157
2,XGBClassifier,SMOTEon balanced data,2023-05-07 16:31:25.958940,0.804348,0.925,0.860465,0.018085,0.862324,0.358974,0.376068,"[0.677419354, 0.711864406, 0.092764378]","[0.676923076, 0.715447154, 0.096564531]",0.863034


# Compare result

In [77]:
dfResult = pd.concat([referenceresult, withoutverylowMI, withoutlowMI, withoutmediumMI, kmeansundersampling, SMOTERes])
dfResult

Unnamed: 0,Model,Description,Date,Precision,Recall,F1-score,LogLoss,Mcc,PublicScore,PrivateScore,OldPublicScore,OldPrivateScore,MeanOurMetrics
0,DecisionTreeClassifier,The referencemax_leaf_nodes : 6 + with date columns,2023-05-07 16:29:46.815175,0.891892,0.825,0.857143,0.016578,0.857566,0.666667,0.661017,"[0.666666666, 0.666666666, 0.006575486]","[0.661016949, 0.661016949, 0.006863327]",0.8579
1,Random Forest Classifier,The referencen_estimators : 36 + Entropy,2023-05-07 16:29:47.996356,0.813953,0.875,0.843373,0.019592,0.843655,0.690909,0.649123,"[0.69090909, 0.666666666, 0.007513148]","[0.649122807, 0.649122807, 0.007794933]",0.843995
2,XGBClassifier,The referencescale_pos_weight = 22,2023-05-07 16:29:49.487206,0.822222,0.925,0.870588,0.016578,0.871874,0.677419,0.676923,"[0.677419354, 0.711864406, 0.092764378]","[0.676923076, 0.715447154, 0.096564531]",0.872421
3,Random Forest ClassifierWeighted500,The referencen_estimators : 36 + Entropy,2023-05-07 16:29:50.689128,0.804348,0.925,0.860465,0.018085,0.862324,0.678571,0.666667,[0.678571428],[0.666666666],0.863034
4,Random Forest ClassifierWeighted22,The referencen_estimators : 36 + Entropy,2023-05-07 16:29:51.854001,0.804348,0.925,0.860465,0.018085,0.862324,0.703704,0.654867,[0.703703703],[0.654867256],0.863034
5,RandomForestClassifierUpperSampling,"The referenceupsampled, n_estimators=36, criterion = entropy",2023-05-07 16:29:54.212174,0.777778,0.875,0.823529,0.022606,0.824649,0.678571,0.637168,"[0.678571428, 0.642857142]","[0.637168141, 0.620689655]",0.825239
6,RandomForestClassifierUnderSampling,"The referenceundersampled, n_estimators=36, criterion = entropy",2023-05-07 16:29:56.454271,0.777778,0.875,0.823529,0.022606,0.824649,0.54902,0.480769,"[0.549019607, 0.642857142]","[0.48076923, 0.620689655]",0.825239
0,DecisionTreeClassifier,Delete very low MImax_leaf_nodes : 6 + with date columns,2023-05-07 16:30:13.941221,0.942857,0.825,0.88,0.013564,0.88178,0.666667,0.661017,"[0.666666666, 0.666666666, 0.006575486]","[0.661016949, 0.661016949, 0.006863327]",0.882409
1,Random Forest Classifier,Delete very low MIn_estimators : 36 + Entropy,2023-05-07 16:30:15.122144,0.782609,0.9,0.837209,0.021099,0.838969,0.666667,0.649123,"[0.69090909, 0.666666666, 0.007513148]","[0.649122807, 0.649122807, 0.007794933]",0.839697
2,XGBClassifier,Delete very low MIscale_pos_weight = 22,2023-05-07 16:30:16.312527,0.860465,0.925,0.891566,0.013564,0.891963,0.711864,0.715447,"[0.677419354, 0.711864406, 0.092764378]","[0.676923076, 0.715447154, 0.096564531]",0.892249


In [80]:
dfResult.groupby("Model").max()

Unnamed: 0_level_0,Description,Date,Precision,Recall,F1-score,LogLoss,Mcc,PublicScore,PrivateScore,OldPublicScore,OldPrivateScore,MeanOurMetrics
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
DecisionTreeClassifier,The referencemax_leaf_nodes : 6 + with date columns,2023-05-07 16:31:18.853308,0.980392,1.0,0.990099,0.461171,0.979557,0.666667,0.661017,"[0.666666666, 0.666666666, 0.006575486]","[0.661016949, 0.661016949, 0.006863327]",0.987512
Random Forest Classifier,The referencen_estimators : 36 + Entropy,2023-05-07 16:31:22.707671,0.979592,0.96,0.969697,1.114752,0.938324,0.709677,0.686567,"[0.69090909, 0.666666666, 0.007513148]","[0.649122807, 0.649122807, 0.007794933]",0.961903
Random Forest ClassifierWeighted22,The referencen_estimators : 36 + Entropy,2023-05-07 16:30:33.443986,0.804348,0.925,0.860465,0.060284,0.862324,0.703704,0.654867,[0.703703703],[0.654867256],0.863034
Random Forest ClassifierWeighted500,The referencen_estimators : 36 + Entropy,2023-05-07 16:30:33.010695,0.804348,0.925,0.860465,0.060284,0.862324,0.678571,0.666667,[0.678571428],[0.666666666],0.863034
RandomForestClassifierUnderSampling,"The referenceundersampled, n_estimators=36, criterion = entropy",2023-05-07 16:30:34.949979,0.777778,0.875,0.823529,4.885998,0.824649,0.642857,0.62069,"[0.549019607, 0.642857142]","[0.48076923, 0.620689655]",0.825239
RandomForestClassifierUpperSampling,"The referenceupsampled, n_estimators=36, criterion = entropy",2023-05-07 16:30:34.208445,0.777778,0.875,0.823529,4.885998,0.824649,0.678571,0.637168,"[0.678571428, 0.642857142]","[0.637168141, 0.620689655]",0.825239
XGBClassifier,The referencescale_pos_weight = 22,2023-05-07 16:31:25.958940,0.979592,0.96,0.969697,1.114752,0.938324,0.711864,0.715447,"[0.677419354, 0.711864406, 0.092764378]","[0.676923076, 0.715447154, 0.096564531]",0.961903


In [85]:
dfResult[["Description"]].loc[[dfResult['PublicScore'] == dfResult.groupby(["Model"])['PublicScore'].max()]]

ValueError: Can only compare identically-labeled Series objects

# Sandbox

In [73]:
dfresult  = pd.read_csv("output/resultatsTestEverything.csv")
dfresult

Unnamed: 0,Model,Description,Date,Precision,Recall,F1-score,LogLoss,Mcc,PublicScore,PrivateScore,OldPublicScore,OldPrivateScore,MeanOurMetrics
0,Decision Tree Classifier,La basemax_leaf_nodes : 6 + with date columns,2023-05-06 20:37:01.701978,0.891892,0.825,0.857143,0.016578,0.857566,,,[0.666666666],[0.661016949],0.857900
1,Random Forest Classifier,La basen_estimators : 36 + Entropy,2023-05-06 20:37:02.891559,0.800000,0.900,0.847059,0.019592,0.848262,,,[0.69090909],[0.649122807],0.848830
2,XGBClassifier,La basescale_pos_weight = 22,2023-05-06 20:37:04.316049,0.840909,0.925,0.880952,0.015071,0.881747,,,[0.677419354],[0.676923076],0.882152
3,RandomForestClassifierUpperSampling,"La baseupsampled, n_estimators=36, criterion = entropy",2023-05-06 20:37:06.774426,0.765957,0.900,0.827586,0.022606,0.829975,,,[0.678571428],[0.637168141],0.830880
4,RandomForestClassifierUnderSampling,"La baseundersampled, n_estimators=36, criterion = entropy",2023-05-06 20:37:09.025865,0.765957,0.900,0.827586,0.022606,0.829975,,,[0.549019607],[0.48076923],0.830880
...,...,...,...,...,...,...,...,...,...,...,...,...,...
248,Random Forest Classifier,Kmeans UnderSamplingn_estimators : 36 + Entropy,2023-05-07 16:31:00.285578,0.979592,0.960,0.969697,1.114752,0.938324,,,"[0.69090909, 0.666666666, 0.007513148]","[0.649122807, 0.649122807, 0.007794933]",0.961903
249,XGBClassifier,Kmeans UnderSamplingon balanced data,2023-05-07 16:31:00.404309,0.979592,0.960,0.969697,1.114752,0.938324,,,"[0.677419354, 0.711864406, 0.092764378]","[0.676923076, 0.715447154, 0.096564531]",0.961903
250,DecisionTreeClassifier,SMOTEmax_leaf_nodes : 6 + with date columns,2023-05-07 16:31:18.853308,0.113372,0.975,0.203125,0.461171,0.330225,,,"[0.666666666, 0.666666666, 0.006575486]","[0.661016949, 0.661016949, 0.006863327]",0.405431
251,Random Forest Classifier,SMOTEn_estimators : 36 + Entropy,2023-05-07 16:31:22.707671,0.720000,0.900,0.800000,0.027128,0.804629,,,"[0.69090909, 0.666666666, 0.007513148]","[0.649122807, 0.649122807, 0.007794933]",0.806157


In [74]:
print("Les meilleurs résultat")
dfresult.groupby(['Model'])['Precision','Recall','F1-score','LogLoss','MeanOurMetrics'].max()

Les meilleurs résultat


  dfresult.groupby(['Model'])['Precision','Recall','F1-score','LogLoss','MeanOurMetrics'].max()


Unnamed: 0_level_0,Precision,Recall,F1-score,LogLoss,MeanOurMetrics
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Decision Tree Classifier,0.942857,0.825,0.88,0.060284,0.882409
DecisionTreeClassifier,0.980392,1.0,0.990099,0.461171,0.987512
Random Forest Classifier,0.979592,0.96,0.969697,1.114752,0.961903
Random Forest ClassifierWeighted,0.804348,0.925,0.860465,0.018085,0.863034
Random Forest ClassifierWeighted22,0.823529,0.925,0.860465,0.060284,0.863034
Random Forest ClassifierWeighted500,0.804348,0.925,0.860465,0.060284,0.863034
RandomForestClassifierUnderSampling,0.782609,0.9,0.837209,4.923675,0.839697
RandomForestClassifierUpperSampling,0.782609,0.9,0.837209,4.923675,0.839697
XGBClassifier,0.979592,0.96,0.969697,1.114752,0.961903


In [75]:
print("Les pire résultat")
dfresult.groupby(['Model'])['Precision','Recall','F1-score','LogLoss','MeanOurMetrics'].min()

Les pire résultat


  dfresult.groupby(['Model'])['Precision','Recall','F1-score','LogLoss','MeanOurMetrics'].min()


Unnamed: 0_level_0,Precision,Recall,F1-score,LogLoss,MeanOurMetrics
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Decision Tree Classifier,0.5,0.0,0.090909,0.013564,0.0
DecisionTreeClassifier,0.113372,0.0,0.090909,0.013564,0.0
Random Forest Classifier,0.566667,0.0,0.484848,0.019592,0.0
Random Forest ClassifierWeighted,0.804348,0.925,0.860465,0.018085,0.863034
Random Forest ClassifierWeighted22,0.653846,0.0,0.491228,0.018085,0.0
Random Forest ClassifierWeighted500,0.62963,0.0,0.482759,0.018085,0.0
RandomForestClassifierUnderSampling,0.010398,0.4,0.020544,0.021099,0.24148
RandomForestClassifierUpperSampling,0.010398,0.4,0.020544,0.021099,0.24148
XGBClassifier,0.152174,0.175,0.162791,0.013564,0.162913


In [76]:
y_undersampled

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.