In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import resample
import datetime as dt
from math import ceil, sqrt
from Utils import *
pd.set_option('max_colwidth', None)
from EvaluationMetric import *
from xgboost import XGBClassifier

In [5]:
def comparemodels(train_X, train_y, val_X, test_X, generaldescription, unbalanced = True):
    res = []

    model1 = DecisionTreeClassifier(max_leaf_nodes= 6, random_state=1)
    model1.fit(train_X, train_y)
    preds_val1 = model1.predict(val_X)
    model_test1 = model1.predict(test_X)
    description = generaldescription+"max_leaf_nodes : 6 + with date columns"
    metrics1 = listmetrics(val_y, preds_val1, "DecisionTreeClassifier", description)
    m1Public = [0.666666666, 0.666666666]
    m1Private = [0.661016949, 0.661016949]
    res.append(metrics1)
    df1 = getscoreforcsv(index_val, model_test1, name_file = "TreeClass5leaf.csv")


    model2 = RandomForestClassifier(n_estimators=36, criterion = 'entropy', random_state=1)
    model2.fit(train_X, train_y)
    preds_val2 = model2.predict(val_X)
    model_test2 = model2.predict(test_X)
    description = generaldescription+"n_estimators : 36 + Entropy"
    metrics2 = listmetrics(val_y, preds_val2, "Random Forest Classifier", description)
    m2Public = [0.690909090, 0.666666666]
    m2Private = [0.649122807, 0.649122807]
    res.append(metrics2)
    df2 = getscoreforcsv(index_val, model_test2, name_file = "RandomForestN36Entropy.csv")


    ##################ICi donne un 0.71 de scrore !!! mtn comprendre pq en diminuant les feature si bon score mais que pour lui !############
    #Ne fonctionne pas bien quadn le training set est entier. 
    model2 = XGBClassifier(scale_pos_weight=22, random_state=1)
    model2.fit(train_X, train_y)
    preds_val2 = model2.predict(val_X)
    model_test2 = model2.predict(test_X)
    description = generaldescription+"scale_pos_weight = 22"
    metrics2 = listmetrics(val_y, preds_val2, "XGBClassifier", description)
    m3Public = [0.677419354, 0.711864406]
    m3Private = [0.676923076,  0.715447154]
    res.append(metrics2)
    df2 = getscoreforcsv(index_val, model_test2, name_file = "XGBClassifierW22.csv")

    #oversamplesimple
    X_con = pd.concat([train_X, train_y], axis=1) 
    not_fraud = X_con[X_con.FraudResult==0]
    fraud = X_con[X_con.FraudResult==1]
    fraud_upsampled = resample(fraud, replace=True, n_samples=len(not_fraud), # match number in majority class
                            random_state=1) # reproducible results
    upsampled = pd.concat([not_fraud, fraud_upsampled])
    train_y_over_sampled = upsampled.FraudResult
    train_X_over_sampled = upsampled.drop('FraudResult', axis=1)
    upsampledmodel = RandomForestClassifier(n_estimators=36, criterion = 'entropy', random_state=1)
    upsampledmodel.fit(train_X_over_sampled, train_y_over_sampled)
    upsampled_pred = upsampledmodel.predict(val_X)
    model_test = upsampledmodel.predict(OH_X_test)
    description = generaldescription+"upsampled, n_estimators=36, criterion = entropy"
    metrics = listmetrics(val_y, upsampled_pred, "RandomForestClassifierUpperSampling", description)
    m4Public = [0.678571428, 0.642857142]
    m4Private = [0.637168141, 0.620689655]
    res.append(metrics)
    df = getscoreforcsv(index_val, model_test, name_file = "RandomForestClassifierUpSample.csv")

    #Undersampling
    not_fraud_downsampled  = resample(not_fraud, replace=False, n_samples=len(fraud), # match number in minority  class
                            random_state=1) # reproducible results
    downsampled  = pd.concat([not_fraud_downsampled , fraud])
    train_y_undersampled = downsampled.FraudResult
    train_X_undersampled = downsampled.drop('FraudResult', axis=1)
    undersampled = RandomForestClassifier(n_estimators=36, criterion = 'entropy', random_state=1)
    undersampled.fit(train_X_over_sampled, train_y_over_sampled)
    undersampled_pred = undersampled.predict(val_X)
    model_test = undersampled.predict(OH_X_test)
    description = generaldescription+"undersampled, n_estimators=36, criterion = entropy"
    metrics = listmetrics(val_y, undersampled_pred, "RandomForestClassifierUnderSampling", description)
    m5Public = [0.549019607, 0.642857142]
    m5Private = [0.480769230, 0.620689655]
    res.append(metrics)
    df = getscoreforcsv(index_val, model_test, name_file = "RandomForestClassifierUndersampled.csv")


    dfres = listmetricsintodf(res)
    dfres["OldPublicScore"] = [m1Public, m2Public, m3Public, m4Public, m5Public]
    dfres["OldPrivateScore"] = [m1Private, m2Private, m3Private, m4Private, m5Private]
    dfres["MeanOurMetrics"] = dfres[['Precision', 'Recall','F1-score','Mcc']].mean(axis=1)
    dfres.to_csv('output/resultatsTestEverything.csv', mode='a',index=False,header = False) 
    return dfres

# Test de different model with different Data Here
## 1. The reference model

In [6]:
def_feature = pd.read_csv("input/Xente_Variable_Definitions.csv")
raw_data = pd.read_csv("input/training.csv")
X_test = pd.read_csv("input/test.csv")
sample_submission = pd.read_csv("input/sample_submission.csv")

#attribut initialization
cols_unique_value = [] #Will be droped
for col in raw_data.columns : 
    if len(raw_data[col].unique()) == 1 :
        cols_unique_value.append(col)

medium_cardianlity_cols = ["ProductId"]

#Data transformation
raw_data['TransactionStartTime'] = pd.to_datetime(raw_data['TransactionStartTime'])
X_test['TransactionStartTime'] = pd.to_datetime(X_test['TransactionStartTime'])

#Data cleaning
data = raw_data.copy()
data = data.dropna(axis=0) #Drop observations/rows with missing values
X_test=X_test.dropna(axis=0)
data.drop(cols_unique_value, axis=1, inplace=True)
X_test.drop(cols_unique_value, axis=1, inplace=True)

#Prepare index for the submission
index_val = list(X_test.TransactionId.values.tolist())

#Set the df index to the Transction Id
data = transactioId_to_index(data)
X_test = transactioId_to_index(X_test)

#Adding data
data = adding_date_col(data, 'TransactionStartTime')
X_test = adding_date_col(X_test, 'TransactionStartTime')


# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
low_cardinality_cols = [cname for cname in data.columns if data[cname].nunique() < 15 and 
                        data[cname].dtype == "object"]
low_cardinality_cols.append('PricingStrategy')

#transform columns BatchId_54 to number
    #transform columns BatchId_54 to number
id_cols = data.filter(like="Id").columns.tolist()
data[id_cols] = data[id_cols].astype(str).apply(lambda x: x.str.replace(x.name + "_", "")).astype(int)
X_test[id_cols] = X_test[id_cols].astype(str).apply(lambda x: x.str.replace(x.name + "_", "")).astype(int)

data['PricingStrategy'] = data['PricingStrategy'].astype('int')
X_test['PricingStrategy'] = X_test['PricingStrategy'].astype('int')
data['Value'] = data['Value'].astype('float')
X_test['Value'] = X_test['Value'].astype('float')

X_test['PricingStrategy'] = X_test['PricingStrategy'].astype('str')
data['PricingStrategy'] = data['PricingStrategy'].astype('str')

#Data splitting
y = data.FraudResult #The target label
X = data.copy()
X.drop(['FraudResult'], axis=1, inplace=True) #Only the features data
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)

cat_cols = [col for col in X.columns if X[col].dtype == "object"]#liste of obejct columns
cat_cols.append("PricingStrategy")#pcq mm si c'est un chiffre il faut le considérer comme une catégorie


OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(train_X[low_cardinality_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(val_X[low_cardinality_cols]))
OH_cols_test = pd.DataFrame(OH_encoder.transform(X_test[low_cardinality_cols]))
OH_cols_train.columns = OH_encoder.get_feature_names_out()
OH_cols_valid.columns = OH_encoder.get_feature_names_out()
OH_cols_test.columns = OH_encoder.get_feature_names_out()

# One-hot encoding removed index; put it back
OH_cols_train.index = train_X.index
OH_cols_valid.index = val_X.index
OH_cols_test.index = X_test.index

OH_cols_train[list(OH_cols_train.columns)] = OH_cols_train[list(OH_cols_train.columns)].astype(int)
OH_cols_valid[list(OH_cols_valid.columns)] = OH_cols_valid[list(OH_cols_valid.columns)].astype(int)
OH_cols_test[list(OH_cols_test.columns)] = OH_cols_test[list(OH_cols_test.columns)].astype(int)

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = train_X.drop(low_cardinality_cols, axis=1)
num_X_valid = val_X.drop(low_cardinality_cols, axis=1)
num_X_test = X_test.drop(low_cardinality_cols, axis=1)

OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)

# Problème de string
OH_X_train.drop(['TransactionStartTime'], inplace=True, axis=1)
OH_X_valid.drop(['TransactionStartTime'], inplace=True, axis=1)
OH_X_test.drop(['TransactionStartTime'], inplace=True, axis=1)
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)
OH_X_test.columns = OH_X_test.columns.astype(str)

train_X = OH_X_train
val_X = OH_X_valid
test_X = OH_X_test
X_entier = pd.concat([train_X, val_X])

referenceresult = comparemodels(train_X, train_y, val_X, test_X, "The reference")
referenceresult["PublicScore"] = [0.666666666, 0.690909090, 0.677419354, 0.678571428, 0.549019607]
referenceresult["PrivateScore"] = [0.661016949, 0.649122807, 0.676923076, 0.637168141, 0.480769230]
referenceresult

done
done
done
done
done


Unnamed: 0,Model,Description,Date,Precision,Recall,F1-score,LogLoss,Mcc,PublicScore,PrivateScore,OldPublicScore,OldPrivateScore,MeanOurMetrics
0,DecisionTreeClassifier,The referencemax_leaf_nodes : 6 + with date columns,2023-05-07 12:06:12.077756,0.891892,0.825,0.857143,0.016578,0.857566,0.666667,0.661017,"[0.666666666, 0.666666666]","[0.661016949, 0.661016949]",0.8579
1,Random Forest Classifier,The referencen_estimators : 36 + Entropy,2023-05-07 12:06:13.810833,0.813953,0.875,0.843373,0.019592,0.843655,0.690909,0.649123,"[0.69090909, 0.666666666]","[0.649122807, 0.649122807]",0.843995
2,XGBClassifier,The referencescale_pos_weight = 22,2023-05-07 12:06:16.406438,0.822222,0.925,0.870588,0.016578,0.871874,0.677419,0.676923,"[0.677419354, 0.711864406]","[0.676923076, 0.715447154]",0.872421
3,RandomForestClassifierUpperSampling,"The referenceupsampled, n_estimators=36, criterion = entropy",2023-05-07 12:06:20.249096,0.777778,0.875,0.823529,0.022606,0.824649,0.678571,0.637168,"[0.678571428, 0.642857142]","[0.637168141, 0.620689655]",0.825239
4,RandomForestClassifierUnderSampling,"The referenceundersampled, n_estimators=36, criterion = entropy",2023-05-07 12:06:23.944705,0.777778,0.875,0.823529,0.022606,0.824649,0.54902,0.480769,"[0.549019607, 0.642857142]","[0.48076923, 0.620689655]",0.825239


## 2. Play with deleting the low MI columns

In [7]:
# Supprimer les MI trop faible
def_feature = pd.read_csv("input/Xente_Variable_Definitions.csv")
raw_data = pd.read_csv("input/training.csv")
X_test = pd.read_csv("input/test.csv")
sample_submission = pd.read_csv("input/sample_submission.csv")

#attribut initialization
cols_unique_value = [] #Will be droped
for col in raw_data.columns : 
    if len(raw_data[col].unique()) == 1 :
        cols_unique_value.append(col)

medium_cardianlity_cols = ["ProductId"]

#Data transformation
raw_data['TransactionStartTime'] = pd.to_datetime(raw_data['TransactionStartTime'])
X_test['TransactionStartTime'] = pd.to_datetime(X_test['TransactionStartTime'])

#Data cleaning
data = raw_data.copy()
data = data.dropna(axis=0) #Drop observations/rows with missing values
X_test=X_test.dropna(axis=0)
data.drop(cols_unique_value, axis=1, inplace=True)
X_test.drop(cols_unique_value, axis=1, inplace=True)

#Prepare index for the submission
index_val = list(X_test.TransactionId.values.tolist())

#Set the df index to the Transction Id
data = transactioId_to_index(data)
X_test = transactioId_to_index(X_test)

#Adding data
data = adding_date_col(data, 'TransactionStartTime')
X_test = adding_date_col(X_test, 'TransactionStartTime')


# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
low_cardinality_cols = [cname for cname in data.columns if data[cname].nunique() < 15 and 
                        data[cname].dtype == "object"]
low_cardinality_cols.append('PricingStrategy')

#transform columns BatchId_54 to number
    #transform columns BatchId_54 to number
id_cols = data.filter(like="Id").columns.tolist()
data[id_cols] = data[id_cols].astype(str).apply(lambda x: x.str.replace(x.name + "_", "")).astype(int)
X_test[id_cols] = X_test[id_cols].astype(str).apply(lambda x: x.str.replace(x.name + "_", "")).astype(int)

data['PricingStrategy'] = data['PricingStrategy'].astype('int')
X_test['PricingStrategy'] = X_test['PricingStrategy'].astype('int')
data['Value'] = data['Value'].astype('float')
X_test['Value'] = X_test['Value'].astype('float')

X_test['PricingStrategy'] = X_test['PricingStrategy'].astype('str')
data['PricingStrategy'] = data['PricingStrategy'].astype('str')

#Data splitting
y = data.FraudResult #The target label
X = data.copy()
X.drop(['FraudResult'], axis=1, inplace=True) #Only the features data
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)

cat_cols = [col for col in X.columns if X[col].dtype == "object"]#liste of obejct columns
cat_cols.append("PricingStrategy")#pcq mm si c'est un chiffre il faut le considérer comme une catégorie


OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(train_X[low_cardinality_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(val_X[low_cardinality_cols]))
OH_cols_test = pd.DataFrame(OH_encoder.transform(X_test[low_cardinality_cols]))
OH_cols_train.columns = OH_encoder.get_feature_names_out()
OH_cols_valid.columns = OH_encoder.get_feature_names_out()
OH_cols_test.columns = OH_encoder.get_feature_names_out()

# One-hot encoding removed index; put it back
OH_cols_train.index = train_X.index
OH_cols_valid.index = val_X.index
OH_cols_test.index = X_test.index

OH_cols_train[list(OH_cols_train.columns)] = OH_cols_train[list(OH_cols_train.columns)].astype(int)
OH_cols_valid[list(OH_cols_valid.columns)] = OH_cols_valid[list(OH_cols_valid.columns)].astype(int)
OH_cols_test[list(OH_cols_test.columns)] = OH_cols_test[list(OH_cols_test.columns)].astype(int)

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = train_X.drop(low_cardinality_cols, axis=1)
num_X_valid = val_X.drop(low_cardinality_cols, axis=1)
num_X_test = X_test.drop(low_cardinality_cols, axis=1)

OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)

# Problème de string
OH_X_train.drop(['TransactionStartTime'], inplace=True, axis=1)
OH_X_valid.drop(['TransactionStartTime'], inplace=True, axis=1)
OH_X_test.drop(['TransactionStartTime'], inplace=True, axis=1)
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)
OH_X_test.columns = OH_X_test.columns.astype(str)

train_X = OH_X_train
val_X = OH_X_valid
test_X = OH_X_test
X_entier = pd.concat([train_X, val_X])

mi_scores = make_mi_scores(X_entier , y)

verylowMI = list(mi_scores[mi_scores<0.0001].index)
lowMI = list(mi_scores[(mi_scores>=0.0001)&(mi_scores<0.001)].index)
mediumMI = list(mi_scores[(mi_scores>=0.001)&(mi_scores<0.01)].index)
highMI = list(mi_scores[mi_scores>=0.01].index)

### 2.1 Delecte the very low MI columns

In [8]:
train_X.drop(verylowMI, inplace=True, axis=1, errors='ignore')
val_X.drop(verylowMI, inplace=True, axis=1, errors='ignore')
test_X.drop(verylowMI, inplace=True, axis=1, errors='ignore')
X_entier.drop(verylowMI, inplace=True, axis=1, errors='ignore')

withoutverylowMI = comparemodels(train_X, train_y, val_X, test_X, "Delete very low MI")
withoutverylowMI["PublicScore"] = [0.666666666, 0.666666666, 0.711864406, 0.642857142, 0.642857142]
withoutverylowMI["PrivateScore"] = [0.661016949, 0.649122807, 0.715447154, 0.620689655, 0.620689655]
withoutverylowMI

done
done
done
done
done


Unnamed: 0,Model,Description,Date,Precision,Recall,F1-score,LogLoss,Mcc,PublicScore,PrivateScore,OldPublicScore,OldPrivateScore,MeanOurMetrics
0,DecisionTreeClassifier,Delete very low MImax_leaf_nodes : 6 + with date columns,2023-05-07 12:15:59.362979,0.666667,0.1,0.173913,0.05727,0.257799,0.666667,0.661017,"[0.666666666, 0.666666666]","[0.661016949, 0.661016949]",0.299595
1,Random Forest Classifier,Delete very low MIn_estimators : 36 + Entropy,2023-05-07 12:16:01.466270,0.695652,0.4,0.507937,0.04672,0.52693,0.666667,0.649123,"[0.69090909, 0.666666666]","[0.649122807, 0.649122807]",0.53263
2,XGBClassifier,Delete very low MIscale_pos_weight = 22,2023-05-07 12:16:05.133214,0.466667,0.525,0.494118,0.064805,0.494079,0.711864,0.715447,"[0.677419354, 0.711864406]","[0.676923076, 0.715447154]",0.494966
3,RandomForestClassifierUpperSampling,"Delete very low MIupsampled, n_estimators=36, criterion = entropy",2023-05-07 12:16:10.263521,0.484848,0.4,0.438356,0.061791,0.439537,0.642857,0.62069,"[0.678571428, 0.642857142]","[0.637168141, 0.620689655]",0.440685
4,RandomForestClassifierUnderSampling,"Delete very low MIundersampled, n_estimators=36, criterion = entropy",2023-05-07 12:16:16.094689,0.484848,0.4,0.438356,0.061791,0.439537,0.642857,0.62069,"[0.549019607, 0.642857142]","[0.48076923, 0.620689655]",0.440685


### 2.2 Delete low MI

In [10]:
train_X.drop(verylowMI, inplace=True, axis=1, errors='ignore')
val_X.drop(verylowMI, inplace=True, axis=1, errors='ignore')
test_X.drop(verylowMI, inplace=True, axis=1, errors='ignore')
X_entier.drop(verylowMI, inplace=True, axis=1, errors='ignore')
train_X.drop(lowMI, inplace=True, axis=1, errors='ignore')
val_X.drop(lowMI, inplace=True, axis=1, errors='ignore')
test_X.drop(lowMI, inplace=True, axis=1, errors='ignore')
X_entier.drop(lowMI, inplace=True, axis=1, errors='ignore')

withoutlowMI = comparemodels(train_X, train_y, val_X, test_X, "Delete low MI")
withoutlowMI["PublicScore"] = [np.nan, np.nan, np.nan, np.nan, np.nan]
withoutlowMI["PrivateScore"] = [np.nan, np.nan, np.nan, np.nan, np.nan]
withoutlowMI

done
done
done
done
done


Unnamed: 0,Model,Description,Date,Precision,Recall,F1-score,LogLoss,Mcc,PublicScore,PrivateScore,OldPublicScore,OldPrivateScore,MeanOurMetrics
0,DecisionTreeClassifier,Delete low MImax_leaf_nodes : 6 + with date columns,2023-05-07 12:21:00.573581,0.5,0.05,0.090909,0.060284,0.15773,,,"[0.666666666, 0.666666666]","[0.661016949, 0.661016949]",0.19966
1,Random Forest Classifier,Delete low MIn_estimators : 36 + Entropy,2023-05-07 12:21:01.947710,0.615385,0.4,0.484848,0.051241,0.495474,,,"[0.69090909, 0.666666666]","[0.649122807, 0.649122807]",0.498927
2,XGBClassifier,Delete low MIscale_pos_weight = 22,2023-05-07 12:21:04.540179,0.391304,0.675,0.495413,0.08289,0.512908,,,"[0.677419354, 0.711864406]","[0.676923076, 0.715447154]",0.518656
3,RandomForestClassifierUpperSampling,"Delete low MIupsampled, n_estimators=36, criterion = entropy",2023-05-07 12:21:08.307593,0.358209,0.6,0.448598,0.088919,0.46247,,,"[0.678571428, 0.642857142]","[0.637168141, 0.620689655]",0.467319
4,RandomForestClassifierUnderSampling,"Delete low MIundersampled, n_estimators=36, criterion = entropy",2023-05-07 12:21:12.017581,0.358209,0.6,0.448598,0.088919,0.46247,,,"[0.549019607, 0.642857142]","[0.48076923, 0.620689655]",0.467319


### 2.3 Delete medium MI

In [12]:
train_X.drop(verylowMI, inplace=True, axis=1, errors='ignore')
val_X.drop(verylowMI, inplace=True, axis=1, errors='ignore')
test_X.drop(verylowMI, inplace=True, axis=1, errors='ignore')
X_entier.drop(verylowMI, inplace=True, axis=1, errors='ignore')
train_X.drop(lowMI, inplace=True, axis=1, errors='ignore')
val_X.drop(lowMI, inplace=True, axis=1, errors='ignore')
test_X.drop(lowMI, inplace=True, axis=1, errors='ignore')
X_entier.drop(lowMI, inplace=True, axis=1, errors='ignore')

train_X.drop(mediumMI, inplace=True, axis=1, errors='ignore')
val_X.drop(mediumMI, inplace=True, axis=1, errors='ignore')
test_X.drop(mediumMI, inplace=True, axis=1, errors='ignore')
X_entier.drop(mediumMI, inplace=True, axis=1, errors='ignore')

withoutmediumMI = comparemodels(train_X, train_y, val_X, test_X, "Delete medium MI")
withoutmediumMI["PublicScore"] = [np.nan, np.nan, np.nan, np.nan, np.nan]
withoutmediumMI["PrivateScore"] = [np.nan, np.nan, np.nan, np.nan, np.nan]
withoutmediumMI

  metric = tp / (tp+fp)
  precision = tp / (tp+fp)


done


  metric = tp / (tp+fp)
  precision = tp / (tp+fp)


done
done
done
done


Unnamed: 0,Model,Description,Date,Precision,Recall,F1-score,LogLoss,Mcc,PublicScore,PrivateScore,OldPublicScore,OldPrivateScore,MeanOurMetrics
0,DecisionTreeClassifier,Delete medium MImax_leaf_nodes : 6 + with date columns,2023-05-07 12:22:06.938129,,0.0,,0.060284,0.0,,,"[0.666666666, 0.666666666]","[0.661016949, 0.661016949]",0.0
1,Random Forest Classifier,Delete medium MIn_estimators : 36 + Entropy,2023-05-07 12:22:07.463399,,0.0,,0.060284,0.0,,,"[0.69090909, 0.666666666]","[0.649122807, 0.649122807]",0.0
2,XGBClassifier,Delete medium MIscale_pos_weight = 22,2023-05-07 12:22:08.621075,0.152174,0.175,0.162791,0.108511,0.161685,,,"[0.677419354, 0.711864406]","[0.676923076, 0.715447154]",0.162913
3,RandomForestClassifierUpperSampling,"Delete medium MIupsampled, n_estimators=36, criterion = entropy",2023-05-07 12:22:09.770741,0.010398,0.85,0.020544,4.885998,0.084977,,,"[0.678571428, 0.642857142]","[0.637168141, 0.620689655]",0.24148
4,RandomForestClassifierUnderSampling,"Delete medium MIundersampled, n_estimators=36, criterion = entropy",2023-05-07 12:22:10.889597,0.010398,0.85,0.020544,4.885998,0.084977,,,"[0.549019607, 0.642857142]","[0.48076923, 0.620689655]",0.24148


# Compare result

In [13]:
dfResult = pd.concat([referenceresult, withoutverylowMI, withoutlowMI, withoutmediumMI])
dfResult

Unnamed: 0,Model,Description,Date,Precision,Recall,F1-score,LogLoss,Mcc,PublicScore,PrivateScore,OldPublicScore,OldPrivateScore,MeanOurMetrics
0,DecisionTreeClassifier,The referencemax_leaf_nodes : 6 + with date columns,2023-05-07 12:06:12.077756,0.891892,0.825,0.857143,0.016578,0.857566,0.666667,0.661017,"[0.666666666, 0.666666666]","[0.661016949, 0.661016949]",0.8579
1,Random Forest Classifier,The referencen_estimators : 36 + Entropy,2023-05-07 12:06:13.810833,0.813953,0.875,0.843373,0.019592,0.843655,0.690909,0.649123,"[0.69090909, 0.666666666]","[0.649122807, 0.649122807]",0.843995
2,XGBClassifier,The referencescale_pos_weight = 22,2023-05-07 12:06:16.406438,0.822222,0.925,0.870588,0.016578,0.871874,0.677419,0.676923,"[0.677419354, 0.711864406]","[0.676923076, 0.715447154]",0.872421
3,RandomForestClassifierUpperSampling,"The referenceupsampled, n_estimators=36, criterion = entropy",2023-05-07 12:06:20.249096,0.777778,0.875,0.823529,0.022606,0.824649,0.678571,0.637168,"[0.678571428, 0.642857142]","[0.637168141, 0.620689655]",0.825239
4,RandomForestClassifierUnderSampling,"The referenceundersampled, n_estimators=36, criterion = entropy",2023-05-07 12:06:23.944705,0.777778,0.875,0.823529,0.022606,0.824649,0.54902,0.480769,"[0.549019607, 0.642857142]","[0.48076923, 0.620689655]",0.825239
0,DecisionTreeClassifier,Delete very low MImax_leaf_nodes : 6 + with date columns,2023-05-07 12:15:59.362979,0.666667,0.1,0.173913,0.05727,0.257799,0.666667,0.661017,"[0.666666666, 0.666666666]","[0.661016949, 0.661016949]",0.299595
1,Random Forest Classifier,Delete very low MIn_estimators : 36 + Entropy,2023-05-07 12:16:01.466270,0.695652,0.4,0.507937,0.04672,0.52693,0.666667,0.649123,"[0.69090909, 0.666666666]","[0.649122807, 0.649122807]",0.53263
2,XGBClassifier,Delete very low MIscale_pos_weight = 22,2023-05-07 12:16:05.133214,0.466667,0.525,0.494118,0.064805,0.494079,0.711864,0.715447,"[0.677419354, 0.711864406]","[0.676923076, 0.715447154]",0.494966
3,RandomForestClassifierUpperSampling,"Delete very low MIupsampled, n_estimators=36, criterion = entropy",2023-05-07 12:16:10.263521,0.484848,0.4,0.438356,0.061791,0.439537,0.642857,0.62069,"[0.678571428, 0.642857142]","[0.637168141, 0.620689655]",0.440685
4,RandomForestClassifierUnderSampling,"Delete very low MIundersampled, n_estimators=36, criterion = entropy",2023-05-07 12:16:16.094689,0.484848,0.4,0.438356,0.061791,0.439537,0.642857,0.62069,"[0.549019607, 0.642857142]","[0.48076923, 0.620689655]",0.440685


# Sandbox

In [18]:
dfresult  = pd.read_csv("output/resultatsTestEverything.csv")
dfresult

Unnamed: 0,Model,Description,Date,Precision,Recall,F1-score,LogLoss,Mcc,PublicScore,PrivateScore,OldPublicScore,OldPrivateScore,MeanOurMetrics
0,Decision Tree Classifier,La basemax_leaf_nodes : 6 + with date columns,2023-05-06 20:37:01.701978,0.891892,0.825,0.857143,0.016578,0.857566,,,[0.666666666],[0.661016949],0.8579
1,Random Forest Classifier,La basen_estimators : 36 + Entropy,2023-05-06 20:37:02.891559,0.8,0.9,0.847059,0.019592,0.848262,,,[0.69090909],[0.649122807],0.84883
2,XGBClassifier,La basescale_pos_weight = 22,2023-05-06 20:37:04.316049,0.840909,0.925,0.880952,0.015071,0.881747,,,[0.677419354],[0.676923076],0.882152
3,RandomForestClassifierUpperSampling,"La baseupsampled, n_estimators=36, criterion = entropy",2023-05-06 20:37:06.774426,0.765957,0.9,0.827586,0.022606,0.829975,,,[0.678571428],[0.637168141],0.83088
4,RandomForestClassifierUnderSampling,"La baseundersampled, n_estimators=36, criterion = entropy",2023-05-06 20:37:09.025865,0.765957,0.9,0.827586,0.022606,0.829975,,,[0.549019607],[0.48076923],0.83088
5,Decision Tree Classifier,Sans veryLow MImax_leaf_nodes : 6 + with date columns,2023-05-06 20:51:15.706650,0.942857,0.825,0.88,0.013564,0.88178,,,[0.666666666],[0.661016949],0.882409
6,Random Forest Classifier,Sans veryLow MIn_estimators : 36 + Entropy,2023-05-06 20:51:16.916077,0.755556,0.85,0.8,0.025621,0.801037,,,[0.69090909],[0.649122807],0.801648
7,XGBClassifier,Sans veryLow MIscale_pos_weight = 22,2023-05-06 20:51:18.180099,0.860465,0.925,0.891566,0.013564,0.891963,,,[0.677419354],[0.676923076],0.892249
8,RandomForestClassifierUpperSampling,"Sans veryLow MIupsampled, n_estimators=36, criterion = entropy",2023-05-06 20:51:20.658177,0.782609,0.9,0.837209,0.021099,0.838969,,,[0.678571428],[0.637168141],0.839697
9,RandomForestClassifierUnderSampling,"Sans veryLow MIundersampled, n_estimators=36, criterion = entropy",2023-05-06 20:51:22.937609,0.782609,0.9,0.837209,0.021099,0.838969,,,[0.549019607],[0.48076923],0.839697


In [36]:
print("Les meilleurs résultat")
dfresult.groupby(['Model'])['Precision','Recall','F1-score','LogLoss','MeanOurMetrics'].max()

Les meilleurs résultat


  dfresult.groupby(['Model'])['Precision','Recall','F1-score','LogLoss','MeanOurMetrics'].max()


Unnamed: 0_level_0,Precision,Recall,F1-score,LogLoss,MeanOurMetrics
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Decision Tree Classifier,0.942857,0.825,0.88,0.060284,0.882409
Random Forest Classifier,0.8,0.9,0.847059,0.060284,0.84883
RandomForestClassifierUnderSampling,0.782609,0.9,0.837209,4.923675,0.839697
RandomForestClassifierUpperSampling,0.782609,0.9,0.837209,4.923675,0.839697
XGBClassifier,0.860465,0.925,0.891566,0.108511,0.892249


In [33]:
print("Les pire résultat")
dfresult.groupby(['Model'])['Precision','Recall','F1-score','LogLoss','MeanOurMetrics'].min()

Les pire résultat


  dfresult.groupby(['Model'])['Precision','Recall','F1-score','LogLoss','MeanOurMetrics'].min()


Unnamed: 0_level_0,Precision,Recall,F1-score,LogLoss,MeanOurMetrics
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Decision Tree Classifier,0.5,0.0,0.090909,0.013564,0.0
Random Forest Classifier,0.566667,0.0,0.485714,0.019592,0.0
RandomForestClassifierUnderSampling,0.010616,0.6,0.020977,0.021099,0.248528
RandomForestClassifierUpperSampling,0.010616,0.6,0.020977,0.021099,0.248528
XGBClassifier,0.152174,0.175,0.162791,0.013564,0.162913


## Test cluster Kmeans en plus

In [29]:
from sklearn.cluster import KMeans 

# Supprimer les MI trop faible et K-means fait sur training set entier (et ensuite sur seuelement le train_X)
def_feature = pd.read_csv("input/Xente_Variable_Definitions.csv")
raw_data = pd.read_csv("input/training.csv")
X_test = pd.read_csv("input/test.csv")
sample_submission = pd.read_csv("input/sample_submission.csv")

#attribut initialization
cols_unique_value = [] #Will be droped
for col in raw_data.columns : 
    if len(raw_data[col].unique()) == 1 :
        cols_unique_value.append(col)

medium_cardianlity_cols = ["ProductId"]

#Data transformation
raw_data['TransactionStartTime'] = pd.to_datetime(raw_data['TransactionStartTime'])
X_test['TransactionStartTime'] = pd.to_datetime(X_test['TransactionStartTime'])

#Data cleaning
data = raw_data.copy()
data = data.dropna(axis=0) #Drop observations/rows with missing values
X_test=X_test.dropna(axis=0)
data.drop(cols_unique_value, axis=1, inplace=True)
X_test.drop(cols_unique_value, axis=1, inplace=True)

#Prepare index for the submission
index_val = list(X_test.TransactionId.values.tolist())

#Set the df index to the Transction Id
data = transactioId_to_index(data)
X_test = transactioId_to_index(X_test)

#Adding data
data = adding_date_col(data, 'TransactionStartTime')
X_test = adding_date_col(X_test, 'TransactionStartTime')


# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
low_cardinality_cols = [cname for cname in data.columns if data[cname].nunique() < 15 and 
                        data[cname].dtype == "object"]
low_cardinality_cols.append('PricingStrategy')

#transform columns BatchId_54 to number

l_col_str = ["BatchId", "AccountId", "SubscriptionId", "CustomerId", "ProviderId", "ProductId", "ChannelId"]
for col in l_col_str:
    data[['dc', 'new_col']] = data[col].str.split("_", expand = True)
    data.drop(['dc',col], inplace=True, axis=1)
    data.rename(columns={"new_col": col}, inplace=True)
    data[col] = data[col].astype('int')
    X_test[['dc', 'new_col']] = X_test[col].str.split("_", expand = True)
    X_test.drop(['dc',col], inplace=True, axis=1)
    X_test.rename(columns={"new_col": col}, inplace=True)
    X_test[col] = X_test[col].astype('int')
data['PricingStrategy'] = data['PricingStrategy'].astype('int')
X_test['PricingStrategy'] = X_test['PricingStrategy'].astype('int')
data['Value'] = data['Value'].astype('float')
X_test['Value'] = X_test['Value'].astype('float')

X_test['PricingStrategy'] = X_test['PricingStrategy'].astype('str')
data['PricingStrategy'] = data['PricingStrategy'].astype('str')

#Data splitting
y = data.FraudResult #The target label
X = data.copy()
X.drop(['FraudResult'], axis=1, inplace=True) #Only the features data

#####K-Means
X_0 = X[y == 0] # instances avec FraudResult=0
X_1 = X[y == 1] # instances avec FraudResult=1

# Créer des "centres" pour KMeans
n_clusters = 193
kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(X_0)
centers = kmeans.cluster_centers_

# Trouver les indices des instances à garder dans la classe FraudResult=0
labels = kmeans.predict(X_0)
distances = np.sum((X_0 - centers[labels]) ** 2, axis=1)
keep_indices = np.argsort(distances)[:n_clusters]

# Combiner les instances sous-échantillonnées
X_undersampled = np.concatenate((X_0[keep_indices], X_1), axis=0)
y_undersampled = np.concatenate((np.zeros(n_clusters), np.ones(len(X_1))), axis=0)


train_X, val_X, train_y, val_y = train_test_split(X_undersampled, y_undersampled, random_state = 0)

cat_cols = [col for col in X.columns if X[col].dtype == "object"]#liste of obejct columns
cat_cols.append("PricingStrategy")#pcq mm si c'est un chiffre il faut le considérer comme une catégorie

#print(describe)


#low_cardinality_cols=["ProviderId", "ProductCategory", "ChannelId", "PricingStrategy"] 
#train_X[low_cardinality_cols] = train_X[low_cardinality_cols].astype(str) 
#val_X[low_cardinality_cols] = val_X[low_cardinality_cols].astype(str) 

OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(train_X[low_cardinality_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(val_X[low_cardinality_cols]))
OH_cols_test = pd.DataFrame(OH_encoder.transform(X_test[low_cardinality_cols]))
OH_cols_train.columns = OH_encoder.get_feature_names_out()
OH_cols_valid.columns = OH_encoder.get_feature_names_out()
OH_cols_test.columns = OH_encoder.get_feature_names_out()

# One-hot encoding removed index; put it back
OH_cols_train.index = train_X.index
OH_cols_valid.index = val_X.index
OH_cols_test.index = X_test.index


OH_cols_train[list(OH_cols_train.columns)] = OH_cols_train[list(OH_cols_train.columns)].astype(int)
OH_cols_valid[list(OH_cols_valid.columns)] = OH_cols_valid[list(OH_cols_valid.columns)].astype(int)
OH_cols_test[list(OH_cols_test.columns)] = OH_cols_test[list(OH_cols_test.columns)].astype(int)

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = train_X.drop(low_cardinality_cols, axis=1)
num_X_valid = val_X.drop(low_cardinality_cols, axis=1)
num_X_test = X_test.drop(low_cardinality_cols, axis=1)

OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)

# Problème de string
OH_X_train.drop(['TransactionStartTime'], inplace=True, axis=1)
OH_X_valid.drop(['TransactionStartTime'], inplace=True, axis=1)
OH_X_test.drop(['TransactionStartTime'], inplace=True, axis=1)
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)
OH_X_test.columns = OH_X_test.columns.astype(str)

train_X = OH_X_train
val_X = OH_X_valid
test_X = OH_X_test
X_entier = pd.concat([train_X, val_X])

mi_scores = make_mi_scores(X_entier , y)

verylowMI = list(mi_scores[mi_scores<0.0001].index)

train_X.drop(verylowMI, inplace=True, axis=1)
val_X.drop(verylowMI, inplace=True, axis=1)
test_X.drop(verylowMI, inplace=True, axis=1)
X_entier.drop(verylowMI, inplace=True, axis=1)


res = []

model1 = DecisionTreeClassifier(max_leaf_nodes= 6, random_state=1)
model1.fit(train_X, train_y)
preds_val1 = model1.predict(val_X)
model_test1 = model1.predict(test_X)
description = descriptiontous+"max_leaf_nodes : 6 + with date columns"
metrics1 = listmetrics(val_y, preds_val1, "Decision Tree Classifier", description)
m1Public = [0.666666666]
m1Private = [0.661016949]
res.append(metrics1)
df1 = getscoreforcsv(index_val, model_test1, name_file = "TreeClass5leaf.csv")


model2 = RandomForestClassifier(n_estimators=36, criterion = 'entropy', random_state=1)
model2.fit(train_X, train_y)
preds_val2 = model2.predict(val_X)
model_test2 = model2.predict(test_X)
description = descriptiontous+"n_estimators : 36 + Entropy"
metrics2 = listmetrics(val_y, preds_val2, "Random Forest Classifier", description)
m2Public = [0.690909090]
m2Private = [0.649122807]
res.append(metrics2)
df2 = getscoreforcsv(index_val, model_test2, name_file = "RandomForestN36Entropy.csv")

model2 = XGBClassifier(scale_pos_weight=22, random_state=1)
model2.fit(train_X, train_y)
preds_val2 = model2.predict(val_X)
model_test2 = model2.predict(test_X)
description = descriptiontous+"scale_pos_weight = 22"
metrics2 = listmetrics(val_y, preds_val2, "XGBClassifier", description)
m3Public = [0.677419354]
m3Private = [0.676923076]
res.append(metrics2)
df2 = getscoreforcsv(index_val, model_test2, name_file = "XGBClassifierW22.csv")

#oversamplesimple
X_con = pd.concat([train_X, train_y], axis=1) 
not_fraud = X_con[X_con.FraudResult==0]
fraud = X_con[X_con.FraudResult==1]
fraud_upsampled = resample(fraud, replace=True, n_samples=len(not_fraud), # match number in majority class
                          random_state=1) # reproducible results
upsampled = pd.concat([not_fraud, fraud_upsampled])
train_y_over_sampled = upsampled.FraudResult
train_X_over_sampled = upsampled.drop('FraudResult', axis=1)
upsampledmodel = RandomForestClassifier(n_estimators=36, criterion = 'entropy', random_state=1)
upsampledmodel.fit(train_X_over_sampled, train_y_over_sampled)
upsampled_pred = upsampledmodel.predict(val_X)
model_test = upsampledmodel.predict(OH_X_test)
description = descriptiontous+"upsampled, n_estimators=36, criterion = entropy"
metrics = listmetrics(val_y, upsampled_pred, "RandomForestClassifierUpperSampling", description)
m4Public = [0.678571428]
m4Private = [0.637168141]
res.append(metrics)
df = getscoreforcsv(index_val, model_test, name_file = "RandomForestClassifierUpSample.csv")

#Undersampling
not_fraud_downsampled  = resample(not_fraud, replace=False, n_samples=len(fraud), # match number in minority  class
                          random_state=1) # reproducible results
downsampled  = pd.concat([not_fraud_downsampled , fraud])
train_y_undersampled = downsampled.FraudResult
train_X_undersampled = downsampled.drop('FraudResult', axis=1)
undersampled = RandomForestClassifier(n_estimators=36, criterion = 'entropy', random_state=1)
undersampled.fit(train_X_over_sampled, train_y_over_sampled)
undersampled_pred = undersampled.predict(val_X)
model_test = undersampled.predict(OH_X_test)
description = descriptiontous+"undersampled, n_estimators=36, criterion = entropy"
metrics = listmetrics(val_y, undersampled_pred, "RandomForestClassifierUnderSampling", description)
m5Public = [0.549019607]
m5Private = [0.480769230]
res.append(metrics)
df = getscoreforcsv(index_val, model_test, name_file = "RandomForestClassifierUndersampled.csv")


dfres = listmetricsintodf(res)
dfres["OldPublicScore"] = [m1Public, m2Public, m3Public, m4Public, m5Public]
dfres["OldPrivateScore"] = [m1Private, m2Private, m3Private, m4Private, m5Private]
dfres["MeanOurMetrics"] = dfres[['Precision', 'Recall','F1-score','Mcc']].mean(axis=1)
dfres.to_csv('output/resultatsTestEverything.csv', mode='a',index=False,header = False) 
dfres

ValueError: could not convert string to float: 'airtime'