Sources : 
https://scikit-learn.org/stable/supervised_learning.html

### Libraries

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

### Data Loading

In [2]:
def_feature = pd.read_csv("input/Xente_Variable_Definitions.csv")
data = pd.read_csv("input/training.csv")
X_test = pd.read_csv("input/test.csv")
sample_submission = pd.read_csv("input/sample_submission.csv")

data = data.dropna(axis=0) #Drop observations/rows with missing values
y = data.FraudResult #The target label
X = data.copy()
X.drop(['FraudResult'], axis=1, inplace=True) #Only the features data
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)
cols_cat = list(data.dtypes[data.dtypes=="object"].index) #liste des feature categorical

In [3]:
data.columns

Index(['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
       'CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId',
       'ProductCategory', 'ChannelId', 'Amount', 'Value',
       'TransactionStartTime', 'PricingStrategy', 'FraudResult'],
      dtype='object')

In [4]:
X.describe()

Unnamed: 0,CountryCode,Amount,Value,PricingStrategy
count,95662.0,95662.0,95662.0,95662.0
mean,256.0,6717.846,9900.584,2.255974
std,0.0,123306.8,123122.1,0.732924
min,256.0,-1000000.0,2.0,0.0
25%,256.0,-50.0,275.0,2.0
50%,256.0,1000.0,1000.0,2.0
75%,256.0,2800.0,5000.0,2.0
max,256.0,9880000.0,9880000.0,4.0


In [5]:
X.describe(include=object, exclude=[np.int64, np.float64])

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,ProviderId,ProductId,ProductCategory,ChannelId,TransactionStartTime
count,95662,95662,95662,95662,95662,95662,95662,95662,95662,95662,95662
unique,95662,94809,3633,3627,3742,1,6,23,9,4,94556
top,TransactionId_76871,BatchId_67019,AccountId_4841,SubscriptionId_3829,CustomerId_7343,UGX,ProviderId_4,ProductId_6,financial_services,ChannelId_3,2018-12-24T16:30:13Z
freq,1,28,30893,32630,4091,95662,38189,32635,45405,56935,17


# Decision Tree Classifier
Very simple model with no feature engineering to have a reference of performance when we improve out model. The missing values are droped for simplicity. At first, the categorical values are also droped. We want to do Binary Classification
Utilise le MAE pour évaluer les performance mais est ce qu'il donne une bonne idée puisque l'output est entre 0 et 1 et donc ne fait que des moyenne de petits nombre < 1 ? 

In [6]:
train_y1 = train_y.copy()
train_X1 = train_X.copy()
val_y1 = val_y.copy()
val_X1 = val_X.copy()
train_X1.drop(cols_cat, axis=1, inplace=True) #Delete categorical data
val_X1.drop(cols_cat, axis=1, inplace=True) #Delete categorical data
#train_X1 = train_X.select_dtypes(exclude=['object'])
#val_X1 = val_X.select_dtypes(exclude=['object'])
train_X1.head()

Unnamed: 0,CountryCode,Amount,Value,PricingStrategy
43878,256,2500.0,2500,2
55031,256,1000.0,1000,2
28483,256,2000.0,2000,2
7328,256,-1000.0,1000,2
18860,256,-5000.0,5000,2


In [17]:
first_model_decision_tree = DecisionTreeClassifier(random_state=1)
first_model_decision_tree.fit(train_X1, train_y1)
predicted_first_model = first_model_decision_tree.predict(val_X1)

df_with_res = val_X1.copy()
df_with_res = df_with_res.join(val_y1.copy())
df_with_res["Predictions"] = list(predicted_first_model)
print(f'Mean Absolute Error : {mean_absolute_error(val_y, predicted_first_model)}')
print(f'Nombre de prédictions correcte : {df_with_res.Predictions[(df_with_res["Predictions"] == df_with_res["FraudResult"] )].count()}')
print(f'Nombre de prédictions incorrecte : {df_with_res.Predictions[(df_with_res["Predictions"] != df_with_res["FraudResult"] )].count()}')
print(f'Nombre de Fraude dans le dataset : {val_y1.value_counts()[1]}')
print(f'Nombre total : {df_with_res.Predictions.count()}')
df_with_res.head(10)

Mean Absolute Error : 0.0004181301220939957
Nombre de prédictions correcte : 23906
Nombre de prédictions incorrecte : 10
Nombre de Fraude dans le dataset : 40
Nombre total : 23916


Unnamed: 0,CountryCode,Amount,Value,PricingStrategy,FraudResult,Predictions
69075,256,2000.0,2000,4,0,0
65709,256,1110.0,1110,2,0,0
5430,256,2000.0,2000,2,0,0
82375,256,-1000.0,1000,2,0,0
60896,256,1000.0,1000,2,0,0
6835,256,-496.0,496,2,0,0
7384,256,1500.0,1500,2,0,0
71953,256,7000.0,7000,2,0,0
5423,256,1000.0,1000,2,0,0
49255,256,1000.0,1000,2,0,0


Ici va expérimenter avec le nombre de noeud maximum et on voit que ça ne change rien. Est-ce que c'est à cause de la Régression ? 

In [8]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeClassifier(max_leaf_nodes=max_leaf_nodes, random_state=1)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return(mae)

for max_leaf_nodes in [5, 50, 500, 5000]:
    my_mae = get_mae(max_leaf_nodes, train_X1, val_X1, train_y1, val_y1)
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(max_leaf_nodes, my_mae))

Max leaf nodes: 5  		 Mean Absolute Error:  0
Max leaf nodes: 50  		 Mean Absolute Error:  0
Max leaf nodes: 500  		 Mean Absolute Error:  0
Max leaf nodes: 5000  		 Mean Absolute Error:  0


In [9]:
df_with_res.Predictions[(df_with_res["Predictions"] > 0 )& (df_with_res["Predictions"]<1)].count()

0

## Random Forest Regressor

In [10]:
forest_model = RandomForestClassifier(random_state=1)
forest_model.fit(train_X1, train_y1)
preds = forest_model.predict(val_X1)
print(mean_absolute_error(val_y1, preds))

0.0004181301220939957


In [20]:
model_1 = RandomForestClassifier(n_estimators=50, random_state=0)
model_2 = RandomForestClassifier(n_estimators=100, random_state=0)
model_3 = RandomForestClassifier(n_estimators=100, random_state=0)
model_4 = RandomForestClassifier(n_estimators=200, min_samples_split=20, random_state=0)
model_5 = RandomForestClassifier(n_estimators=100, max_depth=7, random_state=0)

models = [model_1, model_2, model_3, model_4, model_5]

# Function for comparing different models
def score_model(model, X_t=train_X1, X_v=val_X1, y_t=train_y1, y_v=val_y1):
    model.fit(X_t, y_t)
    preds = model.predict(X_v)
    return mean_absolute_error(y_v, preds)

for i in range(0, len(models)):
    mae = score_model(models[i])
    print("Model %d MAE: %d" % (i+1, mae))

Model 1 MAE: 0
Model 2 MAE: 0
Model 3 MAE: 0
Model 4 MAE: 0
Model 5 MAE: 0
