## KAGGLE LIKE CHALLENGE

In [71]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier, RandomForestClassifier
from sklearn.metrics import f1_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from IPython.display import display

import warnings
warnings.filterwarnings('ignore')

# Read file with labels

In [72]:
data = pd.read_csv('conversion_data_train.csv')
print('Set with labels (our train+test) :', data.shape)

Set with labels (our train+test) : (284580, 6)


In [73]:
data.head()

Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
0,China,22,1,Direct,2,0
1,UK,21,1,Ads,3,0
2,Germany,20,0,Seo,14,1
3,US,23,1,Seo,3,0
4,US,28,1,Direct,3,0


# BUILDING THE MODEL

In [74]:
# Choosing relevant variables
features_list = ['country', 'age', 'new_user', 'source', 'total_pages_visited']
numeric_indices = [1,2,4]
categorical_indices = [0,3]
target_variable = 'converted'

In [75]:
X = data.loc[:, features_list]
Y = data.loc[:, target_variable]

print('Variables explicatives : ', X.columns)
print()

Variables explicatives :  Index(['country', 'age', 'new_user', 'source', 'total_pages_visited'], dtype='object')



In [76]:
# Dividing dataset Train set & Test set 
print("Dividing into train and test sets...")
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=42, stratify = Y)
print("...Done.")
print()

Dividing into train and test sets...
...Done.



In [77]:
# Converting pandas DataFrames to numpy arrays before using scikit-learn
print("Convert pandas DataFrames to numpy arrays...")
X_train = X_train.values
X_test = X_test.values
Y_train = Y_train.values
Y_test = Y_test.values
print("...Done")

print(X_train[0:5,:])
print(X_test[0:2,:])
print()
print(Y_train[0:5])
print(Y_test[0:2])

Convert pandas DataFrames to numpy arrays...
...Done
[['China' 30 1 'Seo' 8]
 ['Germany' 43 0 'Seo' 5]
 ['US' 41 1 'Direct' 2]
 ['UK' 29 1 'Ads' 19]
 ['US' 49 1 'Seo' 4]]
[['US' 20 1 'Ads' 5]
 ['China' 28 1 'Ads' 2]]

[0 0 0 1 0]
[0 0]


## Features processing

In [78]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, MinMaxScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())
                                     ]
                              )
    
categorical_transformer = Pipeline(steps=[('encoder', OneHotEncoder(drop='first'))
                                         ]
                                  )

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_indices),
        ('cat', categorical_transformer, categorical_indices)
    ])

In [79]:
# Preprocessing
print("Encoding categorical features and standardizing numerical features...")

X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

print("...Done")
print(X_train[0:5,:])

Encoding categorical features and standardizing numerical features...
...Done
[[-0.06766859  0.67701862  0.93587807  0.          0.          0.
   0.          1.        ]
 [ 1.50460508 -1.47706425  0.03801145  1.          0.          0.
   0.          1.        ]
 [ 1.26271682  0.67701862 -0.85985517  0.          0.          1.
   1.          0.        ]
 [-0.18861272  0.67701862  4.22805567  0.          1.          0.
   0.          0.        ]
 [ 2.23026985  0.67701862 -0.26127743  0.          0.          1.
   0.          1.        ]]


## MODEL: DECISION TREE

In [81]:
# Determining best hyperparameters

max_depth = [5, 6, 7, 8, 9]
min_samples_split = [2, 5, 10, 20]

dt_grid = GridSearchCV(estimator=DecisionTreeClassifier(), cv=5,
                       param_grid={"max_depth":max_depth, "min_samples_split":min_samples_split},
                      verbose=2)

dt_grid.fit(X_train, Y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] max_depth=5, min_samples_split=2 ................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................. max_depth=5, min_samples_split=2, total=   0.3s
[CV] max_depth=5, min_samples_split=2 ................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV] ................. max_depth=5, min_samples_split=2, total=   0.2s
[CV] max_depth=5, min_samples_split=2 ................................
[CV] ................. max_depth=5, min_samples_split=2, total=   0.2s
[CV] max_depth=5, min_samples_split=2 ................................
[CV] ................. max_depth=5, min_samples_split=2, total=   0.2s
[CV] max_depth=5, min_samples_split=2 ................................
[CV] ................. max_depth=5, min_samples_split=2, total=   0.2s
[CV] max_depth=5, min_samples_split=5 ................................
[CV] ................. max_depth=5, min_samples_split=5, total=   0.2s
[CV] max_depth=5, min_samples_split=5 ................................
[CV] ................. max_depth=5, min_samples_split=5, total=   0.2s
[CV] max_depth=5, min_samples_split=5 ................................
[CV] ................. max_depth=5, min_samples_split=5, total=   0.2s
[CV] max_depth=5, min_samples_split=5 ................................
[CV] .

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   28.4s finished


GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [5, 6, 7, 8, 9],
                         'min_samples_split': [2, 5, 10, 20]},
             verbose=2)

In [82]:
print("Hyperparameters that give the best score : ", dt_grid.best_params_)
print("Best score : ", dt_grid.best_score_)

print("...Done.")

Hyperparameters that give the best score :  {'max_depth': 9, 'min_samples_split': 20}
Best score :  0.9852882615038231
...Done.


In [83]:
dt = DecisionTreeClassifier(max_depth=dt_grid.best_params_['max_depth'],
                           min_samples_split=dt_grid.best_params_['min_samples_split'])
dt.fit(X_train, Y_train)

DecisionTreeClassifier(max_depth=9, min_samples_split=20)

In [84]:
# Predictions on training set
print("Predictions on training set...")
Y_train_pred = dt.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()

# Predictions on test set
print("Predictions on test set...")
Y_test_pred = dt.predict(X_test)
print("...Done.")
print(Y_test_pred)
print()

Predictions on training set...
...Done.
[0 0 0 ... 0 0 0]

Predictions on test set...
...Done.
[0 0 0 ... 0 0 0]



### PERFORMANCE ASSESSMENT

In [85]:
# The f1-score will be used to assess the performances on the leaderboard
print("f1-score on train set : ", f1_score(Y_train, dt.predict(X_train)))
print("f1-score on test set : ", f1_score(Y_test, dt.predict(X_test)))

f1-score on train set :  0.7671454582237507
f1-score on test set :  0.7569866342648846


In [86]:
# Computing other performance metrics to better understand what the model is doing
from sklearn.metrics import classification_report
print(classification_report(Y_test, dt.predict(X_test)))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99     27540
           1       0.86      0.68      0.76       918

    accuracy                           0.99     28458
   macro avg       0.92      0.84      0.87     28458
weighted avg       0.99      0.99      0.99     28458



## MODEL: RANDOM FOREST

In [87]:
# Determining best hyperparameters

max_depth = [4, 5, 6, 7, 8, 9]
min_samples_split = [2, 5, 10, 20]
n_estimators = (10, 20, 30)

rf_grid = GridSearchCV(estimator=RandomForestClassifier(), cv=5,
                       param_grid={"max_depth":max_depth, 
                                   "min_samples_split":min_samples_split,
                                   "n_estimators":n_estimators},
                      verbose=2)

rf_grid.fit(X_train, Y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV] max_depth=4, min_samples_split=2, n_estimators=10 ...............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  max_depth=4, min_samples_split=2, n_estimators=10, total=   0.6s
[CV] max_depth=4, min_samples_split=2, n_estimators=10 ...............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s


[CV]  max_depth=4, min_samples_split=2, n_estimators=10, total=   0.5s
[CV] max_depth=4, min_samples_split=2, n_estimators=10 ...............
[CV]  max_depth=4, min_samples_split=2, n_estimators=10, total=   0.5s
[CV] max_depth=4, min_samples_split=2, n_estimators=10 ...............
[CV]  max_depth=4, min_samples_split=2, n_estimators=10, total=   0.5s
[CV] max_depth=4, min_samples_split=2, n_estimators=10 ...............
[CV]  max_depth=4, min_samples_split=2, n_estimators=10, total=   0.6s
[CV] max_depth=4, min_samples_split=2, n_estimators=20 ...............
[CV]  max_depth=4, min_samples_split=2, n_estimators=20, total=   1.0s
[CV] max_depth=4, min_samples_split=2, n_estimators=20 ...............
[CV]  max_depth=4, min_samples_split=2, n_estimators=20, total=   1.1s
[CV] max_depth=4, min_samples_split=2, n_estimators=20 ...............
[CV]  max_depth=4, min_samples_split=2, n_estimators=20, total=   1.1s
[CV] max_depth=4, min_samples_split=2, n_estimators=20 ...............
[CV]  

[Parallel(n_jobs=1)]: Done 360 out of 360 | elapsed:  8.9min finished


GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [4, 5, 6, 7, 8, 9],
                         'min_samples_split': [2, 5, 10, 20],
                         'n_estimators': (10, 20, 30)},
             verbose=2)

In [88]:
print("Hyperparameters that give the best score : ", rf_grid.best_params_)
print("Best score : ", rf_grid.best_score_)

Hyperparameters that give the best score :  {'max_depth': 9, 'min_samples_split': 20, 'n_estimators': 20}
Best score :  0.9859598166031708


In [89]:
drd = RandomForestClassifier(n_estimators = rf_grid.best_params_['n_estimators'], max_depth=rf_grid.best_params_['max_depth'],
                           min_samples_split=rf_grid.best_params_['min_samples_split'])
drd.fit(X_train, Y_train)

RandomForestClassifier(max_depth=9, min_samples_split=20, n_estimators=20)

In [90]:
# Predictions on training set
print("Predictions on training set...")
Y_train_pred = drd.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()

# Predictions on test set
print("Predictions on test set...")
Y_test_pred = drd.predict(X_test)
print("...Done.")
print(Y_test_pred)
print()

Predictions on training set...
...Done.
[0 0 0 ... 0 0 0]

Predictions on test set...
...Done.
[0 0 0 ... 0 0 0]



### PERFORMANCE ASSESSMENT

In [91]:
# The f1-score will be used to assess the performances on the leaderboard
print("f1-score on train set : ", f1_score(Y_train, drd.predict(X_train)))
print("f1-score on test set : ", f1_score(Y_test, drd.predict(X_test)))

f1-score on train set :  0.7656483367277663
f1-score on test set :  0.7501549907005581


In [92]:
# Computing other performance metrics to better understand what the model is doing
print(classification_report(Y_test, drd.predict(X_test)))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99     27540
           1       0.87      0.66      0.75       918

    accuracy                           0.99     28458
   macro avg       0.93      0.83      0.87     28458
weighted avg       0.98      0.99      0.98     28458



## TRAINING BEST CLASSIFIER ON ALL DATA & PREDICTING ON  'X_without_labels'

In [23]:
# Concatenating the train and test set to train the best classifier on all data with labels
X = np.append(X_train,X_test,axis=0)
Y = np.append(Y_train,Y_test)

classifier = LogisticRegression(C=grid.best_params_['C'], penalty=grid.best_params_['penalty'])

classifier.fit(X,Y)

LogisticRegression(C=0.0001, penalty='none')

In [24]:
# Reading data without labels
data_without_labels = pd.read_csv('conversion_data_test.csv')
print('Prediction set (without labels) :', data_without_labels.shape)

Prediction set (without labels) : (31620, 5)


In [25]:
X_without_labels = data_without_labels.values

In [26]:
print(X_without_labels[0:5,:])

[['UK' 28 0 'Seo' 16]
 ['UK' 22 1 'Direct' 5]
 ['China' 32 1 'Seo' 1]
 ['US' 32 1 'Ads' 6]
 ['China' 25 0 'Seo' 3]]


In [27]:
# Processing (same processing as for the test set)

preprocessor.transform(X_without_labels)

array([[ 1.        , -0.31023163, -1.47788912, ...,  0.        ,
         0.        ,  1.        ],
       [ 1.        , -1.03484082,  0.67664075, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.        ,  0.17284116,  0.67664075, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 1.        ,  0.29360936,  0.67664075, ...,  0.        ,
         0.        ,  1.        ],
       [ 1.        , -0.67253622,  0.67664075, ...,  0.        ,
         0.        ,  1.        ],
       [ 1.        , -1.03484082,  0.67664075, ...,  1.        ,
         0.        ,  1.        ]])

In [28]:
# Reading data without labels
data_without_labels = pd.read_csv('conversion_data_test.csv')
print('Prediction set (without labels) :', data_without_labels.shape)

# Converting pandas DataFrames to numpy arrays before using scikit-learn
print("Convert pandas DataFrames to numpy arrays...")
X_without_labels = data_without_labels.values
print("...Done")

print(X_without_labels[0:5,:])

Prediction set (without labels) : (31620, 5)
Convert pandas DataFrames to numpy arrays...
...Done
[['UK' 28 0 'Seo' 16]
 ['UK' 22 1 'Direct' 5]
 ['China' 32 1 'Seo' 1]
 ['US' 32 1 'Ads' 6]
 ['China' 25 0 'Seo' 3]]


In [29]:
# Processing (same processing as for the test set)
print("Encoding categorical features and standardizing numerical features...")
X_without_labels = preprocessor.transform(X_without_labels)
print("...Done")
print(X_without_labels[0:5,:])

Encoding categorical features and standardizing numerical features...
...Done
[[ 1.00000000e+00 -3.10231630e-01 -1.47788912e+00  3.32960038e+00
   9.62436641e-02  4.58487952e-01 -1.03294735e+00  2.18415626e+00
  -4.92078019e+00  1.10862387e+01  0.00000000e+00  1.00000000e+00
   0.00000000e+00  0.00000000e+00  1.00000000e+00]
 [ 1.00000000e+00 -1.03484082e+00  6.76640746e-01  3.79362977e-02
   1.07089552e+00 -7.00215464e-01 -3.92580293e-02  4.57842700e-01
   2.56692448e-02  1.43916268e-03  0.00000000e+00  1.00000000e+00
   0.00000000e+00  1.00000000e+00  0.00000000e+00]
 [ 1.00000000e+00  1.72841163e-01  6.76640746e-01 -1.15903246e+00
   2.98740675e-02  1.16951373e-01 -2.00328518e-01  4.57842700e-01
  -7.84248588e-01  1.34335624e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  1.00000000e+00]
 [ 1.00000000e+00  1.72841163e-01  6.76640746e-01  3.37178487e-01
   2.98740675e-02  1.16951373e-01  5.82783217e-02  4.57842700e-01
   2.28148703e-01  1.13689332e-01  0.000000

In [30]:
# Making predictions and dumping to file
data = {
    'converted': classifier.predict(X_without_labels)
}

Y_predictions = pd.DataFrame(columns=['converted'],data=data)
Y_predictions.to_csv('conversion_data_test_predictions_Random_Forest.csv', index=False)
