# PART II. MODEL BUILDING - LOGISTIC REGRESSION

### PREPROCESSING

In [27]:
# Useful libraries

import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier

In [28]:
# Loading dataset

dataset = pd.read_csv("Dataset_for_model_building.csv")

In [29]:
# Splitting the dataset in two tables : Y for the target 'class' and X for the explanatory features

target_name = "class"
Y = dataset.loc[:, target_name]
X = dataset.loc[:, [c for c in dataset.columns if c != target_name]]

display(Y.head())
print()
display(X.head())

0    0
1    0
2    1
3    0
4    0
Name: class, dtype: int64




Unnamed: 0,purchase_value,source,browser,sex,age,country,month
0,34,SEO,Chrome,M,39,Japan,4
1,16,Ads,Chrome,F,53,United States,6
2,15,SEO,Opera,M,53,United States,1
3,44,SEO,Safari,M,41,Unknown country,5
4,39,Ads,Safari,M,45,United States,9


In [30]:
# Transforming X and Y in numpy arrays
X = X.values
Y = Y.tolist()

In [31]:
# Splitting the dataset in Train and Test sets

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42, stratify = Y)

In [32]:
# Preprocessing

# Creating pipeline for numeric features
numeric_features = [0, 4, 6]
numeric_transformer = Pipeline(steps = [
    ('scaler', StandardScaler())
])

# Creating pipeline for categorical features
categorical_features = [1, 2, 3, 5]
categorical_transformer = Pipeline(steps = [
    ('encoder', OneHotEncoder(drop = 'first'))
])

In [33]:
# Transform

preprocessor = ColumnTransformer(
    transformers = [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

print("Performing on train set : ")
print(X_train)
X_train = preprocessor.fit_transform(X_train)
print("Done")
print(X_train)

print("Performing on test set : ")
print(X_test)
X_test = preprocessor.transform(X_test)
print("Done")
print(X_test)

Performing on train set : 
[[29 'SEO' 'Safari' ... 34 'Korea Republic of' 7]
 [11 'SEO' 'IE' ... 29 'Japan' 8]
 [37 'Ads' 'Chrome' ... 22 'United States' 7]
 ...
 [24 'Ads' 'Chrome' ... 22 'United States' 5]
 [33 'SEO' 'Chrome' ... 36 'South Africa' 4]
 [34 'Ads' 'Safari' ... 35 'United States' 2]]
Done
  (0, 0)	-0.4340242293794418
  (0, 1)	0.10106298791367568
  (0, 2)	0.37277377489039915
  (0, 4)	1.0
  (0, 8)	1.0
  (0, 9)	1.0
  (0, 45)	1.0
  (1, 0)	-1.4173142402490027
  (1, 1)	-0.479386214743561
  (1, 2)	0.7486594036934039
  (1, 4)	1.0
  (1, 6)	1.0
  (1, 42)	1.0
  (2, 0)	0.0029935532292519064
  (2, 1)	-1.2920150984636924
  (2, 2)	0.37277377489039915
  (2, 82)	1.0
  (3, 0)	-1.1988053489446557
  (3, 1)	1.494141074291044
  (3, 2)	1.1245450324964088
  (3, 4)	1.0
  (3, 9)	1.0
  (3, 82)	1.0
  (4, 0)	-0.21551533807509493
  (4, 1)	-0.131116693149219
  :	:
  (119941, 1)	-1.1759252579322452
  (119941, 2)	-0.37899748271561046
  (119941, 6)	1.0
  (119941, 9)	1.0
  (119941, 82)	1.0
  (119942, 0)	0

### LOGISTIC REGRESSION

In [40]:
# Train baseline model
baseline_model = LogisticRegression(class_weight = 'balanced', max_iter = 1000) # change value of max_iter to avoid solver warning

print("Training model...")
baseline_model.fit(X_train, Y_train) # Training is always done on train set!!
print("...Done.")

# Predictions on training set
print("Predictions on training set...")
Y_train_pred = baseline_model.predict(X_train)
print("...Done.")
print(Y_train_pred[0:5])
print()

# Predictions on test set
print("Predictions on test set...")
Y_test_pred = baseline_model.predict(X_test)
print("...Done.")
print(Y_test_pred[0:5])
print()

# Print scores
print("f1-score on training set : ", f1_score(Y_train, Y_train_pred))
print("f1-score on test set : ", f1_score(Y_test, Y_test_pred))

Training model...
...Done.
Predictions on training set...
...Done.
[0 0 0 0 1]

Predictions on test set...
...Done.
[1 0 1 0 0]

f1-score on training set :  0.3
f1-score on test set :  0.3034951763403448


In [42]:
# Declaring instance and training - determining the best parameters thanks to GridsearchCV

params = {'C' : [10**i for i in range(-4,4)],
          "penalty" : ["none", "l2"]}

grid_logreg = GridSearchCV(LogisticRegression(class_weight = 'balanced', max_iter = 1000), param_grid = params, cv = 5)
grid_logreg.fit(X_train, Y_train)

print("Hyperparameters that give the best score : ", grid_logreg.best_params_)
print("Best score : ", grid_logreg.best_score_)



Hyperparameters that give the best score :  {'C': 0.0001, 'penalty': 'l2'}
Best score :  0.7148550241286602


In [43]:
# Predicting

Y_train_pred = grid_logreg.predict(X_train)
Y_test_pred = grid_logreg.predict(X_test)

In [44]:
# Computing accuracy score

print("accuracy-score on train set : ", f1_score(Y_train, Y_train_pred))
print("accuracy-score on test set : ", f1_score(Y_test, Y_test_pred))

accuracy-score on train set :  0.30439269166177685
accuracy-score on test set :  0.30647377353915783


In [45]:
# Computing classification scores

print(classification_report(Y_test, Y_test_pred))

              precision    recall  f1-score   support

           0       0.95      0.72      0.82     27171
           1       0.20      0.67      0.31      2816

    accuracy                           0.71     29987
   macro avg       0.58      0.70      0.56     29987
weighted avg       0.88      0.71      0.77     29987



In [46]:
# Visualizing a confusion matrix

print("confusion matric for Train set")
print(confusion_matrix(Y_train, Y_train_pred) )

print("confusion matric for Test set")
print(confusion_matrix(Y_test, Y_test_pred))

confusion matric for Train set
[[78045 30638]
 [ 3741  7522]]
confusion matric for Test set
[[19510  7661]
 [  920  1896]]


### CONCLUSION

This model is not satisfying enough: thanks to the confusion matrix we can see that it doesn't manage to predict accurately enough if a purchase will be fradulent or not. It often predicts false fraud and often doesn't manage to detect true frauds.