# PART II. MODEL BUILDING - SVM

### PREPROCESSING

In [1]:
# Useful libraries

import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, confusion_matrix, precision_score, recall_score, classification_report
from sklearn.svm import SVC

In [2]:
# Loading the modelling dataset from a CSV file (path is relative to the working directory)
DATASET_PATH = "Dataset_for_model_building.csv"
dataset = pd.read_csv(DATASET_PATH)

In [3]:
# Separating the target column 'class' (Y) from the explanatory features (X)

target_name = "class"
Y = dataset[target_name]
X = dataset.drop(columns = [target_name])

# Previewing the first rows of the target and of the feature table
display(Y.head())
print()
display(X.head())

0    0
1    0
2    1
3    0
4    0
Name: class, dtype: int64




Unnamed: 0,purchase_value,source,browser,sex,age,country,month
0,34,SEO,Chrome,M,39,Japan,4
1,16,Ads,Chrome,F,53,United States,6
2,15,SEO,Opera,M,53,United States,1
3,44,SEO,Safari,M,41,Unknown country,5
4,39,Ads,Safari,M,45,United States,9


In [4]:
# One Hot Encoding for categorical variables of X
# (get_dummies only expands the string/categorical columns; numeric columns are kept as they are).
# dtype = int pins the indicator columns to 0/1 integers: without it, pandas >= 2.0
# returns booleans, so the preview would show True/False instead of the 0/1 values
# this notebook was written against.

X = pd.get_dummies(X, dtype = int)
X.head()

Unnamed: 0,purchase_value,age,month,source_Ads,source_Direct,source_SEO,browser_Chrome,browser_FireFox,browser_IE,browser_Opera,...,country_Tunisia,country_Turkey,country_Ukraine,country_United Arab Emirates,country_United Kingdom,country_United States,country_Unknown country,country_Uruguay,country_Venezuela,country_Viet Nam
0,34,39,4,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,16,53,6,1,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,15,53,1,0,0,1,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
3,44,41,5,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,39,45,9,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


# Converting X to a NumPy array and Y to a plain Python list
X, Y = X.to_numpy(), Y.tolist()

In [5]:
# Holding out 20% of the rows as a test set.
# Stratifying on Y keeps the class proportions similar in both splits,
# and the fixed random_state makes the split repeatable.

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size = 0.2,
    stratify = Y,
    random_state = 42,
)

In [6]:
# Standardizing the features: the scaler statistics are learned on the train set only
# and the same fitted scaler is then applied to both the train and the test set

sc_x = StandardScaler().fit(X_train)
X_train, X_test = sc_x.transform(X_train), sc_x.transform(X_test)

# Preprocessing
# Builds a ColumnTransformer that standardizes the numeric columns and one-hot encodes
# the categorical ones, fits it on X_train and applies the fitted transformer to X_test.
# NOTE(review): this block carries no `In [n]` execution marker in this view, and its
# positional column indices appear to target the raw 7-column feature table
# (before get_dummies). X_train / X_test have already been one-hot encoded and
# standardized by the cells above, so running this block after them would re-encode
# already-processed columns. Presumably it is a disabled alternative to those cells
# - TODO confirm before running it.

# Creating pipeline for numeric features
# Positional indices; presumably purchase_value, age and month of the raw features - verify
numeric_features = [0, 4, 6]
numeric_transformer = Pipeline(steps = [
    ('scaler', StandardScaler())
])

# Creating pipeline for categorical features
# Positional indices; presumably source, browser, sex and country of the raw features - verify
categorical_features = [1, 2, 3, 5]
categorical_transformer = Pipeline(steps = [
    # drop = 'first' removes the first category of each feature from the encoded output
    ('encoder', OneHotEncoder(drop = 'first'))
])

# Transform

# No `remainder` argument is passed, so columns not listed above are dropped
# (scikit-learn's documented default for ColumnTransformer)
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Fit the transformer on the train set and replace X_train with its transformed version;
# the array is printed before and after for inspection
print("Performing on train set : ")
print(X_train)
X_train = preprocessor.fit_transform(X_train)
print("Done")
print(X_train)

# Apply the already-fitted transformer to the test set (no refit on test data)
print("Performing on test set : ")
print(X_test)
X_test = preprocessor.transform(X_test)
print("Done")
print(X_test)

### SVM

In [7]:
# Support vector classifier with default settings, except that class weights are
# 'balanced' so that errors on the less frequent class weigh more during training
clf_svc = SVC(class_weight = "balanced")

In [8]:
# Fitting the SVM on the training features and their labels
# (the fitted estimator is the cell's last expression, so it is displayed)

clf_svc.fit(X = X_train, y = Y_train)

SVC(class_weight='balanced')

In [9]:
# Predicted class labels for the train and test sets, using the fitted classifier

Y_train_pred, Y_test_pred = clf_svc.predict(X_train), clf_svc.predict(X_test)

In [10]:
# Computing the f1 score on each split; comparing train and test shows how well
# the model generalizes

f1_inputs = [
    ("f1 score on train set is : ", Y_train, Y_train_pred),
    ("f1 score on test set is : ", Y_test, Y_test_pred),
]
for label, y_true, y_pred in f1_inputs:
    print(label, f1_score(y_true, y_pred))

f1 score on train set is :  0.5111816141762748
f1 score on test set is :  0.5046239210850801


In [11]:
# Per-class precision, recall and f1-score on the test set

test_report = classification_report(Y_test, Y_test_pred)
print(test_report)

              precision    recall  f1-score   support

           0       0.96      0.93      0.94     27171
           1       0.45      0.58      0.50      2816

    accuracy                           0.89     29987
   macro avg       0.70      0.75      0.72     29987
weighted avg       0.91      0.89      0.90     29987



In [12]:
# Printing the confusion matrix of each split
# (rows are the true classes, columns the predicted classes)

cm_inputs = [
    ("confusion matric for Train set", Y_train, Y_train_pred),
    ("confusion matric for Test set", Y_test, Y_test_pred),
]
for heading, y_true, y_pred in cm_inputs:
    print(heading)
    print(confusion_matrix(y_true, y_pred))

confusion matric for Train set
[[100706   7977]
 [  4657   6606]]
confusion matric for Test set
[[25136  2035]
 [ 1179  1637]]


### CONCLUSION

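This SVM model doesn't improve prediction: on the test set, the F1 score for the positive class (1) is only about 0.50 (precision 0.45, recall 0.58), and the train F1 score (0.51) is almost the same, so the weak result is not caused by overfitting.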