## Productionisation of Machine Learning Models (Model Serialization)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Location of the cleaned dataset used throughout this notebook.
# NOTE(review): absolute, machine-specific path; the cell only runs where this exact file exists.
DATA_CSV_PATH = "C:/Users/user/Downloads/svm_implementation/data_cleaned.csv"

data = pd.read_csv(DATA_CSV_PATH)

# Preview the first rows to sanity-check the load.
data.head()

Unnamed: 0,Survived,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,...,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S
0,0,22.0,7.25,0,0,1,0,1,0,1,...,1,0,0,0,0,0,0,0,0,1
1,1,38.0,71.2833,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,1,0,0
2,1,26.0,7.925,0,0,1,1,0,1,0,...,1,0,0,0,0,0,0,0,0,1
3,1,35.0,53.1,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,0,1
4,0,35.0,8.05,0,0,1,0,1,1,0,...,1,0,0,0,0,0,0,0,0,1


In [3]:
# Report the dataset dimensions as a (rows, columns) tuple.
n_rows, n_cols = data.shape
print((n_rows, n_cols))

(891, 25)


In [4]:
# Display the column labels of the loaded frame (the label column plus every feature column).
data.columns

Index(['Survived', 'Age', 'Fare', 'Pclass_1', 'Pclass_2', 'Pclass_3',
       'Sex_female', 'Sex_male', 'SibSp_0', 'SibSp_1', 'SibSp_2', 'SibSp_3',
       'SibSp_4', 'SibSp_5', 'SibSp_8', 'Parch_0', 'Parch_1', 'Parch_2',
       'Parch_3', 'Parch_4', 'Parch_5', 'Parch_6', 'Embarked_C', 'Embarked_Q',
       'Embarked_S'],
      dtype='object')

In [5]:
# Separate the label column from the feature columns.
TARGET_COLUMN = 'Survived'

X = data.drop(columns=[TARGET_COLUMN])  # every column except the label
y = data[TARGET_COLUMN]                 # the label to predict

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [7]:
# Standardize the training features.
# Note: the scaler is fitted on *every* column of X_train (the indicator columns
# included), not only on the continuous ones.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# fit_transform learns the per-column statistics from the training split and applies them.
train_scaled_values = scaler.fit_transform(X_train)

# Wrap the returned values in a DataFrame again, restoring the original labels and row index.
X_train_rescaled = pd.DataFrame(
    data=train_scaled_values,
    index=X_train.index,
    columns=X_train.columns,
)

X_train_rescaled.head()

Unnamed: 0,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,SibSp_2,...,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S
105,-0.143587,-0.483808,-0.568131,-0.510272,0.905629,-0.727052,0.727052,0.677138,-0.545057,-0.188836,...,0.56352,-0.381835,-0.325515,-0.086842,-0.0548,-0.077615,-0.03872,-0.458416,-0.31414,0.5981
68,-0.992755,-0.483231,-0.568131,-0.510272,0.905629,1.375418,-1.375418,-1.476805,-0.545057,-0.188836,...,-1.774561,-0.381835,3.072051,-0.086842,-0.0548,-0.077615,-0.03872,-0.458416,-0.31414,0.5981
253,0.010807,-0.321651,-0.568131,-0.510272,0.905629,-0.727052,0.727052,-1.476805,1.83467,-0.188836,...,0.56352,-0.381835,-0.325515,-0.086842,-0.0548,-0.077615,-0.03872,-0.458416,-0.31414,0.5981
320,-0.60677,-0.496572,-0.568131,-0.510272,0.905629,-0.727052,0.727052,0.677138,-0.545057,-0.188836,...,0.56352,-0.381835,-0.325515,-0.086842,-0.0548,-0.077615,-0.03872,-0.458416,-0.31414,0.5981
706,1.168764,-0.37304,-0.568131,1.95974,-1.104205,1.375418,-1.375418,0.677138,-0.545057,-0.188836,...,0.56352,-0.381835,-0.325515,-0.086842,-0.0548,-0.077615,-0.03872,-0.458416,-0.31414,0.5981


In [8]:
# Inspect what the fitted scaler learned: the per-column variance (var_) and the
# per-column divisor used for scaling (scale_), one entry per feature column.
scaler.var_, scaler.scale_

(array([1.67802296e+02, 2.55976386e+03, 1.84470132e-01, 1.63908710e-01,
        2.47559522e-01, 2.26224945e-01, 2.26224945e-01, 2.15541970e-01,
        1.76581717e-01, 3.32456345e-02, 1.47459572e-02, 1.90823443e-02,
        5.95216752e-03, 1.03692316e-02, 1.82928305e-01, 1.11054089e-01,
        8.66291369e-02, 7.42900427e-03, 2.98504787e-03, 5.95216752e-03,
        1.49476496e-03, 1.43497436e-01, 8.17526623e-02, 1.94055004e-01]),
 array([1.29538526e+01, 5.05941089e+01, 4.29499862e-01, 4.04856406e-01,
        4.97553536e-01, 4.75631102e-01, 4.75631102e-01, 4.64264978e-01,
        4.20216274e-01, 1.82333854e-01, 1.21432933e-01, 1.38138859e-01,
        7.71502918e-02, 1.01829424e-01, 4.27701187e-01, 3.33247790e-01,
        2.94328281e-01, 8.61916717e-02, 5.46355916e-02, 7.71502918e-02,
        3.86621903e-02, 3.78810555e-01, 2.85924225e-01, 4.40516746e-01]))

In [9]:
# Apply the statistics learned on the training split to the test split
# (transform only; the scaler is not re-fitted here).
test_scaled_values = scaler.transform(X_test)

# Rebuild a labelled DataFrame so the test features keep their column names and row index.
X_test_rescaled = pd.DataFrame(
    data=test_scaled_values,
    index=X_test.index,
    columns=X_test.columns,
)

X_test_rescaled.head()

Unnamed: 0,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,SibSp_2,...,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S
495,-0.01242,-0.354099,-0.568131,-0.510272,0.905629,-0.727052,0.727052,0.677138,-0.545057,-0.188836,...,0.56352,-0.381835,-0.325515,-0.086842,-0.0548,-0.077615,-0.03872,2.181426,-0.31414,-1.671961
648,-0.01242,-0.490643,-0.568131,-0.510272,0.905629,-0.727052,0.727052,0.677138,-0.545057,-0.188836,...,0.56352,-0.381835,-0.325515,-0.086842,-0.0548,-0.077615,-0.03872,-0.458416,-0.31414,0.5981
278,-1.764726,-0.064209,-0.568131,-0.510272,0.905629,-0.727052,0.727052,-1.476805,-0.545057,-0.188836,...,-1.774561,2.618936,-0.325515,-0.086842,-0.0548,-0.077615,-0.03872,-0.458416,3.18329,-1.671961
31,-0.01242,2.256136,1.760159,-0.510272,-1.104205,1.375418,-1.375418,-1.476805,1.83467,-0.188836,...,0.56352,-0.381835,-0.325515,-0.086842,-0.0548,-0.077615,-0.03872,2.181426,-0.31414,-1.671961
255,-0.06639,-0.338534,-0.568131,-0.510272,0.905629,1.375418,-1.375418,0.677138,-0.545057,-0.188836,...,-1.774561,-0.381835,3.072051,-0.086842,-0.0548,-0.077615,-0.03872,2.181426,-0.31414,-1.671961


### Training KNN Classifier

In [10]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours with the library's default hyperparameters,
# fitted on the rescaled training split.
knn_classifier = KNeighborsClassifier()
knn_classifier.fit(X=X_train_rescaled, y=y_train)

In [11]:
# Predict labels for the held-out rows with the KNN model.
y_test_pred = knn_classifier.predict(X=X_test_rescaled)

In [12]:
# Fraction of held-out rows whose predicted label matches the true label (KNN).
from sklearn import metrics

metrics.accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.7488789237668162

### Training Logistic Regression

In [13]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the library's default hyperparameters,
# fitted on the rescaled training split.
lr_classifier = LogisticRegression()
lr_classifier.fit(X=X_train_rescaled, y=y_train)

In [14]:
# Predict labels for the held-out rows with the logistic-regression model
# (overwrites the predictions of the previous model).
y_test_pred = lr_classifier.predict(X=X_test_rescaled)

In [15]:
# Fraction of held-out rows whose predicted label matches the true label (logistic regression).
from sklearn import metrics

metrics.accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.8026905829596412

### Training Naive Bayes

In [16]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with the library's default hyperparameters,
# fitted on the rescaled training split.
nb_classifier = GaussianNB()
nb_classifier.fit(X=X_train_rescaled, y=y_train)

In [17]:
# Predict labels for the held-out rows with the naive-Bayes model
# (overwrites the predictions of the previous model).
y_test_pred = nb_classifier.predict(X=X_test_rescaled)

In [18]:
# Fraction of held-out rows whose predicted label matches the true label (naive Bayes).
# NOTE(review): the output recorded for this cell is ~0.39, far below the other
# models in this notebook; worth investigating before this model is shipped.
from sklearn import metrics

metrics.accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.39461883408071746

### Training Decision Tree

In [19]:
from sklearn.tree import DecisionTreeClassifier

# Shallow decision tree (depth capped at 3), fitted on the rescaled training split.
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at each split; without a seed, ties between equally good splits can be
# broken differently from run to run, so the fitted (and later pickled) tree
# would not be reproducible. The seed matches the one used for the train/test split.
dt_classifier = DecisionTreeClassifier(max_depth=3, random_state=0)
dt_classifier.fit(X_train_rescaled, y_train)

In [20]:
# Predict labels for the held-out rows with the decision-tree model
# (overwrites the predictions of the previous model).
y_test_pred = dt_classifier.predict(X=X_test_rescaled)

In [21]:
# Fraction of held-out rows whose predicted label matches the true label (decision tree).
from sklearn import metrics

metrics.accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.8026905829596412

### Training Support Vector Classifier

In [22]:
from sklearn.svm import SVC

# Support vector classifier with the library's default hyperparameters,
# fitted on the rescaled training split.
sv_classifier = SVC()
sv_classifier.fit(X=X_train_rescaled, y=y_train)

In [23]:
# Predict labels for the held-out rows with the support-vector model
# (overwrites the predictions of the previous model).
y_test_pred = sv_classifier.predict(X=X_test_rescaled)

In [24]:
# Fraction of held-out rows whose predicted label matches the true label (support vector classifier).
from sklearn import metrics

metrics.accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.7892376681614349

### Saving the Model (Serialization)

In [25]:
from pickle import dump

# Directory that receives the serialized artefacts.
# NOTE(review): absolute, machine-specific location; the directory must already
# exist, otherwise open() raises FileNotFoundError.
MODELS_DIR = 'C:/Users/user/Desktop/Internship Data Science 2023/titanic_survival_prediction/models'

# File name -> fitted object. The scaler is saved alongside the classifiers because
# every classifier above was trained on the scaler's output, so whoever loads a
# model needs the same fitted scaler to prepare its inputs.
artifacts = {
    'standard_scaler.pkl': scaler,
    'knn_model.pkl': knn_classifier,
    'lr_model.pkl': lr_classifier,
    'nb_model.pkl': nb_classifier,
    'dt_model.pkl': dt_classifier,
    'sv_model.pkl': sv_classifier,
}

for file_name, fitted_object in artifacts.items():
    # The context manager closes (and therefore flushes) each file as soon as it is
    # written; the previous bare open(...) calls never closed their handles.
    with open(f'{MODELS_DIR}/{file_name}', 'wb') as pickle_file:
        dump(fitted_object, pickle_file)

# Security note: pickle files execute code when loaded; only unpickle these
# artefacts from a trusted location.