# Machine Learning refresher

## Loading the data

In [1]:
import pandas as pd

In [2]:
# Read the training CSV from the working directory into a pandas DataFrame
df = pd.read_csv(filepath_or_buffer='train.csv')

In [3]:
# Preview the top five rows to sanity-check the load
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Splitting the data

In [4]:
# Import the train_test_split function
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. Stratifying on 'Survived' keeps the
# class proportions the same in both splits; random_state makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['Survived']),
    df['Survived'],
    test_size=0.2,
    stratify=df['Survived'],
    random_state=0,
)

## Preparing quantitative data

In [5]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [6]:
# Names of the numeric features used by the model
quanti_columns = ['Pclass', 'Age', 'Fare', 'SibSp', 'Parch']
# Keep only those columns, for the training and the test split alike
X_train_quanti, X_test_quanti = X_train[quanti_columns], X_test[quanti_columns]

In [7]:
# Imputer that fills missing numeric values with the per-column mean
quanti_imputer = SimpleImputer(
    strategy='mean',
)

In [8]:
# Learn the fill values from the training split only, so that no
# information from the test split leaks into the preprocessing
quanti_imputer.fit(X_train_quanti)
# Apply the learned fill values to both splits
X_train_quanti = quanti_imputer.transform(X_train_quanti)
X_test_quanti = quanti_imputer.transform(X_test_quanti)

In [9]:
# Instantiate the standard scaler with its default settings;
# it is fitted on the training data in the next cell
scaler = StandardScaler()

In [10]:
# Learn the scaling statistics from the training split only
scaler.fit(X_train_quanti)
# Standardise both splits with those training statistics
X_train_quanti = scaler.transform(X_train_quanti)
X_test_quanti = scaler.transform(X_test_quanti)

In [11]:
 Display the number of missing data for each column
X_train[quanti_columns].isna().sum()#

Pclass      0
Age       146
Fare        0
SibSp       0
Parch       0
dtype: int64

## Preparing qualitative data

In [11]:
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [12]:
# Names of the categorical features used by the model
quali_columns = ['Sex', 'Embarked']
# Keep only the qualitative columns, for the training and the test split alike
X_train_quali, X_test_quali = X_train[quali_columns], X_test[quali_columns]

In [13]:
# Imputer that fills missing categorical values with the column's most frequent value
quali_imputer = SimpleImputer(
    strategy='most_frequent',
)

In [14]:
# Learn the most frequent category of each column from the training split only
quali_imputer.fit(X_train_quali)
# Fill the gaps in both splits with those training-derived values
X_train_quali = quali_imputer.transform(X_train_quali)
X_test_quali = quali_imputer.transform(X_test_quali)

In [15]:
# One-hot encoder for the categorical features:
#   drop='first'            -> omit one indicator column per feature
#   handle_unknown='ignore' -> do not raise on categories unseen during fit
encoder = OneHotEncoder(
    drop='first',
    handle_unknown='ignore',
)

In [16]:
# Learn the categories from the training split only
encoder.fit(X_train_quali)
# Encode both splits; .toarray() turns the encoder output into dense arrays
X_train_quali = encoder.transform(X_train_quali).toarray()
X_test_quali = encoder.transform(X_test_quali).toarray()

In [17]:
# Stack the numeric and encoded categorical features side by side (column-wise).
# Note: this rebinds X_train / X_test from DataFrames to NumPy arrays, so the
# earlier cells that select columns by name cannot be re-run after this one.
X_train = np.hstack([X_train_quanti, X_train_quali])
X_test = np.hstack([X_test_quanti, X_test_quali])

### There's more

In [18]:
import pickle

# Persist the prepared splits as a single (X_train, X_test, y_train, y_test) tuple.
# The context manager guarantees the file is flushed and closed even if
# pickling fails, instead of leaking the handle returned by open().
with open('prepared_titanic.pkl', 'wb') as pickle_file:
    pickle.dump((X_train, X_test, y_train, y_test), pickle_file)

## Model training

In [19]:
from sklearn.linear_model import LogisticRegression

In [20]:
# Instantiate the model: a logistic regression classifier with
# scikit-learn's default hyperparameters
lr = LogisticRegression()

In [21]:
# Train the classifier on the prepared training features and labels
lr.fit(X=X_train, y=y_train)

LogisticRegression()

In [22]:
# Predict labels for the held-out test features and keep them for evaluation
y_pred = lr.predict(X=X_test)

## Model evaluation

In [23]:
from sklearn.metrics import accuracy_score
# Compute the accuracy on test of our model.
# scikit-learn metrics expect the ground truth first: accuracy_score(y_true, y_pred).
# Accuracy is symmetric so the value is unchanged, but the conventional order
# avoids silent errors if this line is adapted to an asymmetric metric.
print('accuracy on test set:', accuracy_score(y_test, y_pred))

accuracy on test set: 0.7877094972067039


## Hyperparameter optimization

In [24]:
from sklearn.model_selection import GridSearchCV

In [25]:
# Search space for the grid search: candidate values of the `C` hyperparameter
param_grid = dict(C=[0.01, 0.03, 0.1])

In [26]:
# Build a 5-fold cross-validated grid search over `param_grid`, scored on
# accuracy; training-fold scores are recorded as well
grid = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    return_train_score=True,
)

In [27]:
# Run the search: fits one model per candidate and per fold, so this can take a while
grid.fit(X=X_train, y=y_train)

GridSearchCV(cv=5, estimator=LogisticRegression(),
             param_grid={'C': [0.01, 0.03, 0.1]}, return_train_score=True,
             scoring='accuracy')

In [28]:
# Predict on the test features with the fitted grid search
# (this overwrites the predictions of the untuned model)
y_pred = grid.predict(X=X_test)

In [29]:
# Test accuracy of the tuned model. Ground truth goes first, as in
# accuracy_score(y_true, y_pred); the value is unchanged because accuracy is symmetric.
print('Hyperparameter optimized accuracy:', accuracy_score(y_test, y_pred))

Hyperparameter optimized accuracy: 0.7821229050279329
