<a href="https://colab.research.google.com/github/RobSpiewakowski/Public_repository/blob/main/Kaggle_ML_Project_with_scikit_learn_Titanic_survivors.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Rules: Use machine learning to create a model that predicts which passengers survived the Titanic shipwreck.

# 1. Setup an environment and download data:

In [11]:
# Module import:
import pandas as pd
import plotly.express as px
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import RidgeClassifier
from sklearn import pipeline
from sklearn import model_selection
from pprint import pprint

In [12]:
# Installing kaggle and downloading data:
!pip install -q kaggle
from google.colab import files
files.upload()
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle
! chmod 600 ~/.kaggle/kaggle.json
! kaggle competitions download -c titanic

Saving kaggle.json to kaggle (1).json
mkdir: cannot create directory ‘/root/.kaggle’: File exists
test.csv: Skipping, found more recently modified local copy (use --force to force download)
train.csv: Skipping, found more recently modified local copy (use --force to force download)
gender_submission.csv: Skipping, found more recently modified local copy (use --force to force download)


In [13]:
# Downloading datasets:
train_data = pd.read_csv('/content/train.csv')
test_data = pd.read_csv('/content/test.csv')

# Description:
print("Train data:", '\n', train_data.info())
print('-------------------------------------------------------------------------------------')
print("Test data:", '\n', test_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
Train data: 
 None
-------------------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  

# 2. Data cleaning and simple analysis:

Description of variables:

* Survived: 0 = dead, 1 = survived;
* Pclass: Ticket class;
* Sex: male or female;
* Age: age in years;
* sibsp: # of biblings / spouses aboard the Titanic;
* parch: # of parents / children aboard the Titanic;
* ticket: ticket number;
* fare: Passenger fare;
* cabin: cabin number;
* embarked: port of embarkation: C = Cherbourg, Q = Queenstown, S = Southampton

In [14]:
# Distribution of numerical variables:
num_variables = ["Age", "SibSp", "Parch", "Fare"]

for i in range(len(num_variables)):
  plot = px.histogram(train_data, x = num_variables[i], width = 1000, height = 300, title = f'Distribution of variable: {num_variables[i]}')
  plot.show()
  print('----------------------------------------------------------------------------------------------------------------------------------')

----------------------------------------------------------------------------------------------------------------------------------


----------------------------------------------------------------------------------------------------------------------------------


----------------------------------------------------------------------------------------------------------------------------------


----------------------------------------------------------------------------------------------------------------------------------


In [15]:
# Drop all the passangers with unknown age:
train_data = train_data[train_data['Age'] > 0]
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  714 non-null    int64  
 1   Survived     714 non-null    int64  
 2   Pclass       714 non-null    int64  
 3   Name         714 non-null    object 
 4   Sex          714 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        714 non-null    int64  
 7   Parch        714 non-null    int64  
 8   Ticket       714 non-null    object 
 9   Fare         714 non-null    float64
 10  Cabin        185 non-null    object 
 11  Embarked     712 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 72.5+ KB


In [16]:
# Drop the unimportant rows from both datasets:
train_data = train_data.drop(columns = ['Cabin', 'Name', 'PassengerId', 'Ticket'])
test_data = test_data.drop(columns = ['Cabin', 'Name', 'PassengerId', 'Ticket'])

train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  714 non-null    int64  
 1   Pclass    714 non-null    int64  
 2   Sex       714 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     714 non-null    int64  
 5   Parch     714 non-null    int64  
 6   Fare      714 non-null    float64
 7   Embarked  712 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 50.2+ KB


In [17]:
# Relationship between variables:
kolumny = pd.Index(train_data.columns).tolist()
plot = px.scatter_matrix(train_data, dimensions = kolumny[1:-1], color = 'Survived')
plot.update_layout(autosize = False, width = 1000, height = 1000)
plot.show()

In [18]:
# Correlation between features:
train_data.corr().style.background_gradient(cmap = 'RdBu_r')

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
Survived,1.0,-0.359653,-0.077221,-0.017358,0.093317,0.268189
Pclass,-0.359653,1.0,-0.369226,0.067247,0.025683,-0.554182
Age,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.017358,0.067247,-0.308247,1.0,0.38382,0.138329
Parch,0.093317,0.025683,-0.189119,0.38382,1.0,0.205119
Fare,0.268189,-0.554182,0.096067,0.138329,0.205119,1.0


# 3. Data pre-processing:

In [20]:
for i in range(len(kolumny)):
  plot = px.box(train_data, x = train_data[kolumny[i]], orientation = 'h', width = 800, height = 200)
  plot.show()
  print('--------------------------------------------------------------------------------------------------------')

--------------------------------------------------------------------------------------------------------


--------------------------------------------------------------------------------------------------------


--------------------------------------------------------------------------------------------------------


--------------------------------------------------------------------------------------------------------


--------------------------------------------------------------------------------------------------------


--------------------------------------------------------------------------------------------------------


--------------------------------------------------------------------------------------------------------


--------------------------------------------------------------------------------------------------------


In [21]:
# Create a IQR_outliers() function, to find outliers:
def IQR_outliers(a, b, c):
  Q1, Q3 = np.percentile(a, [b, c])
  IQR = Q3 - Q1

  upper_limit = Q3 + (1.5 * IQR)
  lower_limit = Q1 - (1.5 * IQR)
  print('Upper limit = ', upper_limit, '\n', 'Lower limit = ', lower_limit)

  indexes = np.where((a > upper_limit) | (a < lower_limit))
  b = np.full(shape = a.shape[0], fill_value = 1)
  b[indexes] = -1

  return b

In [23]:
# We will use this function od: 'Age' and 'Fare':
# Age:
train_data['IQR flag'] = IQR_outliers(train_data['Age'], 10, 75)

plot_age = px.scatter(train_data, x = 'Age', y = 'Survived', color = 'IQR flag', width = 800, height = 200)
plot_age.show()

train_data = train_data[train_data['IQR flag'] != -1]

Upper limit =  74.0 
 Lower limit =  -22.0


In [24]:
# Fare:
train_data['IQR flag'] = IQR_outliers(train_data['Fare'], 10, 90)

plot_fare = px.scatter(train_data, x = 'Fare', y = 'Survived', color = 'IQR flag', width = 800, height = 200)
plot_fare.show()

train_data = train_data[train_data['IQR flag'] != -1]

Upper limit =  187.2750000000001 
 Lower limit =  -99.96500000000005


In [25]:
train_data = train_data.drop(columns = ['IQR flag'])
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 695 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  695 non-null    int64  
 1   Pclass    695 non-null    int64  
 2   Sex       695 non-null    object 
 3   Age       695 non-null    float64
 4   SibSp     695 non-null    int64  
 5   Parch     695 non-null    int64  
 6   Fare      695 non-null    float64
 7   Embarked  693 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 48.9+ KB


In [26]:
train_data = pd.get_dummies(train_data, columns = ['Sex', 'Embarked'], prefix_sep = ': ')
train_data

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex: female,Sex: male,Embarked: C,Embarked: Q,Embarked: S
0,0,3,22.0,1,0,7.2500,0,1,0,0,1
1,1,1,38.0,1,0,71.2833,1,0,1,0,0
2,1,3,26.0,0,0,7.9250,1,0,0,0,1
3,1,1,35.0,1,0,53.1000,1,0,0,0,1
4,0,3,35.0,0,0,8.0500,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
885,0,3,39.0,0,5,29.1250,1,0,0,1,0
886,0,2,27.0,0,0,13.0000,0,1,0,0,1
887,1,1,19.0,0,0,30.0000,1,0,0,0,1
889,1,1,26.0,0,0,30.0000,0,1,1,0,0


In [27]:
# Correlation between features, again:
train_data.corr().style.background_gradient(cmap = 'RdBu_r')

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex: female,Sex: male,Embarked: C,Embarked: Q,Embarked: S
Survived,1.0,-0.344468,-0.08641,-0.021992,0.096304,0.299234,0.53066,-0.53066,0.17898,-0.046077,-0.150874
Pclass,-0.344468,1.0,-0.375999,0.083574,0.061212,-0.677547,-0.139789,0.139789,-0.25139,0.128208,0.180143
Age,-0.08641,-0.375999,1.0,-0.307319,-0.202448,0.138846,-0.090898,0.090898,0.037244,-0.021267,-0.033904
SibSp,-0.021992,0.083574,-0.307319,1.0,0.372732,0.192075,0.09853,-0.09853,-0.041877,0.054486,0.016177
Parch,0.096304,0.061212,-0.202448,0.372732,1.0,0.206457,0.25476,-0.25476,-0.018866,-0.00491,0.023124
Fare,0.299234,-0.677547,0.138846,0.192075,0.206457,1.0,0.223598,-0.223598,0.25988,-0.066931,-0.218407
Sex: female,0.53066,-0.139789,-0.090898,0.09853,0.25476,0.223598,1.0,-1.0,0.090536,0.030038,-0.106879
Sex: male,-0.53066,0.139789,0.090898,-0.09853,-0.25476,-0.223598,-1.0,1.0,-0.090536,-0.030038,0.106879
Embarked: C,0.17898,-0.25139,0.037244,-0.041877,-0.018866,0.25988,0.090536,-0.090536,1.0,-0.093128,-0.870093
Embarked: Q,-0.046077,0.128208,-0.021267,0.054486,-0.00491,-0.066931,0.030038,-0.030038,-0.093128,1.0,-0.392211


In order to be able to validate a model, we have to do one more train/test split. We are going to split a train dataset.

In [28]:
# Train/test split of train data:
X = train_data.drop(columns = 'Survived')
y = train_data['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y.values, test_size = 0.1, random_state = 2000)

# Shapes of sets:
print('X_train: ', X_train.shape, ', X_test: ', X_test.shape, '\n', 'y_train: ', y_train.shape, ', y_test', y_test.shape)

X_train:  (625, 10) , X_test:  (70, 10) 
 y_train:  (625,) , y_test (70,)


In [30]:
# Standarization of expository variables:

scaler = StandardScaler()

X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.fit_transform(X_test)

train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 695 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Survived     695 non-null    int64  
 1   Pclass       695 non-null    int64  
 2   Age          695 non-null    float64
 3   SibSp        695 non-null    int64  
 4   Parch        695 non-null    int64  
 5   Fare         695 non-null    float64
 6   Sex: female  695 non-null    uint8  
 7   Sex: male    695 non-null    uint8  
 8   Embarked: C  695 non-null    uint8  
 9   Embarked: Q  695 non-null    uint8  
 10  Embarked: S  695 non-null    uint8  
dtypes: float64(2), int64(4), uint8(5)
memory usage: 41.4 KB


In [31]:
# Test dataset pre_processing:
print(test_data.info())

test_data = pd.get_dummies(test_data.fillna(test_data['Age'].mean()), columns = ['Sex', 'Embarked'], prefix_sep = ': ')
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Sex       418 non-null    object 
 2   Age       332 non-null    float64
 3   SibSp     418 non-null    int64  
 4   Parch     418 non-null    int64  
 5   Fare      417 non-null    float64
 6   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 23.0+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Pclass       418 non-null    int64  
 1   Age          418 non-null    float64
 2   SibSp        418 non-null    int64  
 3   Parch        418 non-null    int64  
 4   Fare         418 non-null    float64
 5   Sex: female  418 non-null    uint8  
 6   Sex: male    418 non-null    ui

# 4. ML Model:

During this part, we will use and validate 5 models of classification:

* LogisticRegression(),
* KNeighboursClassifier(),
* RandomForestClassifier(),
* SGDClassifier(),
* RidgeClassifier().

Our goal is to achieve accuracy score no lower than 80 %.

In [32]:
ML_models = {'Logistic regression': LogisticRegression(),
             'K neighbours': KNeighborsClassifier(),
             'Random forest': RandomForestClassifier(),
             'SGD': SGDClassifier(),
             'Ridge classifier': RidgeClassifier()}

# Prediction with all models with default parameters:
for key in ML_models:
  model = ML_models[key]
  model.fit(X_train_std, y_train)

  y_pred = model.predict(X_test_std)

  print('Used model: ', model)
  acc = accuracy_score(y_test, y_pred)
  print(f'Accuracy score is = {round(acc, 4)}')
  print('-----------------------------------------------------------------------------------------------------------------')

Used model:  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
Accuracy score is = 0.8
-----------------------------------------------------------------------------------------------------------------
Used model:  KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')
Accuracy score is = 0.7714
-----------------------------------------------------------------------------------------------------------------
Used model:  RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                  

Cross-validation of all models:

In [33]:
ML_models = {'Logistic regression': LogisticRegression(),
             'K neighbours': KNeighborsClassifier(),
             'Random forest': RandomForestClassifier(),
             'SGD': SGDClassifier(),
             'Ridge classifier': RidgeClassifier()}

for key in ML_models:
  scaler = StandardScaler()                                   # Standarization of data
  model = ML_models[key]                                      # Model selected
  proces = pipeline.make_pipeline(scaler, model)
  walidacja = model_selection.KFold(n_splits = 10,
                                    shuffle = True,
                                    random_state = 2000)
  
  walidacja_acc = model_selection.cross_val_score(proces,
                                                  X_train,
                                                  y_train,
                                                  cv = walidacja,
                                                  scoring = 'accuracy',
                                                  n_jobs = -1)
  
  print('Validated model: ', model)
  print(f'Accuracy score for this model = ', walidacja_acc.mean())
  print('-----------------------------------------------------------------------------------------------------------------------')

Validated model:  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
Accuracy score for this model =  0.7919866871479775
-----------------------------------------------------------------------------------------------------------------------
Validated model:  KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')
Accuracy score for this model =  0.8144649257552483
-----------------------------------------------------------------------------------------------------------------------
Validated model:  RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                

Optimalization of all models:

In [34]:
# Optimalization of LogisticRegression() model:
model = LogisticRegression()

# Parameter grid:
parameter_grid = [{'dual': [True, False],
                   'l1_ratio': [0.0, 0.25, 0.5, 0.75, 1.0],
                   'max_iter': [10, 50, 100, 500],
                   'multi_class': ['auto', 'ovr', 'multinominal'],
                   'penalty': ['l1', 'l2', 'elasticnet', 'none'],
                   'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
                   'tol': [0.0001, 0.001, 0.1]}]

# validation of model:
walidacja = model_selection.KFold(n_splits = 10, shuffle = True, random_state = 2000)

# Testing of parameter grid:
konfiguracja = pipeline.make_pipeline(StandardScaler(),
                                      model_selection.GridSearchCV(model,
                                                                   parameter_grid,
                                                                   cv = walidacja,
                                                                   verbose = 5,
                                                                   scoring = 'accuracy',
                                                                   n_jobs = -1))

# Training a ML_model:
konfiguracja.fit(X_train_std, y_train)

konfig_grid = konfiguracja.named_steps['gridsearchcv']

# Display best params:
konfig_grid.best_params_

Fitting 10 folds for each of 7200 candidates, totalling 72000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 2424 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done 8184 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done 16248 tasks      | elapsed:   16.4s
[Parallel(n_jobs=-1)]: Done 26616 tasks      | elapsed:   26.5s
[Parallel(n_jobs=-1)]: Done 39288 tasks      | elapsed:   44.8s
[Parallel(n_jobs=-1)]: Done 52896 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 64672 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 72000 out of 72000 | elapsed:  2.5min finished

l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=none)



{'dual': False,
 'l1_ratio': 1.0,
 'max_iter': 10,
 'multi_class': 'ovr',
 'penalty': 'none',
 'solver': 'saga',
 'tol': 0.1}

In [35]:
# Prediction using this model with best params:
best_model = konfig_grid.best_estimator_

y_pred = best_model.predict(X_test_std)

acc = accuracy_score(y_test, y_pred)
print(f'Accuracy score for this prediction: {round(acc, 4)}')

Accuracy score for this prediction: 0.8


KNeighboursClassifier()

In [36]:
# Optimalization of KNeighboursClassifier() model:
model = KNeighborsClassifier()

# Parameter grid:
parameter_grid = [{'weights': ['uniform', 'distance'],
                   'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                   'leaf_size': list(range(100)),
                   'p': [1, 2]}]

# validation of model:
walidacja = model_selection.KFold(n_splits = 10, shuffle = True, random_state = 2000)

# Testing of parameter grid:
konfiguracja = pipeline.make_pipeline(StandardScaler(),
                                      model_selection.GridSearchCV(model,
                                                                   parameter_grid,
                                                                   cv = walidacja,
                                                                   verbose = 5,
                                                                   scoring = 'accuracy',
                                                                   n_jobs = -1))

# Training a ML_model:
konfiguracja.fit(X_train_std, y_train)

konfig_grid = konfiguracja.named_steps['gridsearchcv']

# Display best params:
konfig_grid.best_params_

Fitting 10 folds for each of 1600 candidates, totalling 16000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  44 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 892 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 2332 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done 4604 tasks      | elapsed:   16.7s
[Parallel(n_jobs=-1)]: Done 9788 tasks      | elapsed:   35.0s
[Parallel(n_jobs=-1)]: Done 16000 out of 16000 | elapsed:   54.1s finished


{'algorithm': 'auto', 'leaf_size': 1, 'p': 2, 'weights': 'uniform'}

In [37]:
# Prediction using this model with best params:
best_model = konfig_grid.best_estimator_

y_pred = best_model.predict(X_test_std)

acc = accuracy_score(y_test, y_pred)
print(f'Accuracy score for this prediction: {round(acc, 4)}')

Accuracy score for this prediction: 0.7714


RandomForestClassifier()

In [38]:
# Optimalization of RandomForestClassifier() model:
model = RandomForestClassifier()

# Parameter grid:
parameter_grid = [{'n_estimators': [10, 50, 100, 500],
                   'criterion': ['gini', 'entropy'],
                   'min_samples_split': list(range(10)),
                   'max_features': ['auto', 'sqrt', 'log2'],
                   'oob_score': [True, False],
                   'class_weight': ['balanced', 'balanced_subsample']}]

# validation of model:
walidacja = model_selection.KFold(n_splits = 10, shuffle = True, random_state = 2000)

# Testing of parameter grid:
konfiguracja = pipeline.make_pipeline(StandardScaler(),
                                      model_selection.GridSearchCV(model,
                                                                   parameter_grid,
                                                                   cv = walidacja,
                                                                   verbose = 5,
                                                                   scoring = 'accuracy',
                                                                   n_jobs = -1))

# Training a ML_model:
konfiguracja.fit(X_train_std, y_train)

konfig_grid = konfiguracja.named_steps['gridsearchcv']

# Display best params:
konfig_grid.best_params_

Fitting 10 folds for each of 960 candidates, totalling 9600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  44 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 188 tasks      | elapsed:   11.6s
[Parallel(n_jobs=-1)]: Done 300 tasks      | elapsed:   38.0s
[Parallel(n_jobs=-1)]: Done 452 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 1048 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 1332 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 1754 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 2196 tasks      | elapsed:  8.1min
[Parallel(n_jobs=-1)]: Done 2716 tasks      | elapsed: 10.1min
[Parallel(n_jobs=-1)]: Done 3194 tasks      | elapsed: 12.4min
[Parallel(n_jobs=-1)]: Done 3882 tasks      | elapsed: 15.1min
[Parallel(n_jobs=-1)]: Done 4586 tasks      | elapsed: 18.0min
[Parallel(n_jobs=-1)]: Done 5368 tasks      | elapsed: 21.5min
[Parallel(n_jobs=-1)]: Done 6162 tasks      | e

{'class_weight': 'balanced',
 'criterion': 'entropy',
 'max_features': 'log2',
 'min_samples_split': 8,
 'n_estimators': 10,
 'oob_score': True}

In [39]:
# Prediction using this model with best params:
best_model = konfig_grid.best_estimator_

y_pred = best_model.predict(X_test_std)

acc = accuracy_score(y_test, y_pred)
print(f'Accuracy score for this prediction: {round(acc, 4)}')

Accuracy score for this prediction: 0.7429


SGDClassifier()

In [40]:
# Optimalization of SGDClassifier() model:
model = SGDClassifier()

# Parameter grid:
parameter_grid = [{'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                   'penalty': ['l2', 'l1', 'elasticnet'],
                   'alpha': [0.0001, 0.001, 0.01],
                   'fit_intercept': [True, False],
                   'max_iter': [10, 100, 1000, 10000]}]

# validation of model:
walidacja = model_selection.KFold(n_splits = 10, shuffle = True, random_state = 2000)

# Testing of parameter grid:
konfiguracja = pipeline.make_pipeline(StandardScaler(),
                                      model_selection.GridSearchCV(model,
                                                                   parameter_grid,
                                                                   cv = walidacja,
                                                                   verbose = 5,
                                                                   scoring = 'accuracy',
                                                                   n_jobs = -1))

# Training a ML_model:
konfiguracja.fit(X_train_std, y_train)

konfig_grid = konfiguracja.named_steps['gridsearchcv']

# Display best params:
konfig_grid.best_params_

Fitting 10 folds for each of 360 candidates, totalling 3600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  44 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 1660 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done 3600 out of 3600 | elapsed:   25.5s finished


{'alpha': 0.001,
 'fit_intercept': True,
 'loss': 'log',
 'max_iter': 10000,
 'penalty': 'l1'}

In [41]:
# Prediction using this model with best params:
best_model = konfig_grid.best_estimator_

y_pred = best_model.predict(X_test_std)

acc = accuracy_score(y_test, y_pred)
print(f'Accuracy score for this prediction: {round(acc, 4)}')

Accuracy score for this prediction: 0.7857


RidgeClassifier()

In [42]:
# Optimalization of RidgeClassifier() model:
model = RidgeClassifier()

# Parameter grid:
parameter_grid = [{'normalize': [True, False],
                   'copy_X': [True, False],
                   'alpha': [0.0001, 0.001, 0.01],
                   'fit_intercept': [True, False],
                   'max_iter': [10, 100, 1000, 10000],
                   'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']}]

# validation of model:
walidacja = model_selection.KFold(n_splits = 10, shuffle = True, random_state = 2000)

# Testing of parameter grid:
konfiguracja = pipeline.make_pipeline(StandardScaler(),
                                      model_selection.GridSearchCV(model,
                                                                   parameter_grid,
                                                                   cv = walidacja,
                                                                   verbose = 5,
                                                                   scoring = 'accuracy',
                                                                   n_jobs = -1))

# Training a ML_model:
konfiguracja.fit(X_train_std, y_train)

konfig_grid = konfiguracja.named_steps['gridsearchcv']

# Display best params:
konfig_grid.best_params_

Fitting 10 folds for each of 672 candidates, totalling 6720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  44 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 1660 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 4540 tasks      | elapsed:   15.0s
[Parallel(n_jobs=-1)]: Done 6720 out of 6720 | elapsed:   22.0s finished

The max_iter was reached which means the coef_ did not converge



{'alpha': 0.0001,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': 10,
 'normalize': False,
 'solver': 'sag'}

In [43]:
# Prediction using this model with best params:
best_model = konfig_grid.best_estimator_

y_pred = best_model.predict(X_test_std)

acc = accuracy_score(y_test, y_pred)
print(f'Accuracy score for this prediction: {round(acc, 4)}')

Accuracy score for this prediction: 0.7857


# 5. Prediction, using selected model:

In [44]:
# Data splited:
X_train = train_data.drop(columns = 'Survived')
y_train = train_data['Survived'].values
X_pred = test_data

# Shapes of sets:
print('X_train: ', X_train.shape, ', X_predict: ', X_pred.shape, '\n', 'y_train: ', y_train.shape)

X_train:  (695, 10) , X_predict:  (418, 10) 
 y_train:  (695,)


In [45]:
# Standarization od train and test datasets:
scaler = StandardScaler()

X_train_std = scaler.fit_transform(X_train)
X_pred_std = scaler.fit_transform(X_pred)

# Prediction:
model = SGDClassifier(alpha = 0.001, fit_intercept = True, loss = 'log', max_iter = 1000, penalty = 'l1')

model.fit(X_train_std, y_train)

y_predicted = model.predict(X_pred_std)

print('Validated model: ', model)
y_predicted

Validated model:  SGDClassifier(alpha=0.001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=1000,
              n_iter_no_change=5, n_jobs=None, penalty='l1', power_t=0.5,
              random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)


array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,

In [46]:
test_data['Survived'] = y_predicted
test_data

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex: female,Sex: male,Embarked: C,Embarked: Q,Embarked: S,Survived
0,3,34.50000,0,0,7.8292,0,1,0,1,0,0
1,3,47.00000,1,0,7.0000,1,0,0,0,1,0
2,2,62.00000,0,0,9.6875,0,1,0,1,0,0
3,3,27.00000,0,0,8.6625,0,1,0,0,1,0
4,3,22.00000,1,1,12.2875,1,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...
413,3,30.27259,0,0,8.0500,0,1,0,0,1,0
414,1,39.00000,0,0,108.9000,1,0,1,0,0,1
415,3,38.50000,0,0,7.2500,0,1,0,0,1,0
416,3,30.27259,0,0,8.0500,0,1,0,0,1,0


In [47]:
# Relationship between variables:
kolumny = pd.Index(test_data.columns).tolist()
plot = px.scatter_matrix(test_data, dimensions = kolumny[1:-1], color = 'Survived')
plot.update_layout(autosize = False, width = 1000, height = 1000)
plot.show()

In [48]:
# Plot: survived(age):
plot = px.scatter(test_data, x = 'Age', color = 'Survived')
plot.show()

In [49]:
answear = test_data.to_csv('Titanic_survivors_testdata.csv')