In [32]:
import numpy as np 
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import KBinsDiscretizer

#1 Load the datasets and creat to dfs TRAIN_data and TEST_data

In [2]:
train = pd.read_csv('/content/train.csv')
train.head(5)
del train['Ticket']

In [3]:
test = pd.read_csv('/content/test.csv')
test.head(5)
del test['Ticket']

In [13]:
train.info() #explore the dataset in order to identify missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Fare         891 non-null    float64
 9   Cabin        204 non-null    object 
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 76.7+ KB


In [12]:
test.info() #explore the dataset in order to identify missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Fare         417 non-null    float64
 8   Cabin        91 non-null     object 
 9   Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(4)
memory usage: 32.8+ KB


#1.1 create a new dataframe and extract the passenger ids from the test data

In [5]:
test_data_passengerids = test['PassengerId']

#2 Imputation of missing values in the 'age' and 'embarked' of both datasets

In [16]:
imputer = SimpleImputer(strategy='most_frequent') # for the Column embarked the heatmap showed a missing value, 
#hence the missing value is added on the case of the most frequent value
imputer.fit(train[['Embarked']])        # learn the most frequent value
imputer.transform(train[['Embarked']]) # transform the column
train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,30.0708,,C


In [18]:
#the values of the embarked column of the test data set is complete thus one can directly continue with the numerical age column

In [45]:
train_age = train.fillna(train.mean().round(2)) #Moreover the exploration phase showed, that in the age columen different values are missing
#in order to not alter the data to much, the mean age is calculated and attributed to the missing values
train_age.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,,S
5,6,0,3,"Moran, Mr. James",male,29.7,0,0,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,30.0708,,C


In [47]:
test_age = test.fillna(test.mean().round(3)) #Moreover the exploration phase showed, that in the age columen different values are missing
#in order to not alter the data to much, the mean age is calculated and attributed to the missing values
test_age.head(10)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,12.2875,,S
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,9.225,,S
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,7.6292,,Q
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,29.0,,S
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,7.2292,,C
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,24.15,,S


#3 Fetch the data columns from the two datasets and create the X,y sets to apply them in the models 

In [22]:
X_train = train_age[['Pclass','Sex','Age','Embarked']] #fetch the numeric data from the df and create a new df as X
y_train = train_age['Survived']

In [25]:
X_pred = test_age[['Pclass','Sex','Age','Embarked']] #fetch the numeric data from the df and create a new df as X

#4 Apply functions to engineer the data - such as SimpleImputer for numerical values and OneHotEncoder for categorical valaues

In [35]:
numeric_features = ["Age"] #numerical column needs to be scaled in order to be applied with the models

In [28]:
numeric_transformer = make_pipeline( #applying the pipeline method makes life easier, yet I need to have a closer look at the column transfer
    SimpleImputer(strategy="mean"), 
    StandardScaler()
    )

In [29]:
categorical_features = ["Embarked", "Sex", "Pclass"]

In [30]:
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

#5 Setup of the preprocessor for the pipeline

In [36]:
preprocessor = ColumnTransformer(
    transformers =[
    ('numerical_features', numeric_transformer, numeric_features),
    ('categorical_features',categorical_transformer, categorical_features),
    ],
    remainder='passthrough')
 

In [37]:
pipeline = make_pipeline(preprocessor, LogisticRegression(max_iter=300))

In [38]:
pipeline_rfm = make_pipeline(preprocessor, RandomForestClassifier(n_estimators=40, max_depth=2))

#5 Split of the train Dataset in order to fit the models

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train,  test_size= 0.25)
X_train, X_test.info, y_train.shape, y_test.shape

(     Pclass     Sex   Age Embarked
 430       1    male  28.0        S
 779       1  female  43.0        S
 7         3    male   2.0        S
 58        2  female   5.0        S
 637       2    male  31.0        S
 ..      ...     ...   ...      ...
 160       3    male  44.0        S
 774       2  female  54.0        S
 218       1  female  32.0        C
 455       3    male  29.0        C
 671       1    male  31.0        S
 
 [668 rows x 4 columns],
 <bound method DataFrame.info of      Pclass     Sex        Age Embarked
 210       3    male  24.000000        S
 112       3    male  22.000000        S
 504       1  female  16.000000        S
 71        3  female  16.000000        S
 773       3    male  29.699118        C
 ..      ...     ...        ...      ...
 349       3    male  42.000000        S
 289       3  female  22.000000        Q
 565       3    male  24.000000        S
 692       3    male  29.699118        S
 562       2    male  28.000000        S
 
 [223 rows x 4 

#6 Fitting of the models

In [None]:
pipeline.fit(X_train, y_train)

In [48]:
pipeline_rfm.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('numerical_features',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer()),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  ['Age']),
                                                 ('categorical_features',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Embarked', 'Sex',
                                                   'Pclass'])])),
                ('randomforestclassifier',
                 RandomForestClassifier(max_depth=2, n_estimators=40))])

# 7 make with the help of the pipeline first predictions on the test data of the train_dataset

In [49]:
print(pipeline_rfm.predict_proba(X_test)[:, 1])

[0.15938613 0.15938613 0.80769852 0.54695208 0.19693716 0.22116368
 0.15938613 0.22116368 0.15938613 0.74373573 0.74373573 0.31017266
 0.81450656 0.32777942 0.77818961 0.32777942 0.15938613 0.32777942
 0.15938613 0.15938613 0.22116368 0.30239677 0.22116368 0.31017266
 0.15938613 0.19693716 0.81450656 0.31017266 0.56527687 0.15938613
 0.31017266 0.77818961 0.31017266 0.15938613 0.22116368 0.22116368
 0.80769852 0.15938613 0.15938613 0.31017266 0.19693716 0.32777942
 0.74373573 0.54695208 0.15938613 0.83904643 0.54695208 0.22116368
 0.77818961 0.15938613 0.15938613 0.15938613 0.15938613 0.15938613
 0.30239677 0.5433881  0.56527687 0.22116368 0.56527687 0.54695208
 0.31017266 0.74373573 0.59824425 0.54695208 0.80769852 0.15938613
 0.54695208 0.80769852 0.15938613 0.80769852 0.16223466 0.19693716
 0.54695208 0.15938613 0.31017266 0.74373573 0.15938613 0.19693716
 0.30239677 0.15938613 0.24061922 0.84585446 0.15938613 0.15938613
 0.38307342 0.31017266 0.22116368 0.5433881  0.31017266 0.8390

In [50]:
print("model score: %.3f" % pipeline_rfm.score(X_test, y_test))

model score: 0.807


In [None]:
#what does this score mean?

In [52]:
pipeline_rfm.score(X_train,y_train) #since the score of the train dataset is lower than on the test-dataset the model sees to perform quite good?

0.7799401197604791

In [53]:
ypred = pipeline_rfm.predict(X_test) 
accuracy_score(y_test,ypred)

0.8071748878923767

In [54]:
ypred

array([0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       0, 0, 0])

# 8 Now it's time to test the model on the real test-dataset

In [55]:
ypred = pipeline_rfm.predict(X_pred)

In [56]:
ypred

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

# 9 creat a dataset for submission of the models predictions on kaggle.com


In [57]:
# creat a dataframe with two columns in the shape as it was defined by kaggle.com
submission = pd.DataFrame(columns=['PassengerId'], data=test_data_passengerids)
submission = pd.concat([submission, pd.DataFrame(ypred, columns=['Survived'])], axis=1)
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [58]:
submission.to_csv("kaggle_submission.csv", index=False) #export a csv-file