In [1]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import pandas_profiling

In [2]:
# Load the training data. index_col=0 uses the first CSV column as the row index
# (PassengerId, per the displayed frame) instead of keeping it as a data column.
df = pd.read_csv('train.csv', index_col=0)
# Bare last expression: show the frame as the cell output.
df

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [3]:
#pandas_profiling.ProfileReport(df)

In [4]:
# Pairwise plots of the columns of df, coloured by the Survived label.
sns.pairplot(df, hue='Survived')

# Open questions: why "4d", and what do the panels with the same variable on both axes mean?
# NOTE(review): in seaborn's pairplot the diagonal panels (same variable on x and y) show the
# distribution of that single variable rather than a scatter plot — confirm in the seaborn docs.

<seaborn.axisgrid.PairGrid at 0x7ffdc8c66520>

In [5]:
# Split into features X and target y.
# Select the target by label instead of by position: df.iloc[:, 1:] only works
# while 'Survived' happens to be the first column (as it is in this frame),
# whereas dropping it by name yields the same X and keeps working if the
# column order ever changes.
X = df.drop(columns='Survived')
y = df['Survived']

# (Answer to "why iloc[:, 1:]?": it skipped column 0, which is the target 'Survived'.)

In [6]:
# Missing-value check: compare each column's non-null count with the total number of entries.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 115.8+ KB


In [7]:
# Summary of the Embarked column: non-null count, number of unique values, most frequent value.
df['Embarked'].describe()

count     889
unique      3
top         S
freq      644
Name: Embarked, dtype: object

In [8]:
# Summary of the Cabin column: non-null count, number of unique values, most frequent value.
df['Cabin'].describe()

count         204
unique        147
top       B96 B98
freq            4
Name: Cabin, dtype: object

In [9]:
# Feature engineering for numeric columns: the columns listed here are treated as numeric features.
numeric_features = ['Age', 'Fare']

In [10]:
# Numeric preprocessing as a sequential pipeline (each step's output feeds the next):
#   1. fill missing values using the "mean" strategy,
#   2. rescale with StandardScaler.
mean_imputer = SimpleImputer(strategy="mean")
standardizer = StandardScaler()
numeric_transformer = make_pipeline(mean_imputer, standardizer)

# TODO: deep-dive into how the pipeline passes data between its steps.

In [11]:
# Feature engineering for categorical columns: the columns listed here are treated as categories.
# Pclass is stored as an integer (see df.info()) but is handled as a category here.
categorical_features = ["Embarked", "Sex", "Pclass"]

In [12]:
# handle_unknown="ignore": a category met at transform time that was not seen during fit
# is encoded as all zeros for that feature instead of raising an error.
# NOTE(review): this option is about unseen categories, not missing values. Embarked has
# 2 missing entries (889 non-null of 891); how NaN is encoded depends on the scikit-learn
# version — confirm, or impute before encoding.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# If the training data contains every possible category, it is no problem when the test data lacks some of them: the column is still created and simply contains only 0s.
# If the training data lacks a category that appears in the test data, that value is ignored and no column is created for it, because the columns are fixed by the training data.

# pd.get_dummies only creates columns for categories that exist in the test data (even if the training data has other categories).
# Not sure what would happen if the training data has more columns than the test data after pd.get_dummies, or vice versa.

In [13]:
# you can also create custom functions
def name_length(df):
    """Return the string length of each value in the first column of ``df``.

    Written so it can be wrapped in ``FunctionTransformer`` and applied to a
    single text column (e.g. ``['Name']``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column holds strings.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_rows, 1)`` with one length per input row.
    """
    # Select the first column by position: df[df.columns[0]] returns a
    # DataFrame (which has no .str accessor) when that label is duplicated.
    length = df.iloc[:, 0].str.len()
    # Reshape to a single-column 2-D array.
    return length.values.reshape(-1, 1)

In [14]:
# Pre-processing: route each group of columns to its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        # Each entry is (step name, transformer, columns it is applied to).
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
        # Disabled entry: would apply name_length to the 'Name' column.
        #('name', FunctionTransformer(name_length), ['Name'])
    ],
    remainder='drop') 

# remainder can be 'drop' (the default) or 'passthrough'.
# Question: why is name_length passed to FunctionTransformer(), and how does it work?
# NOTE(review): FunctionTransformer wraps a plain function so it can be used as a transformer step;
# here it would call name_length on the selected ['Name'] column — confirm in the scikit-learn docs.

In [15]:
# Full model pipeline: the preprocessing step followed by a logistic-regression classifier.
classifier = LogisticRegression(max_iter=300)
pipeline = make_pipeline(preprocessor, classifier)

In [16]:
# Split the data into train and test sets; test_size=0.4 holds out 40% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state= 42)

# random_state=42 fixes the random seed, so every time the code is run (by anyone)
# it produces exactly the same split between train data and test data, which helps when debugging.
# It could be any number; the same number always gives the same split.

In [17]:
# Fit the whole pipeline (preprocessing followed by the classifier) on the training data only.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer()),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  ['Age', 'Fare']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Embarked', 'Sex',
                                                   'Pclass'])])),
                ('logisticregression', LogisticRegression(max_iter=300))])

In [18]:
# Calculate the accuracy score on the held-out test data and print it with 3 decimal places.
print("model score: %.3f" % pipeline.score(X_test, y_test))

# %-formatting example: print('%.3f' % 3.14159) prints 3.142

model score: 0.787


In [19]:
# Get the predicted class labels for the test set from the pipeline (printed as 0s and 1s).
print(pipeline.predict(X_test))

[0 0 0 1 1 1 1 0 1 1 0 0 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 1 1 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 1 0 1 1 1 0 1 1 0 0 1 0 0 0 1 1 1 1 1
 0 0 1 1 1 0 0 1 1 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1
 0 1 0 0 0 0 0 1 0 0 1 1 0 0 0 1 1 1 0 1 0 0 1 0 1 1 0 0 1 0 1 0 0 1 1 0 0
 1 0 0 0 0 1 0 0 0 1 1 1 0 0 0 1 0 0 0 1 0 0 1 1 0 1 0 0 0 1 1 0 0 0 0 1 1
 0 0 0 0 1 0 0 0 0 1 1 1 0 1 1 0 0 1 0 0 1 0 0 0 0 1 0 1 0 1 0 1 0 1 0 0 1
 0 0 0 1 0 1 1 1 0 1 0 1 0 1 1 1 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 1 1 0 0
 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 1 1 0 1 0 0 0 0 1 1 0 1 1 0 0 0 0
 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0]


In [20]:
# Get prediction probabilities from the pipeline; [:, 1] keeps only the second column of the result.
# NOTE(review): presumably that column is the probability of Survived == 1 — confirm via pipeline.classes_.
print(pipeline.predict_proba(X_test)[:, 1])

[0.15272219 0.25514263 0.08916494 0.88642452 0.75945853 0.86780248
 0.66954122 0.09822824 0.73151455 0.87499229 0.28131519 0.0542283
 0.50727335 0.14888294 0.23532435 0.88449127 0.25915029 0.66957186
 0.27332686 0.23815041 0.08217803 0.28547248 0.55665993 0.08934971
 0.07450952 0.1124484  0.43917038 0.25597423 0.12650876 0.49001281
 0.09098827 0.5400496  0.49990133 0.49944221 0.0928721  0.15924499
 0.31690293 0.66954122 0.92508265 0.07371716 0.32338893 0.07404931
 0.07373557 0.13904954 0.61398727 0.10079369 0.09102624 0.08405432
 0.08206655 0.37841083 0.79647527 0.79043405 0.06094507 0.43998151
 0.06788883 0.88709915 0.25092826 0.92027746 0.8001314  0.71822083
 0.08734965 0.86704046 0.82519396 0.30716198 0.13904954 0.706451
 0.32338893 0.07414878 0.18946153 0.89144567 0.83153378 0.96028948
 0.51789787 0.917133   0.08386998 0.06493047 0.66351655 0.93844106
 0.7868825  0.43996051 0.10163869 0.81795341 0.92211861 0.13904692
 0.35523874 0.28952983 0.93308383 0.92816473 0.25863257 0.0736400