#### Titanic adventure

In [65]:
# get data

import pandas as pd
import numpy as np

# visualizations
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns


%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
plt.rcParams['figure.figsize'] = 12,8

# sklearn
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline


In [102]:
train = pd.read_csv('titanic data/train.csv')
test = pd.read_csv('titanic data/test.csv')

train = train.drop(['PassengerId', 'Ticket', 'Name'], axis=1)
test = test.drop(['PassengerId', 'Ticket', 'Name'], axis=1)

In [72]:
train

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.2500,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.9250,,S
3,1,1,female,35.0,1,0,53.1000,C123,S
4,0,3,male,35.0,0,0,8.0500,,S
...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,,S
887,1,1,female,19.0,0,0,30.0000,B42,S
888,0,3,female,,1,2,23.4500,,S
889,1,1,male,26.0,0,0,30.0000,C148,C


In [21]:
train.describe()
print(train.columns)

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin',
       'Embarked'],
      dtype='object')


#### checking NA, over 70% in train for cabin is NA - remove

In [22]:
train.isna().sum()
print(train.shape)

(891, 9)


In [103]:
if 'Cabin' in train.columns:
    train = train.drop(['Cabin'], axis=1)

if 'Cabin' in test.columns:
    test = test.drop(['Cabin'], axis=1)

#### before imputer: label encoding

###### impute data for all missing column
* instead of doing this separately for each columns - single.impute

label encoder (ordinar?) vs onehotencoder
* first keeps one column, just 1/2/3/4/5..
* second creates n dummy columns

depending on situation dummy might be better.
Example:
* when we have bad/medium/string/superstrong etc - it most case there is no differecnce betwwen best and worse vs medium and worse - just different labels. But for ordinal encoder program might think that it is better to use 3/4/5 ... - > higher grades

---
so in some cases/ most? one hot encoder -> full encoder

#### train['Sex'] != train[['Sex']] !!


In [24]:

# for it ofc we need categorical staff
z = OneHotEncoder()
encoder_result = z.fit_transform(train[['Sex']])
encoder_result.toarray()
print(z.categories_)

[array(['female', 'male'], dtype=object)]


In [25]:
## numerical pipeline - imputer

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median'))
])

num_pipeline.fit_transform(train[['Age']])[0] 

array([22.])

In [26]:
# categorical encoder - onehotencoding

cat_pipeline = Pipeline([
    ('encoder', OneHotEncoder())
])

cat_pipeline.fit_transform(train[['Sex']]).toarray()[0]

array([0., 1.])

In [104]:
train["Embarked"] = train["Embarked"].fillna("C")
test["Fare"] = test["Fare"].fillna(test["Fare"].median())
test["Fare"] 

0        7.8292
1        7.0000
2        9.6875
3        8.6625
4       12.2875
         ...   
413      8.0500
414    108.9000
415      7.2500
416      8.0500
417     22.3583
Name: Fare, Length: 418, dtype: float64

In [82]:
train.isna().sum()
test.isna().sum()

Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         0
Embarked     0
dtype: int64

In [108]:
## for train

numerical_columns = train.select_dtypes(include='number').columns
categorical_columns = list(set(train.columns) - set(numerical_columns))
print(numerical_columns)
print(categorical_columns)

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, numerical_columns),
    ("cat", cat_pipeline, categorical_columns)
])

train_cleaned = full_pipeline.fit_transform(train)

pipeline_for_names = Pipeline([
    ('full', full_pipeline)
])
cat_column = pipeline_for_names.named_steps['full'].transformers_[1][1].named_steps['encoder'].get_feature_names(categorical_columns)
train_df = pd.DataFrame(train_cleaned, columns=(list(numerical_columns)+list(cat_column)))

Index(['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')
['Embarked', 'Sex']


In [109]:
## for test


numerical_columns = test.select_dtypes(include='number').columns
categorical_columns = list(set(test.columns) - set(numerical_columns))
print(numerical_columns)
print(categorical_columns)

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, numerical_columns),
    ("cat", cat_pipeline, categorical_columns)
])

test_cleaned = full_pipeline.fit_transform(test)

pipeline_for_names = Pipeline([
    ('full', full_pipeline)
])
cat_column = pipeline_for_names.named_steps['full'].transformers_[1][1].named_steps['encoder'].get_feature_names(categorical_columns)
test_df = pd.DataFrame(test_cleaned, columns=(list(numerical_columns)+list(cat_column)))

Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')
['Embarked', 'Sex']


In [111]:
test_df

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male
0,3.0,34.5,0.0,0.0,7.8292,0.0,1.0,0.0,0.0,1.0
1,3.0,47.0,1.0,0.0,7.0000,0.0,0.0,1.0,1.0,0.0
2,2.0,62.0,0.0,0.0,9.6875,0.0,1.0,0.0,0.0,1.0
3,3.0,27.0,0.0,0.0,8.6625,0.0,0.0,1.0,0.0,1.0
4,3.0,22.0,1.0,1.0,12.2875,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
413,3.0,27.0,0.0,0.0,8.0500,0.0,0.0,1.0,0.0,1.0
414,1.0,39.0,0.0,0.0,108.9000,1.0,0.0,0.0,1.0,0.0
415,3.0,38.5,0.0,0.0,7.2500,0.0,0.0,1.0,0.0,1.0
416,3.0,27.0,0.0,0.0,8.0500,0.0,0.0,1.0,0.0,1.0


In [73]:
train_df

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male
0,0.0,3.0,22.0,1.0,0.0,7.2500,0.0,0.0,1.0,0.0,1.0
1,1.0,1.0,38.0,1.0,0.0,71.2833,1.0,0.0,0.0,1.0,0.0
2,1.0,3.0,26.0,0.0,0.0,7.9250,0.0,0.0,1.0,1.0,0.0
3,1.0,1.0,35.0,1.0,0.0,53.1000,0.0,0.0,1.0,1.0,0.0
4,0.0,3.0,35.0,0.0,0.0,8.0500,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
886,0.0,2.0,27.0,0.0,0.0,13.0000,0.0,0.0,1.0,0.0,1.0
887,1.0,1.0,19.0,0.0,0.0,30.0000,0.0,0.0,1.0,1.0,0.0
888,0.0,3.0,28.0,1.0,2.0,23.4500,0.0,0.0,1.0,1.0,0.0
889,1.0,1.0,26.0,0.0,0.0,30.0000,1.0,0.0,0.0,0.0,1.0


## TASK:
make pipeline with models
make predictions