In [307]:
# Wrangle and clean data
import pandas as pd
import numpy as np
import random as rdn

# Visualize Data
import seaborn as sns
import matplotlib as plt

# Model, predict, and solve
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [308]:
titanic_train = pd.read_csv('titanic dataset/train.csv')
titanic_test = pd.read_csv('titanic dataset/test.csv')

In [309]:
train_y = titanic_train.Survived
train_X = titanic_train.loc[:, titanic_train.columns != 'Survived']
X = pd.concat([train_X, titanic_test], axis=0)
X.shape

(1309, 11)

In [310]:
X.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [311]:
# todo: drop passengerId, Ticket and Cabin 
X = X.drop(['PassengerId', 'Ticket', 'Cabin'], axis=1)
X.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S


In [312]:
# Convert Sex to binary digits
X.Sex = X.Sex.map({'male': 1, 'female': 0})
X.Sex.describe()

count    1309.000000
mean        0.644003
std         0.478997
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         1.000000
Name: Sex, dtype: float64

In [313]:
# X.Age.describe(), X.Age.isnull()

In [314]:
# todo: fill out nulls in Age by the median of Pclass-Sex strata plus randomness
Age_strata_avg = X.groupby(['Sex', 'Pclass']).Age.median()
Age_std = X.Age.std()
Age_strata_avg, Age_std

(Sex  Pclass
 0    1         36.0
      2         28.0
      3         22.0
 1    1         42.0
      2         29.5
      3         25.0
 Name: Age, dtype: float64,
 14.413493211271321)

In [315]:
for i in list(range(0,2)):
    for j in list(range(1,4)):
        X.Age.loc[(X.Sex == i) & (X.Pclass == j) & (X.Age.isnull())] = Age_strata_avg[(i, j)] + rdn.gauss(0, 1)

X.Age = round(X.Age, 0)
X.Age.describe()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.Age.loc[(X.Sex == i) & (X.Pclass == j) & (X.Age.isnull())] = Age_strata_avg[(i, j)] + rdn.gauss(0, 1)


count    1309.000000
mean       29.254393
std        13.212342
min         0.000000
25%        22.000000
50%        26.000000
75%        36.000000
max        80.000000
Name: Age, dtype: float64

In [316]:
def AgeBand(Age):
    if Age <= 14:
        return 0
    elif Age <= 24:
        return 1
    elif Age <= 64:
        return 2
    else:
        return 3

In [317]:
# todo: set age bands
X['AgeBand'] = X.Age.map(AgeBand)
X.AgeBand

0      1
1      2
2      2
3      2
4      2
      ..
413    2
414    2
415    2
416    2
417    2
Name: AgeBand, Length: 1309, dtype: int64

In [318]:
# todo: extract titles from Name, encode them by ordinal numbers
X['Title'] = X.Name.str.extract('(\w+\.)')
X.groupby('Title').Title.count().sort_values(ascending = False)

Title
Mr.          757
Miss.        260
Mrs.         197
Master.       61
Rev.           8
Dr.            8
Col.           4
Mlle.          2
Ms.            2
Major.         2
Mme.           1
Capt.          1
Lady.          1
Jonkheer.      1
Dona.          1
Don.           1
Countess.      1
Sir.           1
Name: Title, dtype: int64

In [319]:
Minor_titles = ['Dr.', 'Rev.', 'Major.', 'Col.', 'Capt.', 'Lady.', 'Jonkheer.', 'Don.',\
    'Countess.', 'Sir.', 'Dona.']
X.Title.replace(to_replace = Minor_titles, value = 'Minority', inplace = True)
X.Title.replace(to_replace = {'Mlle.': 'Miss.', 'Mme.': 'Miss.', 'Ms.': 'Mrs.'},\
     inplace = True)
X.groupby('Title').Title.count().sort_values(ascending = False)
# X.Title.isnull().sum()

Title
Mr.         757
Miss.       263
Mrs.        199
Master.      61
Minority     29
Name: Title, dtype: int64

In [320]:
X.Title = X.Title.map({'Mr.':0, 'Miss.': 1, 'Mrs.': 2, 'Master.': 3, 'Minority': 4})
X.groupby('Title').Title.count().sort_values(ascending = False)

Title
0    757
1    263
2    199
3     61
4     29
Name: Title, dtype: int64

In [321]:
# todo: combine SibSp and Parch in to Fsize, encode by numbers to indicate alone, small or large family size
X['Fsize'] = X.SibSp + X.Parch + 1
X['Family_band'] = X.Fsize.map(lambda x: 0 if x == 1 else (1 if x <= 5 else 2))
X.Family_band.describe()

count    1309.000000
mean        0.442322
std         0.581898
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         2.000000
Name: Family_band, dtype: float64

In [322]:
# todo: fill out missing values in Fare, set fare bands
null_fare_indx = np.where(X.Fare.isnull())
null_fare_indx, X.Fare.isnull().sum()

((array([1043], dtype=int64),), 1)

In [323]:
# X.loc[1403,'Fare'] = 8.05, 
X.shape

(1309, 12)

In [324]:
X.loc[1043,'Fare'] = 8.05
X.loc[1043,'Fare'], X.Fare.isnull().loc[1043]

(8.05, False)

In [325]:
Qs = X.Fare.quantile(q = [0, 0.25, 0.5, 0.75, 1], interpolation='lower')
FareRange = pd.cut(X['Fare'], bins=Qs, include_lowest = True)
FareRangeDistinct = FareRange.unique()
UniqueFareIntervals = {FareRangeDistinct[0]:0, FareRangeDistinct[2]:1, FareRangeDistinct[3]:2, FareRangeDistinct[1]:3}
FareRange.unique()

[(-0.001, 7.896], (31.275, 512.329], (7.896, 14.454], (14.454, 31.275], NaN]
Categories (4, interval[float64, right]): [(-0.001, 7.896] < (7.896, 14.454] < (14.454, 31.275] < (31.275, 512.329]]

In [326]:
X['FareScale'] = FareRange.map(UniqueFareIntervals)
X.loc[X.Fare.isnull(), 'Farescale'] = 2
X.FareScale.describe()

count     1309
unique       4
top          0
freq       337
Name: FareScale, dtype: int64

In [327]:
# todo: encode embarked by ordinal numbers
X.Embarked = X.Embarked.map({'C':0, 'Q':1, 'S':2})

In [328]:
# Drop unnecessary columns
X.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,AgeBand,Title,Fsize,Family_band,FareScale,Farescale
0,3.0,"Braund, Mr. Owen Harris",1.0,22.0,1.0,0.0,7.25,2.0,1.0,0.0,2.0,1.0,0,
1,1.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0.0,38.0,1.0,0.0,71.2833,0.0,2.0,2.0,2.0,1.0,3,
2,3.0,"Heikkinen, Miss. Laina",0.0,26.0,0.0,0.0,7.925,2.0,2.0,1.0,1.0,0.0,1,
3,1.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0.0,35.0,1.0,0.0,53.1,2.0,2.0,2.0,2.0,1.0,3,
4,3.0,"Allen, Mr. William Henry",1.0,35.0,0.0,0.0,8.05,2.0,2.0,0.0,1.0,0.0,1,


In [329]:
X = X.drop(['Name', 'Age', 'Parch', 'SibSp', 'Fare', 'Farescale'], axis = 1)

Prediction

In [338]:
X.describe()

Unnamed: 0,Pclass,Sex,Embarked,AgeBand,Title,Fsize,Family_band
count,1309.0,1309.0,1310.0,1309.0,1309.0,1309.0,1309.0
mean,2.294882,0.644003,1.489313,1.565317,0.733384,1.883881,0.442322
std,0.837836,0.478997,0.816816,0.657621,1.02266,1.583639,0.581898
min,1.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,2.0,0.0,1.0,1.0,0.0,1.0,0.0
50%,3.0,1.0,2.0,2.0,0.0,1.0,0.0
75%,3.0,1.0,2.0,2.0,1.0,2.0,1.0
max,3.0,1.0,2.0,3.0,4.0,11.0,2.0


In [343]:
X.Embarked[X.Embarked.isnull()] = 0
X = X.drop(1309, axis = 'index')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.Embarked[X.Embarked.isnull()] = 0


KeyError: '[1309] not found in axis'

In [344]:
train_X = X.iloc[:891, ]
val_X  = X.iloc[891:, ]
LR = LogisticRegression()
LR.fit(train_X, train_y)
prediction = LR.predict(val_X)

ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [345]:
X.iloc[:891, ].index

Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
            ...
            881, 882, 883, 884, 885, 886, 887, 888, 889, 890],
           dtype='int64', length=891)

1308