In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats as stats
import statsmodels.formula.api as sfa
import statsmodels.api as sma

# VIF 
from statsmodels.stats.outliers_influence import variance_inflation_factor 

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, LeaveOneOut, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE, RFECV
# Terminal --> pip install mlxtend

#from mlxtend.feature_selection

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the train and test CSVs (paths are relative to the kernel's working directory).
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

* Feature engineering
* Missing values
* one hot encoding

In [3]:
# Stack train and test into one frame so every feature-engineering step is applied
# to both; ignore_index=True gives the result a fresh 0..n-1 index.
combined = pd.concat(objs=(train, test), ignore_index=True)

In [4]:
# Count nulls per column, worst offenders first
null_counts = combined.isna().sum()
null_counts.sort_values(ascending=False)

Cabin          1014
Survived        418
Age             263
Embarked          2
Fare              1
PassengerId       0
Pclass            0
Name              0
Sex               0
SibSp             0
Parch             0
Ticket            0
dtype: int64

In [5]:
# Preview the first two rows of the combined frame.
combined.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [6]:
# List the column names of the combined frame.
combined.columns


Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [7]:
# Inspect the distinct raw Cabin values (NaN appears for passengers with no cabin recorded).
combined['Cabin'].unique()

array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64',

In [8]:
# Hand-copied list of distinct non-null Cabin strings; it appears to be pasted from
# the `combined['Cabin'].unique()` output. NOTE(review): a static copy like this goes
# stale if the input files change - confirm it is still needed.
cabins = ['C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64', 'E24', 'C90', 'C45', 'E8', 'B101', 'D45', 'C46', 'D30',
       'E121', 'D11', 'E77', 'F38', 'B3', 'D6', 'B82 B84', 'D17', 'A36',
       'B102', 'B69', 'E49', 'C47', 'D28', 'E17', 'A24', 'C50', 'B42',
       'C148', 'B45', 'B36', 'A21', 'D34', 'A9', 'C31', 'B61', 'C53',
       'D43', 'C130', 'C132', 'C55 C57', 'C116', 'F', 'A29', 'C6', 'C28',
       'C51', 'C97', 'D22', 'B10', 'E45', 'E52', 'A11', 'B11', 'C80',
       'C89', 'F E46', 'B26', 'F E57', 'A18', 'E60', 'E39 E41',
       'B52 B54 B56', 'C39', 'B24', 'D40', 'D38', 'C105']

In [9]:
def cabin_labels(x):
    """Label a raw Cabin value as recorded or missing.

    Parameters
    ----------
    x : object
        One entry of the Cabin column (a string, or NaN/None when absent).

    Returns
    -------
    str
        'missing' when x is null or blank, otherwise 'cabin_avail'.
    """
    # Test for absence directly rather than looking x up in a hand-pasted list
    # of known cabins: with the list, any cabin string that was not pasted in
    # gets silently labelled 'missing'.
    if pd.isna(x) or not str(x).strip():
        return('missing')
    return('cabin_avail')

In [10]:
# Re-check the column names of the combined frame.
combined.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [11]:
# Derive a two-level flag from the Cabin column: recorded vs. missing
cabin_flags = combined['Cabin'].map(cabin_labels)
combined['cabin_cat'] = cabin_flags

In [12]:
# Survival counts split by whether a cabin was recorded
pd.crosstab(index=combined['Survived'], columns=combined['cabin_cat'])

cabin_cat,cabin_avail,missing
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,68,481
1.0,136,206


## Name

In [13]:
# Try the title extraction on one row: "Surname, Title. Given names" -> "Title"
sample_name = combined['Name'][899]
sample_name.split(', ')[1].split('. ')[0]

'Mrs'

In [14]:
# Same title-extraction check on row 899 as in the previous cell.
combined.Name[899].split(', ')[1].split('. ')[0]

'Mrs'

In [15]:
# Pull the honorific out of every name: the text between ", " and the first ". "
titless = [
    full_name.split(', ')[1].split('. ')[0]
    for full_name in combined.Name
]

In [16]:
# pd.Series(titless) gets a default 0..n-1 index and the assignment aligns on index,
# so this relies on `combined` having that same default index (ignore_index=True at concat).
combined['Titles']=pd.Series(titless)

In [17]:
# Distinct titles extracted from the Name column.
combined.Titles.unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'the Countess',
       'Jonkheer', 'Dona'], dtype=object)

In [18]:
# Rare honorifics that get folded into a single bucket.
titles_ignore = [
    'Don', 'Rev', 'Dr', 'Mme', 'Major', 'Lady', 'Sir', 'Mlle',
    'Col', 'Capt', 'the Countess', 'Jonkheer', 'Dona',
]


def notitle(x):
    """Return 'others' for any title listed in titles_ignore, else x unchanged."""
    return 'others' if x in titles_ignore else x

In [19]:
# Fold the rare titles into 'others'
grouped_titles = combined['Titles'].map(notitle)
combined['Titles'] = grouped_titles

In [20]:
# Drop the identifier and free-text columns (cabin_cat and Titles were derived
# from Cabin and Name in the cells above).
combined.drop(columns=['PassengerId', 'Name', 'Cabin', 'Ticket'], inplace=True)

### Family

In [21]:
# Family size: SibSp + Parch, plus 1 (presumably to count the passenger themselves)
combined['Family'] = combined['SibSp'] + combined['Parch'] + 1

In [22]:
def parivar(x):
    """Bucket a family size into 'solo', 'duo', 'small' or 'big'.

    Exactly 1 -> 'solo', exactly 2 -> 'duo', anything else up to 4 -> 'small',
    everything remaining -> 'big'.
    """
    if x == 1:
        return 'solo'
    if x == 2:
        return 'duo'
    return 'small' if x <= 4 else 'big'

In [23]:
# Convert the numeric family size into its size bucket
family_buckets = combined['Family'].map(parivar)
combined['family_cat'] = family_buckets

In [24]:
# Survival counts per family-size bucket
pd.crosstab(index=combined['family_cat'], columns=combined['Survived'])

Survived,0.0,1.0
family_cat,Unnamed: 1_level_1,Unnamed: 2_level_1
big,52,10
duo,72,89
small,51,80
solo,374,163


In [28]:
# Age summary statistics per title (titles as columns)
age_stats = combined.groupby('Titles')['Age'].describe()
age_stats.transpose()

Titles,Master,Miss,Mr,Mrs,Ms,others
count,53.0,210.0,581.0,170.0,1.0,31.0
mean,5.482642,21.774238,32.252151,36.994118,28.0,43.129032
std,4.161554,12.249077,12.422089,12.901767,,12.309189
min,0.33,0.17,11.0,14.0,28.0,23.0
25%,2.0,15.0,23.0,27.0,28.0,32.5
50%,4.0,22.0,29.0,35.5,28.0,45.0
75%,9.0,30.0,39.0,46.5,28.0,52.5
max,14.5,63.0,80.0,76.0,28.0,70.0


### Missing values

In [33]:
# Treating the missing values in column Age:
# first collect the titles that have at least one passenger with no Age
age_is_missing = combined['Age'].isna()
missing_titles = combined.loc[age_is_missing, 'Titles'].unique()

In [41]:
# Fill each missing Age with the median Age of passengers sharing the same title.
# The row mask must include the title: masking on Age.isnull() alone lets the first
# title's median fill every gap, leaving nothing for the remaining titles to fill.
for title in missing_titles:
    has_title = combined.Titles == title
    title_median = combined.loc[has_title, 'Age'].median()
    combined.loc[combined.Age.isnull() & has_title, 'Age'] = title_median

In [51]:
# Treating the missing values in column Embarked: use the most frequent port
most_common_port = combined['Embarked'].mode()[0]
combined['Embarked'] = combined['Embarked'].fillna(most_common_port)

In [56]:
# Treating the missing values in column Fare: use the overall median fare
median_fare = combined['Fare'].median()
combined['Fare'] = combined['Fare'].fillna(median_fare)

In [58]:
# Split the data back into train and test. A positional split matches the original
# label-based one because `combined` has a 0..n-1 index (ignore_index=True at concat).
n_train = len(train)
newtrain = combined.iloc[:n_train]
newtest = combined.iloc[n_train:]

newtrain.shape, newtest.shape

((891, 12), (418, 12))

In [61]:
# Separate predictors from the target for the labelled rows
X = newtrain.drop(columns=['Survived'])
y = newtrain['Survived'].astype(int)

# Drop the target column from the test rows by reassignment, not in place:
# newtest is a slice of `combined`, and an in-place drop on a slice is the
# pattern pandas flags with SettingWithCopyWarning (silenced by the global
# warnings filter). errors='ignore' keeps this cell re-runnable.
newtest = newtest.drop(columns=['Survived'], errors='ignore')

### Model building

* The first model in classification, also known as the base model, should predict 0 for every passenger.

In [68]:
# Baseline: predict 0 for every test passenger and write the submission file
submission = test[['PassengerId']].assign(Survived=0)

submission.to_csv('basemodeltitanic.csv', index=False)

In [69]:
# Show the folder the CSV above was written to. A bare `cd` (IPython automagic for
# %cd with no argument) would instead move the kernel to the home directory, so
# relative paths used by later cells would resolve somewhere else.
os.getcwd()

C:\Users\sidharth nandal


In [71]:
# One-hot encode the categorical columns (drop_first drops one redundant dummy per feature)
newX = pd.get_dummies(X, drop_first=True)

# Encode the test rows the same way, then force the exact training column set and
# order. The two frames are encoded independently, so a category present in only
# one of them would otherwise give the model a differently-shaped matrix.
newtest = pd.get_dummies(newtest, drop_first=True).reindex(columns=newX.columns, fill_value=0)

In [73]:
from sklearn.linear_model import LogisticRegression

# Fit a default logistic regression on the encoded training data,
# then predict the encoded test rows.
lg = LogisticRegression()
lg.fit(newX, y)
pred = lg.predict(newtest)

In [82]:
# Pair each test PassengerId with its predicted label and write the submission file
submission = test[['PassengerId']].copy()
submission['Survived'] = pred

submission.to_csv('logistic_titanic.csv', index=False)

In [83]:
# Show the folder the CSV above was written to, without changing the kernel's
# working directory (a bare `cd` would switch it to the home directory).
os.getcwd()

C:\Users\sidharth nandal


In [66]:
# Preview of the engineered frame.
# NOTE(review): this cell's execution count (66) is lower than that of the modelling
# cells above it, so it was run out of order - move it before the train/test split.
combined.head(2)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,cabin_cat,Titles,Family,family_cat
0,0.0,3,male,22.0,1,0,7.25,S,missing,Mr,2,duo
1,1.0,1,female,38.0,1,0,71.2833,C,cabin_avail,Mrs,2,duo
