In [5]:
import numpy as np 
import pandas as pd 
import os
from matplotlib import pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import category_encoders as ce


In [6]:
# Directory holding the train/test/submission CSVs. Defaults to the original
# location; set the TITANIC_DATA_DIR environment variable to run elsewhere.
DATA_DIR = os.environ.get('TITANIC_DATA_DIR', 'D:/pandas')

df_train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
# The original test path contained a stray double slash ('D:/pandas//test.csv').
df_test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
df_sub = pd.read_csv(os.path.join(DATA_DIR, 'gender_submission.csv'))

In [6]:
# Preview the first five rows of the raw training frame.
df_train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
# Preview the first five rows of the raw test frame.
df_test.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [8]:
# Drop the PassengerId, Name and Ticket columns from the training frame.
# Rebinding (instead of inplace=True) with errors='ignore' keeps this cell
# safe to re-run: a second execution no longer raises KeyError.
df_train = df_train.drop(columns=['PassengerId', 'Name', 'Ticket'], errors='ignore')


In [9]:
# Drop the PassengerId, Name and Ticket columns from the test frame.
# Rebinding (instead of inplace=True) with errors='ignore' keeps this cell
# safe to re-run: a second execution no longer raises KeyError.
df_test = df_test.drop(columns=['PassengerId', 'Name', 'Ticket'], errors='ignore')


In [10]:
# Confirm the column drop on the training frame.
df_train.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,,S


In [11]:
# Confirm the column drop on the test frame.
df_test.head(n=5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,male,34.5,0,0,7.8292,,Q
1,3,female,47.0,1,0,7.0,,S
2,2,male,62.0,0,0,9.6875,,Q
3,3,male,27.0,0,0,8.6625,,S
4,3,female,22.0,1,1,12.2875,,S


In [12]:
# Give missing cabins an explicit 'UNKNOWN' label so they form their own category.
# fillna replaces the original replace(np.NaN, ..., regex=True): the np.NaN alias
# was removed in NumPy 2.0, and regex matching has no meaning for a NaN target.
df_train['Cabin'] = df_train['Cabin'].fillna('UNKNOWN')
df_test['Cabin'] = df_test['Cabin'].fillna('UNKNOWN')

In [13]:
# Sorted, de-duplicated cabin labels that occur in either the train or test frame.
all_cabins = df_train['Cabin'].tolist() + df_test['Cabin'].tolist()
cabin_vals = np.unique(all_cabins)
cabin_vals

array(['A10', 'A11', 'A14', 'A16', 'A18', 'A19', 'A20', 'A21', 'A23',
       'A24', 'A26', 'A29', 'A31', 'A32', 'A34', 'A36', 'A5', 'A6', 'A7',
       'A9', 'B10', 'B101', 'B102', 'B11', 'B18', 'B19', 'B20', 'B22',
       'B24', 'B26', 'B28', 'B3', 'B30', 'B35', 'B36', 'B37', 'B38',
       'B39', 'B4', 'B41', 'B42', 'B45', 'B49', 'B5', 'B50',
       'B51 B53 B55', 'B52 B54 B56', 'B57 B59 B63 B66', 'B58 B60', 'B61',
       'B69', 'B71', 'B73', 'B77', 'B78', 'B79', 'B80', 'B82 B84', 'B86',
       'B94', 'B96 B98', 'C101', 'C103', 'C104', 'C105', 'C106', 'C110',
       'C111', 'C116', 'C118', 'C123', 'C124', 'C125', 'C126', 'C128',
       'C130', 'C132', 'C148', 'C2', 'C22 C26', 'C23 C25 C27', 'C28',
       'C30', 'C31', 'C32', 'C39', 'C45', 'C46', 'C47', 'C49', 'C50',
       'C51', 'C52', 'C53', 'C54', 'C55 C57', 'C6', 'C62 C64', 'C65',
       'C68', 'C7', 'C70', 'C78', 'C80', 'C82', 'C83', 'C85', 'C86',
       'C87', 'C89', 'C90', 'C91', 'C92', 'C93', 'C95', 'C97', 'C99', 'D',
       'D

In [14]:
# Map every cabin label to its position in cabin_vals (0, 1, 2, ...).
mapping_d = dict(zip(cabin_vals, range(len(cabin_vals))))

# Ordinal encoder for the 'Cabin' column that uses the explicit mapping above
# instead of learning one from the data.
cabin_mapping_spec = {'col': 'Cabin', 'mapping': mapping_d}
encoder = ce.OrdinalEncoder(cols=['Cabin'],
                            mapping=[cabin_mapping_spec],
                            return_df=True)

In [15]:
# Display the cabin-label -> integer-code dictionary.
mapping_d

{'A10': 0,
 'A11': 1,
 'A14': 2,
 'A16': 3,
 'A18': 4,
 'A19': 5,
 'A20': 6,
 'A21': 7,
 'A23': 8,
 'A24': 9,
 'A26': 10,
 'A29': 11,
 'A31': 12,
 'A32': 13,
 'A34': 14,
 'A36': 15,
 'A5': 16,
 'A6': 17,
 'A7': 18,
 'A9': 19,
 'B10': 20,
 'B101': 21,
 'B102': 22,
 'B11': 23,
 'B18': 24,
 'B19': 25,
 'B20': 26,
 'B22': 27,
 'B24': 28,
 'B26': 29,
 'B28': 30,
 'B3': 31,
 'B30': 32,
 'B35': 33,
 'B36': 34,
 'B37': 35,
 'B38': 36,
 'B39': 37,
 'B4': 38,
 'B41': 39,
 'B42': 40,
 'B45': 41,
 'B49': 42,
 'B5': 43,
 'B50': 44,
 'B51 B53 B55': 45,
 'B52 B54 B56': 46,
 'B57 B59 B63 B66': 47,
 'B58 B60': 48,
 'B61': 49,
 'B69': 50,
 'B71': 51,
 'B73': 52,
 'B77': 53,
 'B78': 54,
 'B79': 55,
 'B80': 56,
 'B82 B84': 57,
 'B86': 58,
 'B94': 59,
 'B96 B98': 60,
 'C101': 61,
 'C103': 62,
 'C104': 63,
 'C105': 64,
 'C106': 65,
 'C110': 66,
 'C111': 67,
 'C116': 68,
 'C118': 69,
 'C123': 70,
 'C124': 71,
 'C125': 72,
 'C126': 73,
 'C128': 74,
 'C130': 75,
 'C132': 76,
 'C148': 77,
 'C2': 78,
 'C22 C26':

In [16]:
# Fit the cabin encoder on the training frame, then replace the frame with
# its encoded version.
encoder.fit(df_train)
df_train = encoder.transform(df_train)

In [17]:
# The encoder is fitted again on the test frame before transforming it.
# NOTE(review): the refit looks deliberate -- df_test has a different column set
# than df_train, and the codes presumably still come from the explicit mapping
# given when the encoder was constructed; confirm before switching to transform().
encoder.fit(df_test)
df_test = encoder.transform(df_test)

In [18]:
# Inspect the training frame after cabin encoding.
df_train.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,186,S
1,1,1,female,38.0,1,0,71.2833,106,C
2,1,3,female,26.0,0,0,7.925,186,S
3,1,1,female,35.0,1,0,53.1,70,S
4,0,3,male,35.0,0,0,8.05,186,S


In [19]:
# Inspect the test frame after cabin encoding.
df_test.head(n=5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,male,34.5,0,0,7.8292,186,Q
1,3,female,47.0,1,0,7.0,186,S
2,2,male,62.0,0,0,9.6875,186,Q
3,3,male,27.0,0,0,8.6625,186,S
4,3,female,22.0,1,1,12.2875,186,S


In [20]:
# Integer-encode the string categoricals: each encoder is fitted on the
# training column only and then applied to both frames.
cols_to_encode = ['Sex', 'Embarked']
for col in cols_to_encode:
    le = LabelEncoder().fit(df_train[col])
    for frame in (df_train, df_test):
        frame[col] = le.transform(frame[col])
df_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,1,22.0,1,0,7.25,186,2
1,1,1,0,38.0,1,0,71.2833,106,0
2,1,3,0,26.0,0,0,7.925,186,2
3,1,1,0,35.0,1,0,53.1,70,2
4,0,3,1,35.0,0,0,8.05,186,2


In [21]:
# Inspect the test frame after label encoding.
df_test.head(n=5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,1,34.5,0,0,7.8292,186,1
1,3,0,47.0,1,0,7.0,186,2
2,2,1,62.0,0,0,9.6875,186,1
3,3,1,27.0,0,0,8.6625,186,2
4,3,0,22.0,1,1,12.2875,186,2


In [22]:
# Replace missing Age and Fare values with the sentinel -1 in both frames.
# fillna is used instead of replace(np.NaN, -1) because the np.NaN alias was
# removed in NumPy 2.0.
for frame in (df_train, df_test):
    for numeric_col in ('Age', 'Fare'):
        frame[numeric_col] = frame[numeric_col].fillna(-1)

In [23]:
# Show any training rows that still contain a missing value (expected: none).
df_train.loc[df_train.isna().any(axis='columns')]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked


In [24]:
# Show any test rows that still contain a missing value (expected: none).
df_test.loc[df_test.isna().any(axis='columns')]

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked


# Feature Engineering

In [25]:
def age_group(x):
    """Bucket an age into a coarse age-group label.

    Parameters
    ----------
    x : number
        Age in years. A missing age may arrive as NaN/None or as a negative
        sentinel (this notebook fills missing ages with -1).

    Returns
    -------
    str
        'under_18' for 0 <= x < 18, '18_to_30' for 18 <= x <= 30,
        'above_30' for x > 30, and 'unknown' for a missing age.
    """
    # Without this guard the -1 sentinel would be counted as 'under_18' and a
    # NaN would fall through every comparison into 'above_30'.
    if pd.isna(x) or x < 0:
        return 'unknown'
    if x < 18:
        return 'under_18'
    if x <= 30:
        return '18_to_30'
    return 'above_30'
# Derive the age bucket for every training row.
train_age_groups = df_train['Age'].apply(age_group)
df_train['Age_Group'] = train_age_groups
df_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Age_Group
0,0,3,1,22.0,1,0,7.25,186,2,18_to_30
1,1,1,0,38.0,1,0,71.2833,106,0,above_30
2,1,3,0,26.0,0,0,7.925,186,2,18_to_30
3,1,1,0,35.0,1,0,53.1,70,2,above_30
4,0,3,1,35.0,0,0,8.05,186,2,above_30


In [26]:
# Derive the age bucket for every test row.
test_age_groups = df_test['Age'].apply(age_group)
df_test['Age_Group'] = test_age_groups
df_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Age_Group
0,3,1,34.5,0,0,7.8292,186,1,above_30
1,3,0,47.0,1,0,7.0,186,2,above_30
2,2,1,62.0,0,0,9.6875,186,1,above_30
3,3,1,27.0,0,0,8.6625,186,2,18_to_30
4,3,0,22.0,1,1,12.2875,186,2,18_to_30


In [27]:
# Integer-encode the age bucket: the encoder is fitted on the training column
# only and then applied to both frames.
cols_to_encode = ['Age_Group']
for col in cols_to_encode:
    le = LabelEncoder().fit(df_train[col])
    for frame in (df_train, df_test):
        frame[col] = le.transform(frame[col])
df_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Age_Group
0,0,3,1,22.0,1,0,7.25,186,2,0
1,1,1,0,38.0,1,0,71.2833,106,0,1
2,1,3,0,26.0,0,0,7.925,186,2,0
3,1,1,0,35.0,1,0,53.1,70,2,1
4,0,3,1,35.0,0,0,8.05,186,2,1


In [28]:
# Inspect the test frame after encoding the age bucket.
df_test.head(n=5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Age_Group
0,3,1,34.5,0,0,7.8292,186,1,1
1,3,0,47.0,1,0,7.0,186,2,1
2,2,1,62.0,0,0,9.6875,186,1,1
3,3,1,27.0,0,0,8.6625,186,2,0
4,3,0,22.0,1,1,12.2875,186,2,0
