# Final Submission

In this notebook we will process the Entire train and test dataset based on the notebook  "Titanic Keeping All Features" and also train a neural network for the same task and submit our final predictions.

In [1]:
# Importing Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as plt

In [2]:
train_dataset = pd.read_csv('titanic_data/train.csv')

In [3]:
test_dataset = pd.read_csv('titanic_data/test.csv')

In [4]:
train_dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
test_dataset.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


We have Loaded both Train and Test Data in separate dataframes.

## Processing Train Data

In [6]:
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [7]:
train_dataset.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [8]:
train_dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Since EDA Has been already done on the previous files therefore we will only focus our interest on the Data Preprocessing, Feature Engineering and Model fitting on the datasets.

## Processing the Name Column for Titles

In [9]:
# Processing the Titles, Using Set instead of a list.
title = set()
for name in train_dataset["Name"]:
    title.add(name.split(',')[1].split('.')[0].strip())

In [10]:
title

{'Capt',
 'Col',
 'Don',
 'Dr',
 'Jonkheer',
 'Lady',
 'Major',
 'Master',
 'Miss',
 'Mlle',
 'Mme',
 'Mr',
 'Mrs',
 'Ms',
 'Rev',
 'Sir',
 'the Countess'}

In [11]:
# Making a dictionary of all the titles and mapping it to a new title column
Title_dictionary = {
    'Capt': "Officer",
    'Col': "Officer",
    'Dr': "Officer",
    'Major': "Officer",
    'Rev': "Officer",
    'Jonkheer': "Royalty",
    'Don': "Royalty",
    'Sir': "Royalty",
    'the Countess': "Royalty",
    'Lady': "Royalty",
    'Ms': "Mrs",
    'Mrs': "Mrs",
    'Mr': "Mr",
    'Mme': "Mrs",
    'Mlle': "Miss",
    'Miss': "Miss",
    'Master': "Master",
}

def get_titles():
    train_dataset["Title"] = train_dataset["Name"].map(lambda name:name.split(',')[1].split('.')[0].strip())
    
    # Mapping Each title to title dictionary
    train_dataset["Title"] = train_dataset.Title.map(Title_dictionary)
    
    return train_dataset

In [12]:
train_dataset = get_titles()

In [13]:
train_dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr


In [14]:
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
 12  Title        891 non-null    object 
dtypes: float64(2), int64(5), object(6)
memory usage: 90.6+ KB


## Processing The Age Column

In [15]:
grp_train = train_dataset.groupby(['Sex', 'Pclass','Title','Fare'])
grp_train_med = grp_train.median()
grp_train_med = grp_train_med.reset_index()[['Sex', 'Pclass','Title', 'Fare' ,'Age']]
grp_train_med.head()

Unnamed: 0,Sex,Pclass,Title,Fare,Age
0,female,1,Miss,26.2833,19.0
1,female,1,Miss,26.55,58.0
2,female,1,Miss,28.7125,50.0
3,female,1,Miss,30.0,19.0
4,female,1,Miss,31.0,30.0


In [16]:
def fill_age(row):
    condition = (
        (grp_train_med['Sex'] == row['Sex']) & 
        (grp_train_med['Title'] == row['Title']) & 
        (grp_train_med['Pclass'] == row['Pclass'])&
        (grp_train_med['Fare'] == row['Fare'])
    )
    if np.isnan(grp_train_med[condition]['Age'].values[0]):
        condition = (
            (grp_train_med['Sex'] == row['Sex']) &
            (grp_train_med['Pclass'] == row['Pclass'])
        )
    return grp_train_med[condition]['Age'].values[0]

def process_age():
    global train_dataset
    
    train_dataset['Age'] = train_dataset.apply(lambda row: fill_age(row) if np.isnan(row['Age']) else row['Age'], axis=1)
    
    return train_dataset

In [17]:
train_dataset = process_age()

In [18]:
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
 12  Title        891 non-null    object 
dtypes: float64(2), int64(5), object(6)
memory usage: 90.6+ KB


## Processing The Embarked Column

In [19]:
print(train_dataset.Embarked.isnull().sum())

2


In [20]:
train_dataset["Embarked"].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [21]:
most_freq_embarked = train_dataset["Embarked"][0]
most_freq_embarked

'S'

In [22]:
train_dataset["Embarked"].fillna(most_freq_embarked, inplace=True)

In [23]:
train_dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr


In [24]:
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     891 non-null    object 
 12  Title        891 non-null    object 
dtypes: float64(2), int64(5), object(6)
memory usage: 90.6+ KB


## Processing The Cabin Column

In [25]:
train_dataset["Cabin"].nunique()

147

In [26]:
train_dataset["Cabin"].unique()

array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64',

The First letter in the Cabin i.e-A, B, C, etc. represents Deck so the feature can be extracted and further combined with Pclass to make a new Feature.

In [27]:
train_dataset["Deck"] = train_dataset["Cabin"].apply(lambda s: s[0] if pd.notnull(s) else 'M') #M for missing

In [28]:
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     891 non-null    object 
 12  Title        891 non-null    object 
 13  Deck         891 non-null    object 
dtypes: float64(2), int64(5), object(7)
memory usage: 97.6+ KB


## Checking the ticket column.

In [29]:
train_dataset["Ticket"].describe()

count      891
unique     681
top       1601
freq         7
Name: Ticket, dtype: object

In [30]:
train_dataset["Ticket_Frequency"] = train_dataset.groupby('Ticket')['Ticket'].transform('count')

In [31]:
train_dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Deck,Ticket_Frequency
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr,M,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,M,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs,C,2
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr,M,1


## Creating a Family column by adding SibSp and Parch Column.

In [32]:
train_dataset["Family"] = train_dataset["SibSp"] + train_dataset["Parch"] +1 # +1 for single Passengers

In [33]:
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   PassengerId       891 non-null    int64  
 1   Survived          891 non-null    int64  
 2   Pclass            891 non-null    int64  
 3   Name              891 non-null    object 
 4   Sex               891 non-null    object 
 5   Age               891 non-null    float64
 6   SibSp             891 non-null    int64  
 7   Parch             891 non-null    int64  
 8   Ticket            891 non-null    object 
 9   Fare              891 non-null    float64
 10  Cabin             204 non-null    object 
 11  Embarked          891 non-null    object 
 12  Title             891 non-null    object 
 13  Deck              891 non-null    object 
 14  Ticket_Frequency  891 non-null    int64  
 15  Family            891 non-null    int64  
dtypes: float64(2), int64(7), object(7)
memory us

In [34]:
train_dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Deck,Ticket_Frequency,Family
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr,M,1,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,C,1,2
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,M,1,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs,C,2,2
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr,M,1,1


## Encoding the Categorical columns

In [35]:
Sex = pd.get_dummies(train_dataset["Sex"], drop_first=True)
Embark = pd.get_dummies(train_dataset["Embarked"], drop_first=True)
Title = pd.get_dummies(train_dataset["Title"], drop_first=True)
Deck = pd.get_dummies(train_dataset["Deck"], drop_first=True)

## Adding the new dataFrames to the train_dataset

In [36]:
train_dataset = pd.concat([train_dataset, Sex, Embark, Title, Deck], axis=1)

### Deleting the useless columns

In [37]:
train_dataset.drop(["PassengerId", "Name", "Ticket", "Cabin", "Embarked", "Title", "Deck", "SibSp", "Parch", "Sex"], axis =1, inplace=True)

In [38]:
train_dataset.head()

Unnamed: 0,Survived,Pclass,Age,Fare,Ticket_Frequency,Family,male,Q,S,Miss,...,Officer,Royalty,B,C,D,E,F,G,M,T
0,0,3,22.0,7.25,1,2,1,0,1,0,...,0,0,0,0,0,0,0,0,1,0
1,1,1,38.0,71.2833,1,2,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,1,3,26.0,7.925,1,1,0,0,1,1,...,0,0,0,0,0,0,0,0,1,0
3,1,1,35.0,53.1,2,2,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
4,0,3,35.0,8.05,1,1,1,0,1,0,...,0,0,0,0,0,0,0,0,1,0


In [39]:
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Survived          891 non-null    int64  
 1   Pclass            891 non-null    int64  
 2   Age               891 non-null    float64
 3   Fare              891 non-null    float64
 4   Ticket_Frequency  891 non-null    int64  
 5   Family            891 non-null    int64  
 6   male              891 non-null    uint8  
 7   Q                 891 non-null    uint8  
 8   S                 891 non-null    uint8  
 9   Miss              891 non-null    uint8  
 10  Mr                891 non-null    uint8  
 11  Mrs               891 non-null    uint8  
 12  Officer           891 non-null    uint8  
 13  Royalty           891 non-null    uint8  
 14  B                 891 non-null    uint8  
 15  C                 891 non-null    uint8  
 16  D                 891 non-null    uint8  
 1

# Processing Test Data

In [40]:
test_dataset.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [41]:
test_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


We will process the data similar to the preprocessing and Feature Engineering done on the train_dataset.

## Processing the Name Column for Titles in the Test_dataset

In [42]:
# Processing the Titles, Using Set instead of a list.
title_test = set()
for name in test_dataset["Name"]:
    title_test.add(name.split(',')[1].split('.')[0].strip())

In [43]:
title_test

{'Col', 'Dona', 'Dr', 'Master', 'Miss', 'Mr', 'Mrs', 'Ms', 'Rev'}

Dona is Spanish title of address equivalent to Mrs or Madam. Therefor just for this case we will fill it manually, as it is just one row.

In [44]:
# Making a dictionary of all the titles and mapping it to a new title column
Title_dictionary = {
    'Capt': "Officer",
    'Col': "Officer",
    'Dr': "Officer",
    'Major': "Officer",
    'Rev': "Officer",
    'Jonkheer': "Royalty",
    'Don': "Royalty",
    'Sir': "Royalty",
    'the Countess': "Royalty",
    'Lady': "Royalty",
    'Ms': "Mrs",
    'Mrs': "Mrs",
    'Mr': "Mr",
    'Mme': "Mrs",
    'Mlle': "Miss",
    'Miss': "Miss",
    'Master': "Master",
    'Dona':"Mrs"
}

def get_test_titles():
    test_dataset["Title"] = test_dataset["Name"].map(lambda name:name.split(',')[1].split('.')[0].strip())
    
    # Mapping Each title to title dictionary
    test_dataset["Title"] = test_dataset.Title.map(Title_dictionary)
    
    return test_dataset

In [45]:
test_dataset = get_test_titles()

In [46]:
test_dataset.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,Mr
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,Mrs
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,Mr
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,Mr
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,Mrs


In [47]:
test_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
 11  Title        418 non-null    object 
dtypes: float64(2), int64(4), object(6)
memory usage: 39.3+ KB


## Processing The Age Column

In [48]:
grp_test = test_dataset.groupby(['Sex', 'Pclass','Title','Fare'])
grp_test_med = grp_test.median()
grp_test_med = grp_test_med.reset_index()[['Sex', 'Pclass','Title', 'Fare' ,'Age']]
grp_test_med.head()

Unnamed: 0,Sex,Pclass,Title,Fare,Age
0,female,1,Miss,26.55,21.0
1,female,1,Miss,27.7208,33.0
2,female,1,Miss,31.6792,36.0
3,female,1,Miss,59.4,22.0
4,female,1,Miss,61.9792,22.0


In [49]:
def fill_test_age(row):
    condition = (
        (grp_test_med['Sex'] == row['Sex']) & 
        (grp_test_med['Title'] == row['Title']) & 
        (grp_test_med['Pclass'] == row['Pclass'])&
        (grp_test_med['Fare'] == row['Fare'])
    )
    if np.isnan(grp_test_med[condition]['Age'].values[0]):
        condition = (
            (grp_test_med['Sex'] == row['Sex']) &
            (grp_test_med['Pclass'] == row['Pclass'])
        )
    return grp_test_med[condition]['Age'].values[0]

def process_test_age():
    global test_dataset
    
    test_dataset['Age'] = test_dataset.apply(lambda row: fill_test_age(row) if np.isnan(row['Age']) else row['Age'], axis=1)
    
    return test_dataset

In [50]:
test_dataset = process_test_age()

In [51]:
test_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          418 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
 11  Title        418 non-null    object 
dtypes: float64(2), int64(4), object(6)
memory usage: 39.3+ KB


## Processing The Cabin Column

In [52]:
test_dataset["Cabin"].nunique()

76

In [53]:
test_dataset["Cabin"].unique()

array([nan, 'B45', 'E31', 'B57 B59 B63 B66', 'B36', 'A21', 'C78', 'D34',
       'D19', 'A9', 'D15', 'C31', 'C23 C25 C27', 'F G63', 'B61', 'C53',
       'D43', 'C130', 'C132', 'C101', 'C55 C57', 'B71', 'C46', 'C116',
       'F', 'A29', 'G6', 'C6', 'C28', 'C51', 'E46', 'C54', 'C97', 'D22',
       'B10', 'F4', 'E45', 'E52', 'D30', 'B58 B60', 'E34', 'C62 C64',
       'A11', 'B11', 'C80', 'F33', 'C85', 'D37', 'C86', 'D21', 'C89',
       'F E46', 'A34', 'D', 'B26', 'C22 C26', 'B69', 'C32', 'B78',
       'F E57', 'F2', 'A18', 'C106', 'B51 B53 B55', 'D10 D12', 'E60',
       'E50', 'E39 E41', 'B52 B54 B56', 'C39', 'B24', 'D28', 'B41', 'C7',
       'D40', 'D38', 'C105'], dtype=object)

In [54]:
test_dataset["Deck"] = test_dataset["Cabin"].apply(lambda s: s[0] if pd.notnull(s) else 'M') #M for missing

In [55]:
test_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          418 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
 11  Title        418 non-null    object 
 12  Deck         418 non-null    object 
dtypes: float64(2), int64(4), object(7)
memory usage: 42.6+ KB


## Checking the ticket column.

In [56]:
test_dataset["Ticket"].describe()

count          418
unique         363
top       PC 17608
freq             5
Name: Ticket, dtype: object

In [57]:
test_dataset["Ticket_Frequency"] = test_dataset.groupby('Ticket')['Ticket'].transform('count')

In [58]:
test_dataset.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Deck,Ticket_Frequency
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,Mr,M,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,Mrs,M,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,Mr,M,1
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,Mr,M,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,Mrs,M,1


In [59]:
test_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   PassengerId       418 non-null    int64  
 1   Pclass            418 non-null    int64  
 2   Name              418 non-null    object 
 3   Sex               418 non-null    object 
 4   Age               418 non-null    float64
 5   SibSp             418 non-null    int64  
 6   Parch             418 non-null    int64  
 7   Ticket            418 non-null    object 
 8   Fare              417 non-null    float64
 9   Cabin             91 non-null     object 
 10  Embarked          418 non-null    object 
 11  Title             418 non-null    object 
 12  Deck              418 non-null    object 
 13  Ticket_Frequency  418 non-null    int64  
dtypes: float64(2), int64(5), object(7)
memory usage: 45.8+ KB


## Creating a Family column by adding SibSp and Parch Column.

In [60]:
test_dataset["Family"] = test_dataset["SibSp"] + test_dataset["Parch"] +1 # +1 for single Passengers

In [61]:
test_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   PassengerId       418 non-null    int64  
 1   Pclass            418 non-null    int64  
 2   Name              418 non-null    object 
 3   Sex               418 non-null    object 
 4   Age               418 non-null    float64
 5   SibSp             418 non-null    int64  
 6   Parch             418 non-null    int64  
 7   Ticket            418 non-null    object 
 8   Fare              417 non-null    float64
 9   Cabin             91 non-null     object 
 10  Embarked          418 non-null    object 
 11  Title             418 non-null    object 
 12  Deck              418 non-null    object 
 13  Ticket_Frequency  418 non-null    int64  
 14  Family            418 non-null    int64  
dtypes: float64(2), int64(6), object(7)
memory usage: 49.1+ KB


## Encoding the Categorical columns in test_dataset

In [62]:
Sex_test = pd.get_dummies(test_dataset["Sex"], drop_first=True)
Embark_test = pd.get_dummies(test_dataset["Embarked"], drop_first=True)
Title_test = pd.get_dummies(test_dataset["Title"], drop_first=True)
Deck_test = pd.get_dummies(test_dataset["Deck"], drop_first=True)

In [63]:
test_dataset = pd.concat([test_dataset, Sex_test, Embark_test, Title_test, Deck_test], axis=1)

In [64]:
test_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 29 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   PassengerId       418 non-null    int64  
 1   Pclass            418 non-null    int64  
 2   Name              418 non-null    object 
 3   Sex               418 non-null    object 
 4   Age               418 non-null    float64
 5   SibSp             418 non-null    int64  
 6   Parch             418 non-null    int64  
 7   Ticket            418 non-null    object 
 8   Fare              417 non-null    float64
 9   Cabin             91 non-null     object 
 10  Embarked          418 non-null    object 
 11  Title             418 non-null    object 
 12  Deck              418 non-null    object 
 13  Ticket_Frequency  418 non-null    int64  
 14  Family            418 non-null    int64  
 15  male              418 non-null    uint8  
 16  Q                 418 non-null    uint8  
 1

### Deleting the useless columns

In [65]:
test_dataset.drop(["Name", "Ticket", "Cabin", "Embarked", "Title", "Deck", "SibSp", "Parch", "Sex"], axis =1, inplace=True)

In [66]:
test_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   PassengerId       418 non-null    int64  
 1   Pclass            418 non-null    int64  
 2   Age               418 non-null    float64
 3   Fare              417 non-null    float64
 4   Ticket_Frequency  418 non-null    int64  
 5   Family            418 non-null    int64  
 6   male              418 non-null    uint8  
 7   Q                 418 non-null    uint8  
 8   S                 418 non-null    uint8  
 9   Miss              418 non-null    uint8  
 10  Mr                418 non-null    uint8  
 11  Mrs               418 non-null    uint8  
 12  Officer           418 non-null    uint8  
 13  B                 418 non-null    uint8  
 14  C                 418 non-null    uint8  
 15  D                 418 non-null    uint8  
 16  E                 418 non-null    uint8  
 1

In [67]:
## Filling the Fare column with the most Frequent Value

In [68]:
median_fare = test_dataset["Fare"].median()

In [69]:
test_dataset["Fare"].fillna(median_fare, inplace=True)

In [70]:
test_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   PassengerId       418 non-null    int64  
 1   Pclass            418 non-null    int64  
 2   Age               418 non-null    float64
 3   Fare              418 non-null    float64
 4   Ticket_Frequency  418 non-null    int64  
 5   Family            418 non-null    int64  
 6   male              418 non-null    uint8  
 7   Q                 418 non-null    uint8  
 8   S                 418 non-null    uint8  
 9   Miss              418 non-null    uint8  
 10  Mr                418 non-null    uint8  
 11  Mrs               418 non-null    uint8  
 12  Officer           418 non-null    uint8  
 13  B                 418 non-null    uint8  
 14  C                 418 non-null    uint8  
 15  D                 418 non-null    uint8  
 16  E                 418 non-null    uint8  
 1

In [71]:
test_dataset.head()

Unnamed: 0,PassengerId,Pclass,Age,Fare,Ticket_Frequency,Family,male,Q,S,Miss,Mr,Mrs,Officer,B,C,D,E,F,G,M
0,892,3,34.5,7.8292,1,1,1,1,0,0,1,0,0,0,0,0,0,0,0,1
1,893,3,47.0,7.0,1,2,0,0,1,0,0,1,0,0,0,0,0,0,0,1
2,894,2,62.0,9.6875,1,1,1,1,0,0,1,0,0,0,0,0,0,0,0,1
3,895,3,27.0,8.6625,1,1,1,0,1,0,1,0,0,0,0,0,0,0,0,1
4,896,3,22.0,12.2875,1,3,0,0,1,0,0,1,0,0,0,0,0,0,0,1


In [72]:
# Adding an extra column T with all the values set to 0
test_dataset["T"] = np.zeros((418,1))

In [73]:
test_dataset["Royalty"] = np.zeros((418,1))

In [74]:
test_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   PassengerId       418 non-null    int64  
 1   Pclass            418 non-null    int64  
 2   Age               418 non-null    float64
 3   Fare              418 non-null    float64
 4   Ticket_Frequency  418 non-null    int64  
 5   Family            418 non-null    int64  
 6   male              418 non-null    uint8  
 7   Q                 418 non-null    uint8  
 8   S                 418 non-null    uint8  
 9   Miss              418 non-null    uint8  
 10  Mr                418 non-null    uint8  
 11  Mrs               418 non-null    uint8  
 12  Officer           418 non-null    uint8  
 13  B                 418 non-null    uint8  
 14  C                 418 non-null    uint8  
 15  D                 418 non-null    uint8  
 16  E                 418 non-null    uint8  
 1

In [75]:
test_dataset.head()

Unnamed: 0,PassengerId,Pclass,Age,Fare,Ticket_Frequency,Family,male,Q,S,Miss,...,Officer,B,C,D,E,F,G,M,T,Royalty
0,892,3,34.5,7.8292,1,1,1,1,0,0,...,0,0,0,0,0,0,0,1,0.0,0.0
1,893,3,47.0,7.0,1,2,0,0,1,0,...,0,0,0,0,0,0,0,1,0.0,0.0
2,894,2,62.0,9.6875,1,1,1,1,0,0,...,0,0,0,0,0,0,0,1,0.0,0.0
3,895,3,27.0,8.6625,1,1,1,0,1,0,...,0,0,0,0,0,0,0,1,0.0,0.0
4,896,3,22.0,12.2875,1,3,0,0,1,0,...,0,0,0,0,0,0,0,1,0.0,0.0


## Creating Dependent and Independent Variables

In [76]:
Test_X = test_dataset.drop('PassengerId', axis=1).values

In [77]:
X = train_dataset.drop("Survived", axis=1).values
y = train_dataset['Survived'].values

# Scaling the Dataset

In [78]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit_transform(X)
scaler.transform(Test_X)

array([[ 0.82737724,  0.44133029, -0.49078316, ..., 14.89127261,
        -1.835115  , -0.03352008],
       [ 0.82737724,  1.29093871, -0.50747884, ..., 14.89127261,
        -1.835115  , -0.03352008],
       [-0.36936484,  2.31046881, -0.45336687, ..., 14.89127261,
        -1.835115  , -0.03352008],
       ...,
       [ 0.82737724,  0.71320498, -0.50244517, ..., 14.89127261,
        -1.835115  , -0.03352008],
       [ 0.82737724, -0.27234079, -0.48633742, ..., 14.89127261,
        -1.835115  , -0.03352008],
       [ 0.82737724, -1.29187089, -0.19824428, ..., 14.89127261,
        -1.835115  , -0.03352008]])

## Fitting the Model

In [79]:
from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier(n_estimators=200, min_samples_leaf=3, max_features=0.5, n_jobs=-1)
forest_clf.fit(X, y)

RandomForestClassifier(max_features=0.5, min_samples_leaf=3, n_estimators=200,
                       n_jobs=-1)

In [80]:
Test_Y = forest_clf.predict(Test_X) # Predicting on the Final Test Data for submissions.
passenger_id = test_dataset.PassengerId
Final = pd.DataFrame({'PassengerId': passenger_id, 'Survived': Test_Y})
Final.to_csv('Final_submission.csv', index = False)

In [81]:
from xgboost import XGBClassifier
xg_classifier = XGBClassifier()
xg_classifier.fit(X, y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [82]:
Test_Y = xg_classifier.predict(Test_X) # Predicting on the Final Test Data for submissions.
passenger_id = test_dataset.PassengerId
Final = pd.DataFrame({'PassengerId': passenger_id, 'Survived': Test_Y})
Final.to_csv('xg_classifier_submission1.csv', index = False)

In [83]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()
log_reg.fit(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [84]:
Test_Y = log_reg.predict(Test_X) # Predicting on the Final Test Data for submissions.
passenger_id = test_dataset.PassengerId
Final = pd.DataFrame({'PassengerId': passenger_id, 'Survived': Test_Y})
Final.to_csv('log_reg1.csv', index = False)