#### For Kaggle competitions it is common practice to combine test and train data before preprocessing (imputing missing values).This allows you to have more data from the same pool. Reduces some repetition of tasks. And takes care of categories that might appear in the test data but not in the train data.

3 changes to further optimize model:
1. Use test+train data to impute missing train Data
2. Linear Regression to impute missing Age data
3. Boosting/XGBoost
4. New Feature Family Size
5. Bunch of Classification Algorithms
6. Split train into test-train for validation or use sklearn's cross_val_score.
7. Age missing sample has lower survival rate than Age not missing sample. So we might want to create a new column with binary information abnout Age missing.



# Titanic - Data Cleaning and Exploration

## Import Libraries

In [1291]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

## Import Data

In [1292]:
raw_train=pd.read_csv('train.csv')
raw_test=pd.read_csv('test.csv')

In [1293]:
train=raw_train
test=raw_test

## Inspection

In [1294]:
#print training dataset
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [1295]:
#Get Summary Stats
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [1296]:
#Get information about the Dataframe
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


## Cleaning

### Irrelevant Data

In [1297]:
#Remove irrelevant Fields and Rows

### Duplicates

In [1298]:
#Find and Remove Duplicated Rows if appropriate.

In [1299]:
train[train.duplicated()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked


### Data Type Constraints

In [1300]:
#Values in a particular column must be of a particular datatype

In [1301]:
#Find the Data types (alternative to info())
train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

### Unit Uniformity

In [1302]:
#Ensure data is specified using the same unit of measure.

### Range Constraints

In [1303]:
#Check if values are in appropirate Range
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [1304]:
#Most seem ok. Just checking minimum for Age.

train.sort_values(by=['Age'])

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
803,804,1,3,"Thomas, Master. Assad Alexander",male,0.42,0,1,2625,8.5167,,C
755,756,1,2,"Hamalainen, Master. Viljo",male,0.67,1,1,250649,14.5000,,S
644,645,1,3,"Baclini, Miss. Eugenie",female,0.75,2,1,2666,19.2583,,C
469,470,1,3,"Baclini, Miss. Helene Barbara",female,0.75,2,1,2666,19.2583,,C
78,79,1,2,"Caldwell, Master. Alden Gates",male,0.83,0,2,248738,29.0000,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S


In [1305]:
#Seems Acceptable.

### Unique Constraints

In [1306]:
#A field, or a combination of fields, must be unique across a dataset

In [1307]:
#Find number of Unique values in each column
train.nunique()

PassengerId    891
Survived         2
Pclass           3
Name           891
Sex              2
Age             88
SibSp            7
Parch            7
Ticket         681
Fare           248
Cabin          147
Embarked         3
dtype: int64

In [1308]:
#As expected, PassengerID is Unique.

### Set-Membership Constraint

In [1309]:
#Values of a column come from a set of discrete values

In [1310]:
#Find the Unique Values in Pclass
train['Pclass'].unique()

array([3, 1, 2], dtype=int64)

In [1311]:
#We can run similar checks for other set-membership constrained fields such as Embarked and Sex.
#But only necessary if train.nunique() gives unexpected numbers.

### Spaces

In [1312]:
#Remove Leading and Trailing Spaces

### Pad Strings

In [1313]:
#For example, some numerical codes are often represented with prepending zeros to ensure they always have 
#the same number of digits.

### Fix Typos

In [1314]:
#Manual Mapping, Pattern Matching or Fuzzy Matching

### Foreign-key constraints

In [1315]:
#As in relational databases, a foreign key column can’t have a value that does not exist in the referenced primary key.

### Regular expression patterns

In [1316]:
#text fields that have to be in a certain pattern. For example, phone numbers may be required to have the pattern (999) 999–9999.

### Cross-field validation

In [1317]:
#Certain conditions that span across multiple fields must hold. For example, a patient’s date of discharge from the hospital cannot be earlier than the date of admission.

### Missing Data

In [1318]:
#Use one of the many approaches to deal with Missing Data

In [1319]:
#Mandatory Constraint: Certain Columns cannot be empty

In [1320]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [1321]:
#alternative
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [1322]:
#Check the two rows with missing Embarked values
train[pd.isnull(train['Embarked'])]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


In [1323]:
#Out of all the fields, only Pclass could be related to which port a passenger Embarked from if those three ports 
#are of differnt economic demographics.
#Check the distribution of Embarked across Pclass to see if Pclass 1 predominantly from a specific port.

train.groupby(['Pclass','Embarked']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,PassengerId,Survived,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin
Pclass,Embarked,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,C,85,85,85,85,74,85,85,85,85,66
1,Q,2,2,2,2,2,2,2,2,2,2
1,S,127,127,127,127,108,127,127,127,127,106
2,C,17,17,17,17,15,17,17,17,17,2
2,Q,3,3,3,3,2,3,3,3,3,1
2,S,164,164,164,164,156,164,164,164,164,13
3,C,66,66,66,66,41,66,66,66,66,1
3,Q,72,72,72,72,24,72,72,72,72,1
3,S,353,353,353,353,290,353,353,353,353,10


In [1324]:
#Given that the two passengers are Pclass 1, they are most likely to have embarked from S.

In [1325]:
#formally - the mode by Pclass
train.groupby('Pclass').agg(pd.Series.mode)

Unnamed: 0_level_0,Survived,Sex,SibSp,Parch,Fare,Embarked
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,male,0,0,26.55,S
2,0,male,0,0,13.0,S
3,0,male,0,0,8.05,S


In [1326]:
#However the mode for Pclass 1 is the same as the mode for the whole data.
#We assign the modal value to the missing Embarked data (the [0] after mode because mode is a pandas series)
mode=train['Embarked'].mode()
train['Embarked'].fillna(mode[0],inplace=True)

In [1327]:
#Show the replaced values
train[train['PassengerId'].isin([62,830])]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,S
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,S


In [1328]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [1329]:
#Looking at the Correlation Matrix below, it seems Age is correlated with Pclass
train.corr()

#You could also encode other categorical variables to seem them in the matrix

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,0.036847,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,0.036847,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.057527,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0


In [1330]:
#SibSp and Parch are also correlated with Age.
#We could use Regression with Age as target and Pclass, Parch and SibSp as Features.
#https://towardsdatascience.com/predict-missing-values-in-the-dataset-897912a54b7b

#But for now we will simple assign the mean age for the Pclass of the missing data rows

In [1331]:
#Average age different for different Pclass
train.groupby('Pclass').mean()

Unnamed: 0_level_0,PassengerId,Survived,Age,SibSp,Parch,Fare
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,461.597222,0.62963,38.233441,0.416667,0.356481,84.154687
2,445.956522,0.472826,29.87763,0.402174,0.380435,20.662183
3,439.154786,0.242363,25.14062,0.615071,0.393075,13.67555


In [1332]:
#Replace Missing Vales of Age with Mean by Pclass
train['Age'] = train['Age'].fillna(train.groupby(['Pclass'])['Age'].transform('mean'))


#for test data
test['Age'] = test['Age'].fillna(train.groupby(['Pclass'])['Age'].transform('mean'))


In [1333]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [1334]:
#Replace Missing Values for Fare in test Data by Mean of Fare in train data
test['Fare'] = test['Fare'].fillna(train['Fare'].mean())

In [1335]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [1336]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
dtype: int64

In [1337]:
#Design New Feature
train['Cabin_Class'] = train['Cabin'].str[0]

#for test
test['Cabin_Class'] = test['Cabin'].str[0]

In [1338]:
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Cabin_Class
0,1,0,3,"Braund, Mr. Owen Harris",male,22.00000,1,0,A/5 21171,7.2500,,S,
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.00000,1,0,PC 17599,71.2833,C85,C,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.00000,0,0,STON/O2. 3101282,7.9250,,S,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.00000,1,0,113803,53.1000,C123,S,C
4,5,0,3,"Allen, Mr. William Henry",male,35.00000,0,0,373450,8.0500,,S,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.00000,0,0,211536,13.0000,,S,
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.00000,0,0,112053,30.0000,B42,S,B
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,25.14062,1,2,W./C. 6607,23.4500,,S,
889,890,1,1,"Behr, Mr. Karl Howell",male,26.00000,0,0,111369,30.0000,C148,C,C


In [1339]:
pd.crosstab(train['Cabin_Class'], train['Pclass'], margins = False)

Pclass,1,2,3
Cabin_Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,15,0,0
B,47,0,0
C,59,0,0
D,29,4,0
E,25,4,3
F,0,8,5
G,0,0,4
T,1,0,0


In [1340]:
#Average Fares per Cabin Class
train.groupby(['Cabin_Class']).mean()

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
Cabin_Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
A,439.6,0.466667,1.0,43.513355,0.133333,0.133333,39.623887
B,521.808511,0.744681,1.0,35.09504,0.361702,0.574468,113.505764
C,406.440678,0.59322,1.0,36.377755,0.644068,0.474576,100.151341
D,475.939394,0.757576,1.121212,38.983845,0.424242,0.30303,57.244576
E,502.4375,0.75,1.3125,37.862846,0.3125,0.3125,46.026694
F,370.384615,0.615385,2.384615,20.752403,0.538462,0.538462,18.696792
G,216.0,0.5,3.0,14.75,0.5,1.25,13.58125
T,340.0,0.0,1.0,45.0,0.0,0.0,35.5


In [1341]:
#Replace the Missing values in Cabin_Class with "U" for Unknown
train['Cabin_Class'].fillna('U',inplace=True)


#for test
test['Cabin_Class'].fillna('U',inplace=True)

### One Hot Ecoding Emabrked and Cabin_Class 

In [1342]:
#Splitting Y from X before One Hot Encoding to ensure similar Encoding Dictionary and column numbers for train and test

y_train = train['Survived']
y_train


0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [1343]:
train.drop(columns=['Survived'], inplace=True)
train

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Cabin_Class
0,1,3,"Braund, Mr. Owen Harris",male,22.00000,1,0,A/5 21171,7.2500,,S,U
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.00000,1,0,PC 17599,71.2833,C85,C,C
2,3,3,"Heikkinen, Miss. Laina",female,26.00000,0,0,STON/O2. 3101282,7.9250,,S,U
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.00000,1,0,113803,53.1000,C123,S,C
4,5,3,"Allen, Mr. William Henry",male,35.00000,0,0,373450,8.0500,,S,U
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,2,"Montvila, Rev. Juozas",male,27.00000,0,0,211536,13.0000,,S,U
887,888,1,"Graham, Miss. Margaret Edith",female,19.00000,0,0,112053,30.0000,B42,S,B
888,889,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,25.14062,1,2,W./C. 6607,23.4500,,S,U
889,890,1,"Behr, Mr. Karl Howell",male,26.00000,0,0,111369,30.0000,C148,C,C


In [1344]:
#Create Instance and Fit
# drop_lastto return k-1, false to return k
ohe = OneHotEncoder(top_categories=None, variables=['Cabin_Class', 'Embarked'], drop_last=True)
ohe.fit(train)

#for test
ohe.fit(test)

OneHotEncoder(drop_last=True, variables=['Cabin_Class', 'Embarked'])

In [1345]:
#Transform
temp = ohe.transform(train)
train=temp
train

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Cabin_Class_U,Cabin_Class_B,Cabin_Class_E,Cabin_Class_A,Cabin_Class_C,Cabin_Class_D,Cabin_Class_F,Embarked_Q,Embarked_S
0,1,3,"Braund, Mr. Owen Harris",male,22.00000,1,0,A/5 21171,7.2500,,1,0,0,0,0,0,0,0,1
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.00000,1,0,PC 17599,71.2833,C85,0,0,0,0,1,0,0,0,0
2,3,3,"Heikkinen, Miss. Laina",female,26.00000,0,0,STON/O2. 3101282,7.9250,,1,0,0,0,0,0,0,0,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.00000,1,0,113803,53.1000,C123,0,0,0,0,1,0,0,0,1
4,5,3,"Allen, Mr. William Henry",male,35.00000,0,0,373450,8.0500,,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,2,"Montvila, Rev. Juozas",male,27.00000,0,0,211536,13.0000,,1,0,0,0,0,0,0,0,1
887,888,1,"Graham, Miss. Margaret Edith",female,19.00000,0,0,112053,30.0000,B42,0,1,0,0,0,0,0,0,1
888,889,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,25.14062,1,2,W./C. 6607,23.4500,,1,0,0,0,0,0,0,0,1
889,890,1,"Behr, Mr. Karl Howell",male,26.00000,0,0,111369,30.0000,C148,0,0,0,0,1,0,0,0,0


In [1346]:
#for test
temp2 = ohe.transform(test)

test=temp2
test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Cabin_Class_U,Cabin_Class_B,Cabin_Class_E,Cabin_Class_A,Cabin_Class_C,Cabin_Class_D,Cabin_Class_F,Embarked_Q,Embarked_S
0,892,3,"Kelly, Mr. James",male,34.50000,0,0,330911,7.8292,,1,0,0,0,0,0,0,1,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.00000,1,0,363272,7.0000,,1,0,0,0,0,0,0,0,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.00000,0,0,240276,9.6875,,1,0,0,0,0,0,0,1,0
3,895,3,"Wirz, Mr. Albert",male,27.00000,0,0,315154,8.6625,,1,0,0,0,0,0,0,0,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.00000,1,1,3101298,12.2875,,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,29.87763,0,0,A.5. 3236,8.0500,,1,0,0,0,0,0,0,0,1
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.00000,0,0,PC 17758,108.9000,C105,0,0,0,0,1,0,0,0,0
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.50000,0,0,SOTON/O.Q. 3101262,7.2500,,1,0,0,0,0,0,0,0,1
416,1308,3,"Ware, Mr. Frederick",male,29.87763,0,0,359309,8.0500,,1,0,0,0,0,0,0,0,1


In [1347]:
train.isnull().sum()

PassengerId        0
Pclass             0
Name               0
Sex                0
Age                0
SibSp              0
Parch              0
Ticket             0
Fare               0
Cabin            687
Cabin_Class_U      0
Cabin_Class_B      0
Cabin_Class_E      0
Cabin_Class_A      0
Cabin_Class_C      0
Cabin_Class_D      0
Cabin_Class_F      0
Embarked_Q         0
Embarked_S         0
dtype: int64

In [1348]:
#Cabin Feature does not carry much information since many Distinct values and majority values missing.
#Therefore, we drop it along with the other uninformative columns.

train.drop(columns=['Cabin', 'Ticket', 'Name', 'PassengerId'], inplace=True)
train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin_Class_U,Cabin_Class_B,Cabin_Class_E,Cabin_Class_A,Cabin_Class_C,Cabin_Class_D,Cabin_Class_F,Embarked_Q,Embarked_S
0,3,male,22.00000,1,0,7.2500,1,0,0,0,0,0,0,0,1
1,1,female,38.00000,1,0,71.2833,0,0,0,0,1,0,0,0,0
2,3,female,26.00000,0,0,7.9250,1,0,0,0,0,0,0,0,1
3,1,female,35.00000,1,0,53.1000,0,0,0,0,1,0,0,0,1
4,3,male,35.00000,0,0,8.0500,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,2,male,27.00000,0,0,13.0000,1,0,0,0,0,0,0,0,1
887,1,female,19.00000,0,0,30.0000,0,1,0,0,0,0,0,0,1
888,3,female,25.14062,1,2,23.4500,1,0,0,0,0,0,0,0,1
889,1,male,26.00000,0,0,30.0000,0,0,0,0,1,0,0,0,0


In [1349]:
#For test
test.drop(columns=['Cabin', 'Ticket', 'Name', 'PassengerId'], inplace=True)
test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin_Class_U,Cabin_Class_B,Cabin_Class_E,Cabin_Class_A,Cabin_Class_C,Cabin_Class_D,Cabin_Class_F,Embarked_Q,Embarked_S
0,3,male,34.50000,0,0,7.8292,1,0,0,0,0,0,0,1,0
1,3,female,47.00000,1,0,7.0000,1,0,0,0,0,0,0,0,1
2,2,male,62.00000,0,0,9.6875,1,0,0,0,0,0,0,1,0
3,3,male,27.00000,0,0,8.6625,1,0,0,0,0,0,0,0,1
4,3,female,22.00000,1,1,12.2875,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,3,male,29.87763,0,0,8.0500,1,0,0,0,0,0,0,0,1
414,1,female,39.00000,0,0,108.9000,0,0,0,0,1,0,0,0,0
415,3,male,38.50000,0,0,7.2500,1,0,0,0,0,0,0,0,1
416,3,male,29.87763,0,0,8.0500,1,0,0,0,0,0,0,0,1


### Check for Multicollinearity

In [1350]:
corr = train.corr().abs()

s = corr.unstack()
s

Pclass      Pclass           1.000000
            Age              0.403858
            SibSp            0.083081
            Parch            0.018443
            Fare             0.549500
                               ...   
Embarked_S  Cabin_Class_C    0.068502
            Cabin_Class_D    0.052254
            Cabin_Class_F    0.033010
            Embarked_Q       0.499421
            Embarked_S       1.000000
Length: 196, dtype: float64

In [1351]:
so = pd.DataFrame(s.sort_values(kind="quicksort",ascending=False))
so[so[0]!=1].head(10)

Unnamed: 0,Unnamed: 1,0
Pclass,Cabin_Class_U,0.725541
Cabin_Class_U,Pclass,0.725541
Pclass,Fare,0.5495
Fare,Pclass,0.5495
Embarked_Q,Embarked_S,0.499421
Embarked_S,Embarked_Q,0.499421
Cabin_Class_U,Cabin_Class_C,0.488683
Cabin_Class_C,Cabin_Class_U,0.488683
Cabin_Class_U,Fare,0.482075
Fare,Cabin_Class_U,0.482075


In [1352]:
#The top 4 might be something to worry about but makes sense that they are the hot encoded features

In [1353]:
#In the General Purpose Template: Include the SimpleImputer process from Lec 21 of ML A-Z. It seems to work only if
#more than one variable. You can reshape you array to match the requirements - See the last line of the error

## Exploratory Analysis

# ?

In [1290]:
#Change this to a cluster Bar graph showing survived and dead by gender. Also is this the best way to print a bar graph?
categories = data['Sex'].value_counts().index
counts = data['Sex'].value_counts().values
plt.bar(categories, counts, width=0.5)

NameError: name 'data' is not defined

In [None]:
#Q:Looking at a Crosstab between Survived and Gender to test the expectation that greater percentage of females survived

pd.crosstab(train['Sex'], train['Survived'], margins = False, normalize= 'index')

In [None]:
#A: Males have a measly 19% probability of surviving while females have 74%. 

In [None]:
#Q: We could create the simplest model of 'All Females survived and All Males died' as below. 
#Of course, the accuracy would be terrible.

test1=test
test1['Prediction']=np.where(test1['Sex']=='female',1,0)

In [None]:
#Check whether PClass relates to Survival

train.groupby('Pclass').mean()

In [None]:
#63% probability of survival for Pclass 1 vs 24% for Pclass 3

In [None]:
#Find the Unique Values in Pclass with their Counts
train['Pclass'].value_counts(dropna=False)

In [None]:
#Find the Unique Values in Pclass with their Counts (alternative)
train.groupby('Pclass').count()

### Outliers

In [None]:
#Any data value that lies more than (1.5 * IQR) away from the Q1 and Q3 quartiles is considered an outlier. 
#Outliers are innocent until proven guilty. 
#With that being said, they should not be removed unless there is a good reason for that.

In [None]:
#Scatter Plot of Age and Fare to check if older people buy expensive tickets

plt.scatter(train['Age'], train['Fare'])
plt.xlabel('Age')
plt.ylabel('Fare')
plt.show()

In [None]:
#Show the outliers in Fare
train[train['Fare']>300]

In [None]:
train[train['Ticket']=='110152']

In [None]:
train[train['Cabin']=='B96 B98']

In [None]:
Counts[Counts['PassengerId']>1].sort_values(by='PassengerId', ascending=False)

## Preparing Data for ML

### Encoding Categorical Variables

In [1354]:
#Embarked and Cabin_Class already One Hot Encoded

In [1355]:
#Sex is binary - so Ordinal Encoding
# Using Feature Engine because it allows us to code multiple variables at a time (sklearn doesn't) and allows us to 
#view the dictionary later (pandas doesn't)

from feature_engine.encoding import OrdinalEncoder

In [1356]:
ordinal_enc = OrdinalEncoder(encoding_method='arbitrary', variables=['Sex'])

ordinal_enc.fit(train)

OrdinalEncoder(encoding_method='arbitrary', variables=['Sex'])

In [1357]:
# in the encoder dict we can observe the numbers
# assigned to each category for all the indicated variables

ordinal_enc.encoder_dict_

{'Sex': {'male': 0, 'female': 1}}

In [1358]:
# this is the list of variables that the encoder will transform

ordinal_enc.variables_

['Sex']

In [1359]:
#Transform Train
train = ordinal_enc.transform(train)

train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin_Class_U,Cabin_Class_B,Cabin_Class_E,Cabin_Class_A,Cabin_Class_C,Cabin_Class_D,Cabin_Class_F,Embarked_Q,Embarked_S
0,3,0,22.00000,1,0,7.2500,1,0,0,0,0,0,0,0,1
1,1,1,38.00000,1,0,71.2833,0,0,0,0,1,0,0,0,0
2,3,1,26.00000,0,0,7.9250,1,0,0,0,0,0,0,0,1
3,1,1,35.00000,1,0,53.1000,0,0,0,0,1,0,0,0,1
4,3,0,35.00000,0,0,8.0500,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,2,0,27.00000,0,0,13.0000,1,0,0,0,0,0,0,0,1
887,1,1,19.00000,0,0,30.0000,0,1,0,0,0,0,0,0,1
888,3,1,25.14062,1,2,23.4500,1,0,0,0,0,0,0,0,1
889,1,0,26.00000,0,0,30.0000,0,0,0,0,1,0,0,0,0


In [1360]:
#Transform Test using same Encoder object

test = ordinal_enc.transform(test)

test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin_Class_U,Cabin_Class_B,Cabin_Class_E,Cabin_Class_A,Cabin_Class_C,Cabin_Class_D,Cabin_Class_F,Embarked_Q,Embarked_S
0,3,0,34.50000,0,0,7.8292,1,0,0,0,0,0,0,1,0
1,3,1,47.00000,1,0,7.0000,1,0,0,0,0,0,0,0,1
2,2,0,62.00000,0,0,9.6875,1,0,0,0,0,0,0,1,0
3,3,0,27.00000,0,0,8.6625,1,0,0,0,0,0,0,0,1
4,3,1,22.00000,1,1,12.2875,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,3,0,29.87763,0,0,8.0500,1,0,0,0,0,0,0,0,1
414,1,1,39.00000,0,0,108.9000,0,0,0,0,1,0,0,0,0
415,3,0,38.50000,0,0,7.2500,1,0,0,0,0,0,0,0,1
416,3,0,29.87763,0,0,8.0500,1,0,0,0,0,0,0,0,1


## Feature Scaling

In [1361]:
#Skipping for now since only the predictions matter and don't have to interpret the coefficients

In [1362]:
test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin_Class_U,Cabin_Class_B,Cabin_Class_E,Cabin_Class_A,Cabin_Class_C,Cabin_Class_D,Cabin_Class_F,Embarked_Q,Embarked_S
0,3,0,34.50000,0,0,7.8292,1,0,0,0,0,0,0,1,0
1,3,1,47.00000,1,0,7.0000,1,0,0,0,0,0,0,0,1
2,2,0,62.00000,0,0,9.6875,1,0,0,0,0,0,0,1,0
3,3,0,27.00000,0,0,8.6625,1,0,0,0,0,0,0,0,1
4,3,1,22.00000,1,1,12.2875,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,3,0,29.87763,0,0,8.0500,1,0,0,0,0,0,0,0,1
414,1,1,39.00000,0,0,108.9000,0,0,0,0,1,0,0,0,0
415,3,0,38.50000,0,0,7.2500,1,0,0,0,0,0,0,0,1
416,3,0,29.87763,0,0,8.0500,1,0,0,0,0,0,0,0,1


## Logistic Regression

In [1363]:
#Convert to nparrays
nptrain=train.values
nptest=test.values
y_train=y_train.values

In [1364]:
#Import Class and create Instance
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter = 1000)

In [1365]:
model.fit(nptrain,y_train)

LogisticRegression(max_iter=1000)

In [1391]:
prediction=pd.DataFrame(model.predict(nptest))

In [1392]:
prediction

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,1
...,...
413,0
414,1
415,0
416,0


### Formatting Results for Submission

In [1393]:
result = pd.read_csv('test.csv')

In [1394]:
result=result['PassengerId']

In [1395]:
result=pd.DataFrame(result)
result

Unnamed: 0,PassengerId
0,892
1,893
2,894
3,895
4,896
...,...
413,1305
414,1306
415,1307
416,1308


In [1396]:
result['Survived']=prediction

In [1399]:
result

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [1415]:
#Make sure to select index=False when saving
pd.DataFrame(result).to_csv(r"C:\Users\poorv\Poorval\Analytics\Portfolio\Classification\Titanic\Submissions\1.submission.csv",index=False)