In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder,StandardScaler
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedGroupKFold

import string
import warnings
warnings.filterwarnings('ignore')

SEED = 42


In [2]:
def concat_df(train_data, test_data):
    return pd.concat([train_data, test_data], sort=True).reset_index(drop=True)
# This code combines the two DataFrames (train_data and test_data) vertically, stacking the rows.
# sort=True arranges the columns in alphabetical order in the resulting DataFrame.
# reset_index(drop=True) resets the indices to avoid duplicates and creates a continuous index.

def divide_df(all_data):
    return all_data[:890], all_data[891:].drop(['Survived'], axis=1)
# Here, the combined DataFrame (all_data) is split into two again:
# - all_data[:890]: selects rows from 0 to 890 (training set).
# - all_data[891:]: selects rows from 891 onwards (test set) and removes the 'Survived' column.
# The parameter axis=1 indicates that the removal is done on columns.

df_train = pd.read_csv('docs/train.csv')
df_test = pd.read_csv('docs/test.csv')
df_all = concat_df(df_train, df_test)

df_train.name = 'Training Set'
df_test.name = 'Test Set'
df_all.name = 'All Set'

dfs = [df_train, df_test]

print('Number of Training Examples = {}'.format(df_train.shape[0]))
print('Number of Test Examples = {}\n'.format(df_test.shape[0]))
print('Training X Shape = {}'.format(df_train.shape))
print('Training y Shape = {}\n'.format(df_train['Survived'].shape[0]))
print('Test X Shape = {}'.format(df_test.shape))
print('Test y Shape = {}\n'.format(df_test.shape[0]))
print(df_train.columns)
print(df_test.columns)

Number of Training Examples = 891
Number of Test Examples = 418

Training X Shape = (891, 12)
Training y Shape = 891

Test X Shape = (418, 11)
Test y Shape = 418

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


PassengerId is the only column that will not affect the target (survival prediction model). It is just an identifier and has no predictive power for survival.

Survived is the target variable we will use for prediction:

0 = Not Survived
1 = Survived
Pclass (Passenger Class) represents the socio-economic status of the passenger and is categorized into 3 levels:

1 = Upper Class
2 = Middle Class
3 = Lower Class
SibSp is the total number of siblings and spouses aboard the Titanic for each passenger.

Parch is the total number of parents and children aboard the Titanic for each passenger.

Ticket is the ticket number assigned to the passenger. This could be used as a unique identifier for the passenger but doesn't typically contain predictive power directly.

Fare is the amount paid by the passenger for their ticket. This is a continuous variable that could help in predicting survival, as wealthier passengers may have had better chances of survival.

Cabin is the number of the cabin assigned to the passenger. Many values in this column may be missing or incomplete, but it can still be valuable if cleaned and encoded appropriately.

Embarked is the port where the passenger boarded the Titanic, categorized into 3 unique values:

C = Cherbourg
Q = Queenstown
S = Southampton

In [8]:
print(df_train.info())
df_train.sample(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
266,267,0,3,"Panula, Mr. Ernesti Arvid",male,16.0,4,1,3101295,39.6875,,S
276,277,0,3,"Lindblom, Miss. Augusta Charlotta",female,45.0,0,0,347073,7.75,,S
135,136,0,2,"Richard, Mr. Emile",male,23.0,0,0,SC/PARIS 2133,15.0458,,C


In [9]:
print(df_test.info())
df_test.sample(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.1+ KB
None


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
320,1212,3,"Andersson, Mr. Johan Samuel",male,26.0,0,0,347075,7.775,,S
260,1152,3,"de Messemaeker, Mr. Guillaume Joseph",male,36.5,1,0,345572,17.4,,S
203,1095,2,"Quick, Miss. Winifred Vera",female,8.0,1,1,26360,26.0,,S


As we can see, some columns in the dataset have missing values:

In the training dataset, the columns with missing values are Cabin, Age, and Embarked.
In the test dataset, the columns with missing values are Cabin, Age, and Fare.
It is often convenient to work with the concatenated training and test datasets while handling missing values. This approach is preferred because it allows for consistency when filling missing values across both datasets. If you handle missing values separately for each dataset, the filled data might become overly tailored (or overfitted) to the training or test set.

Missing values in Age, Embarked, and Fare are less problematic because the number of missing values is relatively small compared to the total dataset. These columns can be filled with statistical measures (such as the median or mean).

However, Cabin has around 80% of its data missing. This amount of missing data cannot be reasonably filled with simple statistical measures like the median or mean. More complex methods, such as imputation based on similar records, or even dropping the column entirely, may be required.

So, the strategy for filling missing values depends on the column and the extent of missing data:

For columns with a small proportion of missing data, filling with the mean, median, or mode is a viable option.
For columns with a high percentage of missing values (like Cabin), alternative approaches such as predictive modeling or excluding the column may be more appropriate.

In [11]:
def display_missing(df):  # Defines the function that takes a DataFrame as an argument.
    for col in df.columns.tolist():  # Iterates through all columns of the DataFrame, converting df.columns to a list of column names.
        print('{} columns missing values: {}'.format(col, df[col].isnull().sum()))  
        # Prints the column name and the number of missing values in that column (using isnull() to check for null values and .sum() to count the nulls).
        print('\n')  # Prints a blank line to separate the results for each column.

for df in dfs:  # Loop that iterates through all the DataFrames in the 'dfs' list.
    print('{}'.format(df.name))  # Prints the name of the current DataFrame (assuming 'df.name' has been defined earlier).
    display_missing(df)  # Calls the display_missing function to check and show the missing values for each column of the DataFrame.

Training Set
PassengerId columns missing values: 0


Survived columns missing values: 0


Pclass columns missing values: 0


Name columns missing values: 0


Sex columns missing values: 0


Age columns missing values: 177


SibSp columns missing values: 0


Parch columns missing values: 0


Ticket columns missing values: 0


Fare columns missing values: 0


Cabin columns missing values: 687


Embarked columns missing values: 2


Test Set
PassengerId columns missing values: 0


Pclass columns missing values: 0


Name columns missing values: 0


Sex columns missing values: 0


Age columns missing values: 86


SibSp columns missing values: 0


Parch columns missing values: 0


Ticket columns missing values: 0


Fare columns missing values: 1


Cabin columns missing values: 327


Embarked columns missing values: 0




In [3]:
# Create a copy of the original DataFrame to preserve the original data and avoid direct changes
df_copy = df_all.copy()

# Filter the columns of the DataFrame to include only numeric columns (excluding strings, objects, etc.)
df_copy = df_copy.select_dtypes(include=[np.number])

# Calculate the absolute correlation between all numeric columns, organize the correlation values into column pairs
# and sort them in descending order of correlation
df_all_corr = df_copy.corr().abs().unstack().sort_values(kind="quicksort", ascending=False).reset_index()

# Rename the columns of the correlation DataFrame to make it more readable
# 'level_0' and 'level_1' become 'Feature 1' and 'Feature 2', and the correlation is called 'Correlation Coefficient'
df_all_corr.rename(columns={"level_0": "Feature 1", "level_1": "Feature 2", 0: 'Correlation Coefficient'}, inplace=True)

# Filter the DataFrame to show only the correlations where "Feature 1" is the 'Age' column
df_all_corr[df_all_corr['Feature 1'] == 'Age']

Unnamed: 0,Feature 1,Feature 2,Correlation Coefficient
0,Age,Age,1.0
10,Age,Pclass,0.408106
17,Age,SibSp,0.243699
22,Age,Fare,0.17874
25,Age,Parch,0.150917
29,Age,Survived,0.077221
41,Age,PassengerId,0.028814


In [4]:
# Calculate the median age for each combination of 'Sex' and 'Pclass'
age_by_pclass_sex = df_all.groupby(['Sex', 'Pclass'])['Age'].median()

# Iterate over combinations of 'Pclass' and 'Sex' to display the medians
for pclass in range(1, 4):
    for sex in ['female', 'male']:
        print(f'Median age of Pclass {pclass} {sex}s: {age_by_pclass_sex.loc[sex, pclass]}')
print(f'Median age of all passengers: {df_all["Age"].median()}')

# Fill the missing values in 'Age' with the median age from each 'Sex' and 'Pclass' group
df_all['Age'] = df_all.groupby(['Sex', 'Pclass'])['Age'].transform(lambda x: x.fillna(x.median()))



Median age of Pclass 1 females: 36.0
Median age of Pclass 1 males: 42.0
Median age of Pclass 2 females: 28.0
Median age of Pclass 2 males: 29.5
Median age of Pclass 3 females: 22.0
Median age of Pclass 3 males: 25.0
Median age of all passengers: 28.0


Embarked is a categorical feature, and there are only 2 missing values in the entire dataset. Both passengers are female, from the upper class, and have the same ticket number. This means that they know each other and boarded together at the same port. The mode of embarkation for an upper-class female passenger is C (Cherbourg), but this does not necessarily mean that she boarded at that port.

In [26]:
df_all[df_all['Embarked'].isnull()]

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
61,38.0,B28,,80.0,"Icard, Miss. Amelie",0,62,1,female,0,1.0,113572
829,62.0,B28,,80.0,"Stone, Mrs. George Nelson (Martha Evelyn)",0,830,1,female,0,1.0,113572


In [27]:
df_all[df_all['Fare'].isnull()]

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
1043,60.5,,S,,"Storey, Mr. Thomas",0,1044,3,male,0,,3701


In [35]:
med_fare = df_all.groupby(['Pclass','SibSp','Parch']).Fare.median()[3][0][0]

df_all['Fare'] = df_all['Fare'].fillna(med_fare)

In [5]:
# Check for missing values in the 'Embarked' column after imputing
df_all['Fare'].isnull().sum()

np.int64(1)