<a href="https://colab.research.google.com/github/SkyChen1009/ML-Titanic-Survival-Prediction-with-Random-Forest/blob/main/Titanic%20Survival%20Prediction%20with%20Random%20Forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import libraries for linear algebra and data processing

In [None]:
import numpy as np  # For numerical operations on arrays
import pandas as pd  # For data manipulation and analysis

## Import libraries for data visualization

In [None]:
import seaborn as sns  # For advanced visualizations
%matplotlib inline
from matplotlib import pyplot as plt  # For plotting data
from matplotlib import style

## Import machine learning algorithms from sklearn

In [None]:
from sklearn import linear_model  # For linear regression models
from sklearn.linear_model import LogisticRegression  # For logistic regression
from sklearn.ensemble import RandomForestClassifier  # For random forest classifier
from sklearn.linear_model import Perceptron  # For Perceptron algorithm
from sklearn.linear_model import SGDClassifier  # For stochastic gradient descent
from sklearn.tree import DecisionTreeClassifier  # For decision tree classifier
from sklearn.neighbors import KNeighborsClassifier  # For k-nearest neighbors
from sklearn.svm import SVC, LinearSVC  # For support vector machines
from sklearn.naive_bayes import GaussianNB  # For naive bayes

# Google Drive Setup

### Mount Google Drive to access data:

In [None]:
# Mount the user's Google Drive at /content/drive so the CSV files read in
# the following cell are reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Load Data

### Load the training and testing datasets:

In [None]:
# Read the labelled training set and the unlabelled test set from the
# mounted Google Drive folder.
train_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/train.csv")
test_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/test.csv")

## Check for missing data
### Summarize the total and percentage of missing data for each column

In [None]:
# Build a per-column summary of missing values in the training set:
# absolute count ('Total') and share of rows in percent ('%').
null_mask = train_df.isnull()
null_counts = null_mask.sum()
total = null_counts.sort_values(ascending=False)
percent_1 = null_counts / null_mask.count() * 100
percent_2 = percent_1.round(1).sort_values(ascending=False)
missing_data = pd.concat([total, percent_2], axis=1, keys=['Total', '%'])
# Show the five columns with the most missing entries.
missing_data.head(5)

### Display column names for reference

In [None]:
# Display the training frame's column labels as an array.
train_df.columns.values

## Visualize survival by gender and age using subplots

In [None]:
# Legend labels reused by the per-sex age histograms drawn in later cells.
survived, not_survived = 'survived', 'not survived'

# Split the training passengers by sex.
sex_column = train_df['Sex']
women = train_df[sex_column == 'female']
men = train_df[sex_column == 'male']

# One row with two side-by-side axes: axes[0] and axes[1] are filled in by
# the following cells.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

## Plot age distribution for women based on survival

In [None]:
# Age histograms for female passengers, split by survival, drawn on the
# left axes of the figure created in the setup cell.
# `sns.distplot` is deprecated; `sns.histplot` is its replacement for plain
# histograms. alpha=0.4 keeps the overlapping bars readable.
female_survived_age = women.loc[women['Survived'] == 1, 'Age'].dropna()
female_died_age = women.loc[women['Survived'] == 0, 'Age'].dropna()
ax = sns.histplot(female_survived_age, bins=18, label=survived, ax=axes[0], kde=False, alpha=0.4)
ax = sns.histplot(female_died_age, bins=40, label=not_survived, ax=axes[0], kde=False, alpha=0.4)
ax.legend()
ax.set_title('Female')
# The figure was created in an earlier cell, so the inline backend will not
# re-render it automatically; evaluating `fig` last displays the updated plot.
fig

## Plot age distribution for men based on survival

In [None]:
# Age histograms for male passengers, split by survival, drawn on the
# right axes of the figure created in the setup cell.
# `sns.distplot` is deprecated; `sns.histplot` is its replacement for plain
# histograms. alpha=0.4 keeps the overlapping bars readable.
male_survived_age = men.loc[men['Survived'] == 1, 'Age'].dropna()
male_died_age = men.loc[men['Survived'] == 0, 'Age'].dropna()
ax = sns.histplot(male_survived_age, bins=18, label=survived, ax=axes[1], kde=False, alpha=0.4)
ax = sns.histplot(male_died_age, bins=40, label=not_survived, ax=axes[1], kde=False, alpha=0.4)
ax.legend()
_ = ax.set_title('Male')
# The figure was created in an earlier cell, so the inline backend will not
# re-render it automatically; evaluating `fig` last displays the updated plot.
fig

## FacetGrid

## Create a FacetGrid to plot survival rate by 'Pclass' and 'Sex', with each row representing different embarkation points

In [None]:
# One facet row per embarkation port; each facet plots the survival rate
# against passenger class with one line per sex.
# NOTE(review): the variable is named like the seaborn class it holds; the
# name is kept in case other cells refer to it.
FacetGrid = sns.FacetGrid(train_df, row='Embarked', aspect=1.6)

# Fix the category orders explicitly. With order/hue_order left as None each
# facet infers them from its own subset of rows, so the x positions and the
# hue assignment can differ between facets.
pclass_order = sorted(train_df['Pclass'].dropna().unique())
sex_order = sorted(train_df['Sex'].dropna().unique())
FacetGrid.map(sns.pointplot, 'Pclass', 'Survived', 'Sex', palette=None,
              order=pclass_order, hue_order=sex_order)

# Add a legend so the hue (sex) of each line can be identified.
FacetGrid.add_legend()

## Plot survival rate by passenger class (Pclass) as a bar plot

In [None]:
# Bar height is the mean of 'Survived' (i.e. the survival rate) per class.
sns.barplot(data=train_df, x='Pclass', y='Survived')

## Create a FacetGrid to show the age distribution, with columns by survival status and rows by passenger class (Pclass)

In [None]:
# Grid of age histograms: one row per passenger class, one column per
# survival outcome.
grid = sns.FacetGrid(data=train_df, row='Pclass', col='Survived', aspect=1.6)
# Semi-transparent 20-bin histogram of 'Age' in every facet.
hist_options = {'alpha': .5, 'bins': 20}
grid.map(plt.hist, 'Age', **hist_options)
grid.add_legend()

## Encode family-related features
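### Add a 'relatives' column (sum of SibSp and Parch), and set 'not_alone' to 1 for passengers with no relatives and 0 otherwise (note: despite its name, 'not_alone' = 1 means the passenger travelled alone)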

In [None]:
data = [train_df, test_df]
for frame in data:
    # Family members aboard = siblings/spouses + parents/children.
    family_size = frame['SibSp'] + frame['Parch']
    frame['relatives'] = family_size
    # NOTE(review): the flag is 1 when the passenger has NO relatives aboard
    # and 0 otherwise, i.e. the opposite of what the name suggests.
    frame.loc[family_size > 0, 'not_alone'] = 0
    frame.loc[family_size == 0, 'not_alone'] = 1
    frame['not_alone'] = frame['not_alone'].astype(int)
# Count how many training passengers fall into each flag value.
train_df['not_alone'].value_counts()

In [None]:
# Remove 'PassengerId' from the training frame only; the test frame keeps it.
train_df = train_df.drop(columns=['PassengerId'])

## Map cabin decks to numerical values, filling missing 'Cabin' data with 'U0' (unknown)

In [None]:
import re

# Numeric code for each deck letter; "U" stands for an unknown cabin.
deck = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "U": 8}
data = [train_df, test_df]
# First run of letters in a cabin string, e.g. "C" in "C85".
deck_letters = re.compile("([a-zA-Z]+)")

for frame in data:
    # Missing cabins become "U0" so the letter extraction always matches.
    frame['Cabin'] = frame['Cabin'].fillna("U0")
    letters = frame['Cabin'].map(lambda cabin: deck_letters.search(cabin).group())
    # Letters absent from `deck` map to NaN and are then coded as 0.
    frame['Deck'] = letters.map(deck).fillna(0).astype(int)

# 'Cabin' is no longer needed once 'Deck' has been derived from it.
train_df = train_df.drop(columns=['Cabin'])
test_df = test_df.drop(columns=['Cabin'])

## Handle missing values in the 'Age' column by replacing them with random integers drawn from within one standard deviation of the mean age

In [None]:
data = [train_df, test_df]

# Compute the imputation range once, from the training set only, before any
# values are filled in. Using training statistics for both frames keeps the
# mean and the standard deviation consistent with each other.
age_mean = train_df["Age"].mean()
age_std = train_df["Age"].std()
# np.random.randint expects integer bounds: [low, high).
age_low = int(age_mean - age_std)
age_high = int(age_mean + age_std)

for dataset in data:
    missing_age = dataset["Age"].isnull()
    # One random integer age per missing entry (none when nothing is missing).
    # NOTE: the generator is not seeded, so results differ between runs.
    rand_age = np.random.randint(age_low, age_high, size=missing_age.sum())
    age_filled = dataset["Age"].copy()
    age_filled[missing_age] = rand_age
    # Store this dataset's own filled ages. The previous code assigned
    # train_df["Age"] here, which replaced the test set's ages with
    # training-set ages.
    dataset["Age"] = age_filled.astype(int)
# Sanity check: should be 0 after imputation.
train_df["Age"].isnull().sum()

In [None]:
# Summary statistics for 'Embarked' in the training set; presumably used to
# pick the fill value for missing ports in the next cell.
train_df['Embarked'].describe()

## Replace missing 'Embarked' values with the most common value ('S')

In [None]:
# Port used to fill missing 'Embarked' entries.
common_value = 'S'
data = [train_df, test_df]

for frame in data:
    port = frame['Embarked']
    # Keep existing ports; substitute the common value where the port is missing.
    frame['Embarked'] = port.where(port.notna(), common_value)

In [None]:
# Print the training frame's summary (columns, non-null counts, dtypes) to
# review the state of the data after the cleaning steps so far.
train_df.info()

## Fill missing 'Fare' values and convert 'Fare' to integers in both the training and test datasets

In [None]:
data = [train_df, test_df]

for frame in data:
    # Treat a missing fare as 0, then truncate fares to whole numbers.
    frame['Fare'] = frame['Fare'].fillna(0).astype(int)

## Extract passenger titles from 'Name' and map them to numerical codes

In [None]:
data = [train_df, test_df]
# Numeric code for each title group; 0 is used for anything unrecognised.
titles = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
# Uncommon titles that are all grouped under "Rare".
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
               'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
# Spelling variants folded into their common equivalent.
title_aliases = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}

for dataset in data:
    # The title is the word that follows a space and ends with a period.
    # A raw string is required: '\.' is an invalid escape in a normal string
    # literal and triggers a warning on recent Python versions.
    extracted = dataset['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    extracted = extracted.replace(rare_titles, 'Rare').replace(title_aliases)
    # Encode as integers; unmapped or missing titles become 0. The cast keeps
    # the column integer-typed like the other encoded features.
    dataset['Title'] = extracted.map(titles).fillna(0).astype(int)

## Drop the 'Name' column since title information has been extracted and processed

In [None]:
# 'Name' is no longer needed once 'Title' has been derived from it.
train_df = train_df.drop(columns=['Name'])
test_df = test_df.drop(columns=['Name'])

## Encode 'Sex' feature as 0 for male and 1 for female

In [None]:
# Numeric codes for the 'Sex' column.
genders = dict(male=0, female=1)
data = [train_df, test_df]

for frame in data:
    encoded_sex = frame['Sex'].map(genders)
    frame['Sex'] = encoded_sex

In [None]:
# Summary statistics for 'Ticket' in the training set, inspected before the
# column is dropped in the next cell.
train_df['Ticket'].describe()

## Drop the 'Ticket' feature from both datasets

In [None]:
# Remove the 'Ticket' column from both frames.
train_df = train_df.drop(columns=['Ticket'])
test_df = test_df.drop(columns=['Ticket'])

In [None]:
# Numeric codes for the embarkation ports.
ports = dict(S=0, C=1, Q=2)
data = [train_df, test_df]

for frame in data:
    encoded_port = frame['Embarked'].map(ports)
    frame['Embarked'] = encoded_port

In [None]:
data = [train_df, test_df]
# Right-inclusive age bands: (-inf, 11] -> 0, (11, 18] -> 1, (18, 22] -> 2,
# (22, 27] -> 3, (27, 33] -> 4, (33, 40] -> 5, (40, inf) -> 6.
# NOTE(review): ages above 66 share code 6 with the 41-66 band, exactly as
# before; confirm whether a separate top band was intended.
age_edges = [float('-inf'), 11, 18, 22, 27, 33, 40, float('inf')]
for frame in data:
    whole_years = frame['Age'].astype(int)
    # labels=False yields the integer index of the band each age falls in.
    frame['Age'] = pd.cut(whole_years, bins=age_edges, labels=False).astype(int)

# Let's see how the age groups are distributed: `train_df['Age'].value_counts()`

In [None]:
# Show the first ten training rows to inspect the result of the steps above.
train_df.head(10)

In [None]:
data = [train_df, test_df]
# Right-inclusive fare bands: (-inf, 7.91] -> 0, (7.91, 14.454] -> 1,
# (14.454, 31] -> 2, (31, 99] -> 3, (99, 250] -> 4, (250, inf) -> 5.
fare_edges = [float('-inf'), 7.91, 14.454, 31, 99, 250, float('inf')]

for frame in data:
    # labels=False yields the integer index of the band each fare falls in.
    frame['Fare'] = pd.cut(frame['Fare'], bins=fare_edges, labels=False).astype(int)

In [None]:
data = [train_df, test_df]
for frame in data:
    # Interaction feature: product of the 'Age' and 'Pclass' columns.
    frame['Age_Class'] = frame['Pclass'] * frame['Age']

In [None]:
# Relies on `data` (the list of train/test frames) defined by an earlier cell.
for frame in data:
    # The passenger plus their relatives aboard.
    party_size = frame['relatives'] + 1
    per_person = frame['Fare'] / party_size
    # Truncate the per-person fare value to an integer.
    frame['Fare_Per_Person'] = per_person.astype(int)
# Final look at the training set before the models are trained.
train_df.head(10)

## Prepare the data for model training
### Separate the features (X_train) and target variable (Y_train)

In [None]:
# Target vector: the 'Survived' column of the training frame.
Y_train = train_df["Survived"]
# Training features: every remaining training column.
X_train = train_df.drop(columns="Survived")
# Test features: an independent copy of the test frame without 'PassengerId'.
X_test = test_df.drop(columns="PassengerId").copy()