<a href="https://www.kaggle.com/code/miriamodeyianypeter/titanic-machine-learning-from-disaster?scriptVersionId=144136839" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Importing the Necessary Libraries**

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sklearn.metrics import classification_report, accuracy_score, top_k_accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier

# **Import the dataset and Data Cleaning**

In [None]:
# Load the two Titanic competition files from the Kaggle input directory.
train = pd.read_csv("/kaggle/input/titanic/train.csv")
test = pd.read_csv("/kaggle/input/titanic/test.csv")

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# The test data has no "Survived" column: that is the target our model will predict.
test.head()

In [None]:
# Preview the last rows of the training data.
train.tail()

In [None]:
# Preview the last rows of the test data.
test.tail()

In [None]:
# Column dtypes and non-null counts for the training data.
train.info()

In [None]:
# Column dtypes and non-null counts for the test data.
test.info()

In [None]:
# Number of missing values per column in the training data.
train.isnull().sum()

In [None]:
# Number of missing values per column in the test data.
test.isnull().sum()

From the above we can see that both our train and test data have some missing values. We will take care of them before making our visualizations.

# **Missing Values**

In [None]:
# Numerical columns: impute missing values with the column mean.
# The filled column is assigned back to the frame instead of calling
# fillna(inplace=True) on the selected column: that form is chained assignment,
# which pandas warns about and which leaves the frame unchanged under copy-on-write.
train["Age"] = train["Age"].fillna(train["Age"].mean())
test["Age"] = test["Age"].fillna(test["Age"].mean())
test["Fare"] = test["Fare"].fillna(round(test["Fare"].mean(), 2))

# Categorical columns with missing values in both data sets.
train_cat = ["Cabin", "Embarked"]
test_cat = ["Cabin", "Embarked"]

# Impute the missing categorical values with the most frequent value (the mode).
for col in train_cat:
    train[col] = train[col].fillna(train[col].mode()[0])

for col in test_cat:
    test[col] = test[col].fillna(test[col].mode()[0])

# Confirm that neither data set has missing values left.
# DataFrame.info() prints its own report and returns None, so it is not wrapped in print().
train.info()
test.info()

# **Data Cleaning**

In [None]:
# Apply the same three clean-up steps to each data set:
#   * Age  -> integer dtype
#   * Fare -> rounded to 2 decimal places
#   * Sex  -> renamed to Gender
train = (
    train
    .assign(
        Age=lambda frame: frame["Age"].astype(int),
        Fare=lambda frame: frame["Fare"].round(2),
    )
    .rename(columns={'Sex': 'Gender'})
)
test = (
    test
    .assign(
        Age=lambda frame: frame["Age"].astype(int),
        Fare=lambda frame: frame["Fare"].round(2),
    )
    .rename(columns={'Sex': 'Gender'})
)

In [None]:
# Preview the training data after the clean-up steps.
train.head()

In [None]:
# Number of distinct values in the Parch column.
train["Parch"].nunique()

# **Data Visualizations**

The train data will be used for the data visualizations

**Univariate Analysis**

In [None]:
# Survival distribution: 1 means the passenger survived, 0 means they didn't.
survived_counts = (
    train['Survived']
    .value_counts()
    .rename_axis('Survived')
    .reset_index(name='count')
)

fig = px.pie(
    survived_counts,
    names='Survived',
    values='count',
    hole=0.2,
    title="Survival Rate",
    color_discrete_sequence=px.colors.sequential.RdBu,
).update_traces(textinfo='percent+label', pull=0.05)

fig.show()

In [None]:
# Gender distribution
gender = train["Gender"].value_counts()
# Plain Python lists of the category labels and their counts.
gender_labels, gender_values = gender.index.tolist(), gender.tolist()

gender_pie = go.Pie(labels=gender_labels, values=gender_values, pull=[0, 0.2])
fig = go.Figure(data=[gender_pie])
fig.update_layout(title_text="Gender Distribution")
fig.show()

In [None]:
# Age distribution: number of passengers at each (integer) age.
plt.figure(figsize = (18,6))
age = train["Age"].value_counts().reset_index()
# Name the columns explicitly, as the survival-distribution cell does.
# value_counts().reset_index() yields ["Age", "count"] on pandas 2.x but
# ["index", "Age"] on older versions, so relying on the defaults is fragile.
age.columns = ["Age", "count"]
sns.barplot(data = age, x = "Age", y = "count")
plt.xlabel("Age")
plt.xticks(rotation = 90)
plt.ylabel("Count")
plt.title("Age Distribution");

**Bivariate Analysis**

In [None]:
# Pairwise relationships between the columns of the training data.
sns.pairplot(data=train)
plt.tight_layout();

In [None]:
# Regression plot of Fare against Age.
# sns.lmplot is a figure-level function that creates its own figure, so a
# preceding plt.figure(figsize=...) only leaves an empty extra figure behind.
# The 8x6 size is requested through lmplot's own height/aspect parameters.
sns.lmplot(x= "Age", y = "Fare", data= train, height = 6, aspect = 8 / 6)
plt.title("Relationship between Age and Fare")
plt.tight_layout();

In [None]:
# 3D scatter plot of Age, Fare and Survived, with points coloured by Gender.
gender_colours = {'male': 'yellow', 'female': 'brown'}
fig = px.scatter_3d(
    train,
    x='Age', y='Fare', z='Survived',
    color='Gender',
    color_discrete_map=gender_colours,
    title="3D Scatter Plot of Age, Fare, and Survival Status by Gender",
)
fig.show();

In [None]:
# Ternary plot of Pclass, SibSp and Parch; colour encodes Gender, marker size encodes Fare.
fig = px.scatter_ternary(
    train,
    a="Pclass",
    b="SibSp",
    c="Parch",
    color="Gender",
    size="Fare",
).update_layout(title_text="Pclass vs SibSp and Parch based on Gender and Fare")
fig.show()

In [None]:
# Annotated correlation heatmap of the numeric columns.
corr_matrix = train.corr(numeric_only=True)
corr_cmap = sns.cubehelix_palette(as_cmap=True)
sns.heatmap(corr_matrix, annot=True, cmap=corr_cmap);

# **Model Building**

In [None]:
# Baseline classifier. A fixed random_state makes the forest, and the scores
# reported from it, reproducible across "Restart & Run All".
model = RandomForestClassifier(random_state=42)

# Target variable
y = train["Survived"]

# Columns left out of the feature matrix for both data sets.
dropped_columns = ["Name", "Cabin", "Embarked", "Ticket", "SibSp", "Parch", "Age"]

# Feature variables. PassengerId is still present at this point; the
# grid-search cell drops it before fitting.
df_X = train.drop(columns=["Survived"] + dropped_columns)
df_X_test = test.drop(columns=dropped_columns)

# One-hot encode the categorical variables.
X = pd.get_dummies(df_X).astype(float)

# Encode the test set the same way, then align it to the training columns so a
# category present in only one of the two files cannot change the number or
# order of the features passed to model.predict().
X_test = (
    pd.get_dummies(df_X_test)
    .astype(float)
    .reindex(columns=X.columns, fill_value=0.0)
)

X.head()

**Model fitting without hyperparameter Tuning**

In [None]:
# Fit the baseline model on the full training features.
model.fit(X, y)

# Predict on the encoded test features.
y_pred = model.predict(X_test)

# Evaluate with 5-fold cross-validated accuracy and show the per-fold scores.
cross_val_scores = cross_val_score(estimator=model, X=X, y=y, scoring='accuracy', cv=5)
print("Cross-Validation Scores:", cross_val_scores)

# Summarise the folds by their mean and standard deviation.
mean_accuracy, std_deviation = cross_val_scores.mean(), cross_val_scores.std()
print(f"Mean Accuracy: {mean_accuracy:.2f}")
print(f"Standard Deviation: {std_deviation:.2f}")

Without hyperparameter tuning we have a mean accuracy of 0.78. Now let us try tuning the model with GridSearchCV.

**Using GridSearchCV for Hyperparameter Tuning**

In [None]:
# Split the training features into a fitting part and a small hold-out part.
# PassengerId is dropped first, so it is not used as a feature here.
# random_state pins the split so the hold-out accuracy is reproducible.
# NOTE(review): this rebinds X_test, which until now held the encoded Kaggle
# test features, and test_size=0.01 leaves only a handful of rows for
# evaluation — confirm both are intended.
X_train, X_test, y_train, y_test = train_test_split(
    X.drop(["PassengerId"], axis = 1), y, test_size = 0.01, random_state = 42
)

# Parameter grid for the grid search: 5 * 5 * 4 = 100 candidate settings.
param_grid = [{
    "n_estimators": [15, 17, 20, 100, 300],
    "max_depth": [None,5, 6, 8, 10],
    "min_samples_split": [2,4,6,8]
}]

# Instantiate the grid search. cv=5 spells out the number of folds, and
# n_jobs=-1 runs the candidate fits in parallel on all available cores.
grid_search = GridSearchCV(
    estimator = model,
    param_grid = param_grid,
    scoring = 'accuracy',
    cv = 5,
    n_jobs = -1,
    return_train_score = True,
)

# Fit the data
grid_search.fit(X_train, y_train)

In [None]:
# Alternative parameter grid.
# NOTE(review): this is assigned after grid_search has already been fitted and
# no later visible cell reads it — confirm whether it is still needed.
param_grid = [
    dict(
        n_estimators=[10, 15, 20, 100, 300],
        max_depth=[None, 5, 10, 20, 30],
        min_samples_split=[2, 5, 10],
    )
]

In [None]:
# Retrieve the best estimator found by the grid search and display it.
best_model = grid_search.best_estimator_
best_model

In [None]:
# Score the tuned model on the hold-out split created by train_test_split
# (X_test / y_test here are that split, not the Kaggle test file).
test_accuracy = best_model.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

Now let us use the tuned model to make predictions on the test data we were given.

In [None]:
# Rebuild the feature frame for the Kaggle test data, dropping the same columns as before.
unused_columns = ["Name", "Cabin", "Embarked", "Ticket", "SibSp", "Parch", "Age"]
df_X_test_1 = test.drop(columns=unused_columns)

# One-hot encode the categorical variables.
X_test_1 = pd.get_dummies(df_X_test_1).astype(float)

# PassengerId is an identifier, not a feature, so remove it before predicting.
test_data = X_test_1.drop(columns="PassengerId")

# Predict with the tuned model and convert the result to a plain Python list.
y_preds = grid_search.predict(test_data).tolist()


In [None]:
# Pair each test PassengerId with its predicted label and write the submission file.
output = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': y_preds})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")
output.head()

# **Feature Importance**

In [None]:
# Feature importances of the tuned forest selected by the grid search, which is
# the model behind the submission. The untuned `model` was fitted with
# PassengerId among its features, so its importances would include a row identifier.
feature_importances = best_model.feature_importances_

# best_model was fitted on X_train, so X_train's columns line up one-to-one
# with the importance scores.
feature_names = list(X_train.columns)
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})

# Sort the DataFrame by importance, most important first.
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Horizontal bar plot of the importance scores.
plt.figure(figsize=(18, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df, palette = "mako")
plt.title('Feature Importance')
plt.xlabel('Importance Score')