In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the Titanic training CSV into `df`, the DataFrame used by the later cells.
df = pd.read_csv(filepath_or_buffer='../input/titanic/train.csv')

# Exploratory Data Analysis

In [None]:
# Preview the first five rows
df.head(n=5)

In [None]:
# Print the per-column summary (dtype and non-null count) for the loaded frame
df.info()

In [None]:
# Frequency of each distinct Cabin value (value_counts skips missing entries by default)
df['Cabin'].value_counts()

In [None]:
# Number of missing values in each column
df.isna().sum()

## Age Visualizations

In [None]:
# Histogram of passenger ages, drawn through the Axes object returned by .hist()
age_ax = df['Age'].hist(grid=False)
age_ax.set_title("Age Distribution of Titanic Passengers")
age_ax.set_xlabel("Age")
age_ax.set_ylabel("Frequency");

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for Age
df['Age'].describe()

In [None]:
# Fill missing values of the Age column by the median.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the df['Age'] selection: the in-place form is a
# chained assignment that warns in pandas 2.x and does not update df when
# Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(df['Age'].median())

In [None]:
# Histogram of Age again, now that missing values have been filled
imputed_age_ax = df['Age'].hist(grid=False)
imputed_age_ax.set_title("Age Distribution of Titanic Passengers After Imputation")
imputed_age_ax.set_xlabel("Age")
imputed_age_ax.set_ylabel("Frequency");

In [None]:
# Single box-and-whisker summary of the Age column
age_box_ax = sns.boxplot(data=df, y='Age')
age_box_ax.set_title("Age Boxplot");

In [None]:
# Pclass codes: 1 = Upper Class, 2 = Middle Class, 3 = Lower Class
# Compare the spread of Age across the passenger classes on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, x='Pclass', y='Age', palette='Set2', ax=ax)
ax.set_title("Age Distribution by Passenger Class", fontsize=16)
ax.set_xlabel("Passenger Class", fontsize=14)
ax.set_ylabel("Age", fontsize=14)
plt.show()

In [None]:
# Bucket passengers into age groups.
# The Adult band is bounded with `< 60` (not `<= 59`) so that fractional ages
# between 59 and 60 are classified instead of falling through to 'Unknown'.
age = df['Age']
conditions = [
    age < 18,
    (age >= 18) & (age < 60),
    age >= 60,
]
choices = ['Child', 'Adult', 'Senior']
# Rows matching none of the conditions (e.g. a missing Age, for which every
# comparison is False) receive the default label 'Unknown'.
df['Age_Group'] = np.select(conditions, choices, default='Unknown')

In [None]:
# Plot to show the distribution of age groups (Child/Adult/Senior) among those who Survived/Died
# The bars are grouped by the Age_Group column and coloured by the Survived column.
sns.countplot(x="Age_Group", hue="Survived", data=df,order=['Child', 'Adult', 'Senior'])
plt.xticks(ticks=[0, 1,2], labels=['Child','Adult', 'Senior'])
# NOTE(review): legend labels are assigned by position and assume the hue order is 0 then 1 - confirm
plt.legend(title='Survival Status', labels=['Did not survive', 'Survived'])
plt.xlabel("Age Group")
plt.ylabel("Frequency")
plt.title("Survival Status by Age Group on the Titanic");

In [None]:
# Remove the helper Age_Group column again.
# Reassigning (instead of inplace=True) and ignoring an already-missing column
# keeps the cell re-runnable: a second run no longer raises KeyError.
df = df.drop(columns=["Age_Group"], errors="ignore")

## Dropping column Cabin

In [None]:
# Drop the Cabin column as there are too many missing values.
# `columns=` already names the axis, so the redundant `axis=1` is removed;
# errors='ignore' lets the cell be re-run after Cabin is already gone.
df = df.drop(columns=['Cabin'], errors='ignore')

In [None]:
# Display the column labels currently present in df
df.columns

## Embarked Visualization

In [None]:
# Show the rows whose Embarked value is missing
embarked_missing = df['Embarked'].isnull()
df.loc[embarked_missing]

In [None]:
# Frequency of each Embarked code (missing entries are not counted by default)
df['Embarked'].value_counts()

In [None]:
# Fill or drop?
# Fill missing values of the Embarked column by the mode (its most frequent value).
# The result is assigned back rather than using fillna(inplace=True) on the
# df['Embarked'] selection, which is a chained assignment that warns in
# pandas 2.x and does not update df when Copy-on-Write is enabled.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

In [None]:
# Re-print the per-column summary to check dtypes and non-null counts at this point
df.info()

In [None]:
# Re-count missing values per column
df.isna().sum()

In [None]:
# Port codes used in the Embarked column:
# C = Cherbourg, Q = Queenstown, S = Southampton
loc = {"C": "Cherbourg", "Q": "Queenstown", "S": "Southampton"}

# Translate the codes to port names, then chart how many passengers boarded at each.
port_counts = df["Embarked"].replace(loc).value_counts()
port_ax = port_counts.plot(kind="bar")
port_ax.set_title("Passengers Embarked Location")
port_ax.set_xlabel("Embarked Location")
port_ax.set_ylabel("Frequency");

In [None]:
# Encode Sex numerically: male -> 0, female -> 1.
# Guarded so that re-running the cell does not push the already-encoded 0/1
# values through the mapping again, which would turn the whole column into NaN.
if not pd.api.types.is_numeric_dtype(df['Sex']):
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

In [None]:
# (Disabled) Convert Age to integer datatype
# df['Age'] = df['Age'].astype('int')

## Siblings/Spouses Visualization

In [None]:
# Bar chart of how many passengers share each SibSp value, ordered by that value
sibsp_counts = df['SibSp'].value_counts().sort_index()
sibsp_ax = sibsp_counts.plot(kind="bar")
sibsp_ax.set_xlabel("Number of Siblings/Spouses")
sibsp_ax.set_ylabel("Frequency")
sibsp_ax.set_title("Distribution of Siblings/Spouses Aboard the Titanic");

## Parents/Children Visualization

In [None]:
# Bar chart of how many passengers share each Parch value, ordered by that value
parch_counts = df['Parch'].value_counts().sort_index()
parch_ax = parch_counts.plot(kind="bar")
parch_ax.set_xlabel("Number of Parents/Children")
parch_ax.set_ylabel("Frequency")
parch_ax.set_title("Distribution of Parents/Children Aboard the Titanic");

## Dropping Column Ticket

In [None]:
# Drop the Ticket column as it holds no significance.
# Reassigning with errors='ignore' (instead of inplace=True) lets the cell be
# re-run without a KeyError once Ticket has already been removed.
df = df.drop(columns=['Ticket'], errors='ignore')

## Gender Distribution among Survived/Died

In [None]:
# Count of passengers per Sex code, split by the Survived column
sex_ax = sns.countplot(data=df, x="Sex", hue="Survived")
# Relabel the two tick positions with readable names (0 -> male, 1 -> female)
sex_ax.set_xticks([0, 1])
sex_ax.set_xticklabels(['male', 'female'])
sex_ax.legend(title='Survival Status', labels=['Did not survive', 'Survived'])
sex_ax.set_title("Distribution of Sex among Survived/Died")
sex_ax.set_xlabel("Sex")
sex_ax.set_ylabel("Frequency");

In [None]:
# Tabulate the counts of each Survived value within each Sex group
df.groupby('Sex')['Survived'].value_counts()

## Passenger Class Distribution among Survived/Dead

In [None]:
# Count of passengers per passenger class, split by the Survived column
pclass_ax = sns.countplot(data=df, x="Pclass", hue="Survived")
# Replace the three tick labels with descriptive class names
pclass_ax.set_xticks([0, 1, 2])
pclass_ax.set_xticklabels(['High Class', 'Mid Class', 'Low Class'])
pclass_ax.legend(title='Survival Status', labels=['Did not survive', 'Survived'])
pclass_ax.set_title("Distribution of Passenger Class among those who Survived/Died");

In [None]:
# Number of passengers in each Pclass value
df['Pclass'].value_counts()

In [None]:
# Tabulate the counts of each Survived value within each Pclass group
df.groupby('Pclass')['Survived'].value_counts()

## Ticket Prices Visualization

In [None]:
# Histogram of the Fare column
fare_ax = df["Fare"].hist(grid=False)
fare_ax.set_title("Ticket Prices of Titanic")
fare_ax.set_xlabel("Price")
fare_ax.set_ylabel("Frequency");

## Correlation between numeric columns

In [None]:
target = "Survived"
# Correlation among the numeric feature columns (the target is dropped first).
# include="number" is used instead of exclude="object" so that any other
# non-numeric dtype (category, string, datetime) cannot reach corr().
corr = df.drop(columns=target).select_dtypes(include="number").corr()
# Trailing semicolon suppresses the Axes repr in the cell output.
sns.heatmap(corr);

## Class Balance

In [None]:
# Bar chart of the raw number of rows for each Survived value
survival_counts = df["Survived"].value_counts()
balance_ax = survival_counts.plot(kind="bar")
balance_ax.set_xlabel("Survival")
balance_ax.set_ylabel("Count")
balance_ax.set_title("Class Balance")
plt.show();

In [None]:
# Class balance as a share of all passengers.
# value_counts(normalize=True) returns fractions in [0, 1]; they are scaled by
# 100 so the bar heights agree with the "Percentage" axis label.
survival_pct = df["Survived"].value_counts(normalize=True).mul(100)
survival_pct.plot(kind="bar")
plt.xlabel("Survival")
plt.ylabel("Percentage")
plt.title("Class Balance")
plt.show()

In [None]:
# Correlation matrix of every numeric column except the PassengerId identifier
# (Survived is kept, so its correlation with each feature is included).
# include="number" is used instead of exclude="object" so that any other
# non-numeric dtype (category, string, datetime) cannot reach corr().
total_corr = df.drop(columns=['PassengerId']).select_dtypes(include='number').corr()
# Trailing semicolon suppresses the Axes repr in the cell output.
sns.heatmap(total_corr);

In [None]:
# Display the correlation matrix as a table
total_corr

In [None]:
from statsmodels.formula.api import logit

In [None]:
def cal_accuracy(df, column, target='Survived'):
    """Fit a one-predictor logistic regression and return its in-sample accuracy.

    Parameters
    ----------
    df : DataFrame
        Data containing both ``target`` and ``column``.
    column : str
        Name of the predictor; it is inserted into the model formula as written.
    target : str, default 'Survived'
        Name of the response column on the left-hand side of the formula.

    Returns
    -------
    float
        Share of observations on the diagonal of the fitted model's
        prediction table, i.e. (TP + TN) / (TP + TN + FP + FN).
    """
    formula = f'{target} ~ {column}'
    # disp=False silences the optimizer's convergence printout, which would
    # otherwise be repeated for every column a caller loops over.
    mdl = logit(formula, data=df).fit(disp=False)
    # pred_table()[i, j] counts rows observed as class i and predicted as
    # class j, so the diagonal holds the correct predictions.
    conf_matrix = mdl.pred_table()
    return conf_matrix.trace() / conf_matrix.sum()

In [None]:
# Candidate predictors; each one is scored on its own by cal_accuracy
acc_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
# (column name, accuracy) pairs, in the same order as acc_columns
accuracies = [(name, cal_accuracy(df, name)) for name in acc_columns]

In [None]:
# Rank the (column, accuracy) pairs from highest to lowest accuracy
sorted(accuracies, key=lambda pair: pair[1], reverse=True)