In [None]:
import pandas as pd

In [None]:
import matplotlib.pyplot as plt

In [None]:
import seaborn as sns

In [None]:
# Load the passenger table from the CSV file into the `data` frame.
data = pd.read_csv(filepath_or_buffer="titanic_dataset.csv")

In [None]:
# Peek at the first five rows to check that the file parsed as expected.
data.head(n=5)

In [None]:
# Print column names, non-null counts and dtypes for the loaded frame.
data.info()

In [None]:
# The Age, Cabin and Embarked columns contain missing values.

In [None]:
# target - Survived

In [None]:
# Pclass, Name, Sex, Ticket, Cabin, Embarked - categorical features
# PassengerId, Age, Fare - numerical features
# Name, Ticket and Cabin are nominal labels that are not used in this analysis, so we can drop these columns.
# Among the numerical columns, PassengerId is also not needed.

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
data.describe()

In [None]:
# Drop the columns that are not used in the analysis.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell re-runnable: executing it a second time no longer raises KeyError for
# columns that have already been removed.
data = data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], errors='ignore')

In [None]:
# Show the first five rows again to confirm which columns remain.
data.head(n=5)

In [None]:
# Number of passengers for each value of the target column.
data['Survived'].value_counts()

In [None]:
# Bar chart of how many passengers fall into each survival class.
# countplot returns the Axes it drew on, so labels are set on that object directly.
survival_ax = sns.countplot(x='Survived', data=data, palette='Blues')
survival_ax.set_title('Passenger Survival Count')
survival_ax.set_xlabel('Survived (0 = No, 1 = Yes)')
survival_ax.set_ylabel('Number of Passengers')
survival_ax.grid(axis='y')
plt.show()

In [None]:
# the target is not perfectly balanced: non-survivors outnumber survivors (see the counts above)

In [None]:
# Passenger count per Pclass value.
data['Pclass'].value_counts()

In [None]:
# Passenger count per Sex value.
data['Sex'].value_counts()

In [None]:
# Build a plot-only copy with readable survival labels for the legend.
# The mapping is applied to a copy of `data` (the only frame defined so far) so
# that data['Survived'] stays 0/1 for the later numeric steps (e.g. the
# correlation heatmap) and so that re-running this cell does not turn the
# already-mapped labels into NaN.
survival_labels = {0: 'No', 1: 'Yes'}
plot_data = data.copy()
plot_data['Survived'] = plot_data['Survived'].map(survival_labels)

# Survival counts split by gender.
sns.countplot(data=plot_data, x='Sex', hue='Survived', palette='Set2')
plt.title('Survival Count by Gender')
plt.xlabel('Gender')
plt.ylabel('Number of Passengers')
plt.legend(title='Survived')
plt.grid(axis='y')
plt.tight_layout()
plt.show()


In [None]:
# Passenger count per Embarked value (value_counts leaves out NaN by default).
data['Embarked'].value_counts()

##### Handling Missing Values

In [None]:
# Work on an independent deep copy so later changes to data1 leave `data` untouched.
data1 = data.copy(deep=True)

In [None]:
# First five rows of the working copy.
data1.head(n=5)

In [None]:
# Impute missing ages with the median age.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: that chained in-place form is
# deprecated and does not update data1 when pandas Copy-on-Write is active.
data1['Age'] = data1['Age'].fillna(data1['Age'].median())

In [None]:
# Impute missing embarkation values with the most frequent value.
# Series.mode() returns a Series (there may be ties), so take its first entry:
# passing the Series itself to fillna aligns on the index and would leave the
# missing rows unfilled. The result is assigned back rather than using a
# chained inplace=True call, which does not update data1 under Copy-on-Write.
data1['Embarked'] = data1['Embarked'].fillna(data1['Embarked'].mode().iloc[0])

In [None]:
# Re-check the non-null counts of data1 after the fillna cells above.
data1.info()

In [None]:
# Histogram of passenger ages.
# `plt` is already imported in the notebook's opening cells, so it is not re-imported here.
fig, ax = plt.subplots()
ax.hist(data1['Age'].dropna(), bins=20, color='skyblue', edgecolor='black')
ax.set_title('Distribution of Age')
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()


In [None]:
# Kernel density estimate of passenger ages.
# `sns` and `plt` are already imported in the notebook's opening cells, so they are not re-imported here.
fig, ax = plt.subplots()
sns.kdeplot(data=data1, x='Age', fill=True, color='green', ax=ax)
ax.set_title('Density Plot of Age')
ax.set_xlabel('Age')
ax.set_ylabel('Density')
ax.grid(True)
plt.show()


In [None]:
# The Age distribution is right-skewed.

In [None]:
# Histogram of ticket fares.
# `plt` is already imported in the notebook's opening cells, so it is not re-imported here.
fig, ax = plt.subplots()
ax.hist(data1['Fare'].dropna(), bins=20, color='red', edgecolor='black')
ax.set_title('Distribution of Fare')
ax.set_xlabel('Fare')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()


In [None]:
# Kernel density estimate of ticket fares.
# `sns` and `plt` are already imported in the notebook's opening cells, so they are not re-imported here.
fig, ax = plt.subplots()
sns.kdeplot(data=data1, x='Fare', fill=True, color='green', ax=ax)
ax.set_title('Density Plot of Fare')
ax.set_xlabel('Fare')
ax.set_ylabel('Density')
ax.grid(True)
plt.show()


In [None]:
# This plot shows that Fare has many statistical outliers.

In [None]:
# Box plot comparing the age distribution of the two survival classes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(x='Survived', y='Age', data=data1, palette='pastel', ax=ax)
ax.set_title('Age Distribution by Survival')
ax.set_xlabel('Survived (0 = No, 1 = Yes)')
ax.set_ylabel('Age')
ax.grid(True)
plt.show()


In [None]:
# Box plot comparing the fare distribution of the two survival classes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(x='Survived', y='Fare', data=data1, palette='Set3', ax=ax)
ax.set_title('Fare Distribution by Survival')
ax.set_xlabel('Survived (0 = No, 1 = Yes)')
ax.set_ylabel('Fare')
ax.grid(True)
plt.show()


In [None]:
# Scatter of age against fare, with points coloured by the Survived column.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x='Age', y='Fare', hue='Survived', data=data1, ax=ax)
ax.set_title("Age vs Fare by Survival")
plt.show()

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap.
corr = data1.corr(numeric_only=True)
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
# positive correlation - Survived and Fare
# negative correlation - Survived and Pclass

In [None]:
'''
Summary of Findings:
- The survival rate is low: most passengers did not survive.
- Females had a higher chance of survival than males.
- Passengers in 1st class had a higher survival rate.
- Younger passengers and those who paid higher fares had better survival chances.
- Strong negative correlation between Pclass and Survived.
- Fare has a slight positive correlation with Survived.
'''