In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Location of the CSV to analyse. Set the TITANIC_DATA_PATH environment variable
# to point at another copy of the file; when it is unset, the original local
# path is used so existing runs behave exactly as before.
path = os.environ.get("TITANIC_DATA_PATH", r'C:\Users\galig\Downloads\Dataset.csv')

# NOTE(review): on_bad_lines='skip' drops malformed rows without raising or
# warning, so the loaded frame may have fewer rows than the file has records.
titanic_data = pd.read_csv(path, on_bad_lines='skip')

In [None]:
# Display basic info and summary statistics.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
titanic_data.info()
print(titanic_data.describe())

In [None]:
# Heatmap of the boolean "is missing" mask: one row per record, one column per
# field, so gaps in the data show up as contrasting cells.
fig, ax = plt.subplots(figsize=(10, 6))
missing_mask = titanic_data.isnull()
sns.heatmap(missing_mask, cbar=False, cmap="viridis", ax=ax)
ax.set_title("Missing Values Heatmap")
plt.show()

In [None]:
# Count of records per "Survived" value, with one bar per "Sex" category.
ax = sns.countplot(data=titanic_data, x="Survived", hue="Sex")
ax.set_title("Survival by Gender")
plt.show()

In [None]:
# Survival by age group.
# pd.cut with its default right-closed intervals maps Age into
# (0, 12], (12, 18], (18, 35], (35, 60], (60, 80]; ages outside that range and
# missing ages end up as NaN in "AgeGroup".
age_bin_edges = [0, 12, 18, 35, 60, 80]
age_bin_labels = ["Child", "Teen", "Adult", "Middle Aged", "Senior"]
titanic_data["AgeGroup"] = pd.cut(
    titanic_data["Age"],
    bins=age_bin_edges,
    labels=age_bin_labels,
)

ax = sns.countplot(data=titanic_data, x="AgeGroup", hue="Survived")
ax.set_title("Survival by Age Group")
plt.show()

In [None]:
# Count of records per "Embarked" value, with one bar per "Survived" outcome.
ax = sns.countplot(data=titanic_data, x="Embarked", hue="Survived")
ax.set_title("Survival by Embarkation Point")
plt.show()

In [None]:

# Correlation heatmap over the float64/int64 columns.
# Missing values are replaced by their column mean before correlating.
numerical_data = titanic_data.select_dtypes(include=['float64', 'int64']).pipe(
    lambda frame: frame.fillna(frame.mean())
)
correlation_matrix = numerical_data.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
# Feature engineering: family size.
# FamilySize = SibSp + Parch + 1 (presumably the +1 accounts for the passenger
# themselves -- confirm against the dataset's column definitions).
titanic_data["FamilySize"] = titanic_data["SibSp"].add(titanic_data["Parch"]).add(1)

# With a numeric y, barplot draws the per-group mean of "Survived".
ax = sns.barplot(data=titanic_data, x="FamilySize", y="Survived")
ax.set_title("Survival by Family Size")
plt.show()

In [None]:
# Feature Engineering: Titles from Names
# Capture the run of letters that follows a space and is immediately followed by
# a literal period. The pattern is a raw string so the "\." reaches the regex
# engine as written instead of relying on Python tolerating an invalid escape
# sequence (which triggers a warning on recent interpreters).
titanic_data["Title"] = titanic_data["Name"].str.extract(r" ([A-Za-z]+)\.", expand=False)

# value_counts() sorts descending, so bars run from most to least common title.
title_order = titanic_data["Title"].value_counts().index
sns.countplot(data=titanic_data, x="Title", hue="Survived", order=title_order)
plt.title("Survival by Title")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Box plot of "Fare", one box per "Survived" value.
ax = sns.boxplot(data=titanic_data, x="Survived", y="Fare")
ax.set_title("Fare Distribution by Survival")
plt.show()
