In [1]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer


load data

In [None]:

# Read data/titanic.csv into a DataFrame; every later cell works from `data`.
data = pd.read_csv("data/titanic.csv")

print("Shape of Dataset:", data.shape)

# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called directly; wrapping it in print() emitted a stray "None".
print("\nDataset Info:")
data.info()

# include='all' summarises every column of the frame, not just a subset.
print("\nSummary Statistics:")
print(data.describe(include='all'))


preprocess

In [None]:

# Fill gaps in 'Age' with the column mean via scikit-learn's SimpleImputer.
# NOTE(review): the imputer is fitted on every row of `data`, and 'Age' is later
# used as a feature in the train/test split, so the fitted mean also reflects
# rows that end up in the test set -- confirm this is acceptable here.
age_imputer = SimpleImputer(strategy='mean')
age_imputed = age_imputer.fit_transform(data[['Age']])
data['Age'] = age_imputed

# Missing cabins get the literal label 'Unknown'.
data['Cabin'] = data['Cabin'].fillna('Unknown')

# Missing 'Embarked' entries take the most frequent value
# (mode() may return several values; the first one is used).
most_common_embarked = data['Embarked'].mode()[0]
data['Embarked'] = data['Embarked'].fillna(most_common_embarked)

# Per-column count of values that are still missing.
print("\nMissing Values After Imputation:")
print(data.isna().sum())


In [None]:

# Bar chart of the number of rows in `data` for each 'Pclass' value.
fig, ax = plt.subplots(figsize=(6, 4))

# seaborn 0.13 deprecates passing `palette` without `hue` (removal planned for
# 0.14). Mapping hue to the same variable as x, with the legend switched off,
# keeps the per-bar Set2 colours without the FutureWarning.
# The `legend` argument requires seaborn >= 0.13.
sns.countplot(
    x='Pclass',
    hue='Pclass',
    data=data,
    palette='Set2',
    legend=False,
    ax=ax,
)
ax.set_title('Passenger Class Distribution')
plt.show()


females survived

In [None]:

# Rows where Sex is 'female' and Survived equals 1; only the names are printed.
female_survived = data.loc[data['Sex'].eq('female') & data['Survived'].eq(1)]
print("Names of Female Passengers Who Survived:")
print(female_survived.loc[:, 'Name'])


3rd class and under 18

In [None]:

# Rows with Pclass == 3 whose Age is strictly below 18.
third_class_under_18 = data.loc[data['Pclass'].eq(3) & data['Age'].lt(18)]
print("Passengers in 3rd Class and Under 18:")
print(third_class_under_18.loc[:, ['Name', 'Age', 'Pclass']])


class 1 older than 40

In [None]:

# Rows with Pclass == 1 whose Age is strictly above 40.
# The next cell filters this selection further, so the name is kept.
class1_above40 = data.loc[data['Pclass'].eq(1) & data['Age'].gt(40)]
print("Passengers in Class 1 and Older Than 40:")
print(class1_above40.loc[:, ['Name', 'Age', 'Pclass']])


survived, class 1 and older than 40

In [None]:

# Narrow the existing `class1_above40` selection to rows with Survived == 1.
survived_class1_above40 = class1_above40.loc[class1_above40['Survived'].eq(1)]
print("Survived Passengers (Class 1 & Age > 40):")
print(survived_class1_above40.loc[:, ['Name', 'Age', 'Survived']])


male passengers with fare > 100

In [None]:

# Rows where Sex is 'male' and Fare is strictly greater than 100.
rich_males = data.loc[data['Sex'].eq('male') & data['Fare'].gt(100)]
print("Male Passengers Who Paid Fare > 100:")
print(rich_males.loc[:, ['Name', 'Fare']])


port C class 2

In [None]:

# Rows where Embarked is 'C' and Pclass equals 2.
embarked_C_class2 = data.loc[data['Embarked'].eq('C') & data['Pclass'].eq(2)]
print("Passengers from Port 'C' and in Class 2:")
print(embarked_C_class2.loc[:, ['Name', 'Embarked', 'Pclass']])


> 2 siblings

In [None]:

# Rows whose SibSp value is strictly greater than 2.
sibsp_more2 = data.loc[data['SibSp'].gt(2)]
print("Passengers with More Than 2 Siblings/Spouses:")
print(sibsp_more2.loc[:, ['Name', 'SibSp']])


died and no family

In [None]:

# Rows with Survived == 0 where both SibSp and Parch are 0.
no_family_died = data.loc[
    data['Survived'].eq(0)
    & data['SibSp'].eq(0)
    & data['Parch'].eq(0)
]
print("Passengers Who Died and Had No Family:")
print(no_family_died.loc[:, ['Name', 'Survived', 'SibSp', 'Parch']])


5 oldest survived

In [None]:

# Among rows with Survived == 1, order by Age from highest to lowest
# and keep the first five.
oldest_survivors = (
    data.loc[data['Survived'].eq(1)]
    .sort_values(by='Age', ascending=False)
    .head(5)
)
print("Top 5 Oldest Passengers Who Survived:")
print(oldest_survivors.loc[:, ['Name', 'Age', 'Survived']])


0 fare

In [None]:

# Rows whose Fare is exactly 0.
free_passengers = data.loc[data['Fare'].eq(0)]
print("Passengers Who Paid Zero Fare:")
print(free_passengers.loc[:, ['Name', 'Fare']])


splitting dataset

In [None]:

# Feature matrix (five columns) and the 'Survived' target.
feature_columns = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
X = data[feature_columns]
y = data['Survived']

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)
