# 🚢 Titanic Survival Prediction Project
### Goal: Predict whether a passenger survived the Titanic disaster using machine learning
This notebook walks through a complete data science workflow on the Titanic dataset from Kaggle.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Render matplotlib figures inline, directly beneath the cell that produces them.
%matplotlib inline
# Apply seaborn's "whitegrid" style to the plots drawn in the rest of the notebook.
sns.set(style="whitegrid")


In [None]:
# Read the training and test CSV files, in that order, into DataFrames.
train_df, test_df = map(pd.read_csv, ("data/train.csv", "data/test.csv"))


In [None]:
# Visualizations
# Count of passengers per value of the Survived column; the title is set on
# the Axes that seaborn returns.
sns.countplot(data=train_df, x='Survived').set_title("Survival Counts (0 = No, 1 = Yes)");


In [None]:
# Mean of Survived (i.e. the survival rate) for each value of Sex.
sns.barplot(data=train_df, x='Sex', y='Survived').set_title("Survival Rate by Sex");


In [None]:
# Mean of Survived (i.e. the survival rate) for each passenger class.
sns.barplot(data=train_df, x='Pclass', y='Survived').set_title("Survival Rate by Passenger Class");


In [None]:
# 30-bin histogram of passenger ages on a 10x5-inch figure.
plt.figure(figsize=(10, 5))
(
    train_df["Age"]
    .hist(bins=30, edgecolor='black')
    .set_title("Age Distribution of Passengers")
);


In [None]:
# Clean the training frame and derive the model features.
#
# Missing values: Age -> column median, Embarked -> most frequent port.
# The filled Series is assigned back to the column rather than calling
# fillna(inplace=True) on a column selection; that chained in-place form
# warns on recent pandas and leaves train_df untouched under Copy-on-Write,
# which would let NaNs reach the models.
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].median())
train_df['Embarked'] = train_df['Embarked'].fillna(train_df['Embarked'].mode()[0])
train_df = train_df.drop(columns=['Cabin', 'Ticket'])

# Encoding: map the string categories to integer codes.
train_df['Sex'] = train_df['Sex'].map({'male': 0, 'female': 1})
train_df['Embarked'] = train_df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

# New features
train_df['FamilySize'] = train_df['SibSp'] + train_df['Parch'] + 1
train_df['IsAlone'] = (train_df['FamilySize'] == 1).astype(int)
# Age bands [0, 12], (12, 18], (18, 50], (50, 100] labelled 0-3.
# include_lowest=True closes the first band on the left so an age of exactly 0
# gets band 0 instead of NaN.
train_df['AgeGroup'] = pd.cut(
    train_df['Age'],
    bins=[0, 12, 18, 50, 100],
    labels=[0, 1, 2, 3],
    include_lowest=True,
).astype(float)

# Drop unused columns: identifiers plus the raw columns already folded into
# FamilySize / IsAlone / AgeGroup.
train_df = train_df.drop(columns=['PassengerId', 'Name', 'SibSp', 'Parch', 'Age'])


In [None]:
# Apply the same cleaning and feature engineering to the test frame.
#
# Age is imputed with the TRAINING-set median. By this point train_df has
# already had its 'Age' column dropped, so train_df['Age'] would raise
# KeyError; the median is recomputed from the raw training file instead.
train_age_median = pd.read_csv("data/train.csv", usecols=['Age'])['Age'].median()
test_df['Age'] = test_df['Age'].fillna(train_age_median)
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].median())
test_df['Embarked'] = test_df['Embarked'].fillna(test_df['Embarked'].mode()[0])
test_df = test_df.drop(columns=['Cabin', 'Ticket'], errors='ignore')

# Encoding: same integer codes as used for the training frame.
test_df['Sex'] = test_df['Sex'].map({'male': 0, 'female': 1})
test_df['Embarked'] = test_df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

# New features
test_df['FamilySize'] = test_df['SibSp'] + test_df['Parch'] + 1
test_df['IsAlone'] = (test_df['FamilySize'] == 1).astype(int)
# Age bands [0, 12], (12, 18], (18, 50], (50, 100] labelled 0-3.
# include_lowest=True closes the first band on the left so an age of exactly 0
# gets band 0 instead of NaN.
test_df['AgeGroup'] = pd.cut(
    test_df['Age'],
    bins=[0, 12, 18, 50, 100],
    labels=[0, 1, 2, 3],
    include_lowest=True,
).astype(float)

# Drop unused columns; errors='ignore' skips any that are already absent.
test_df = test_df.drop(
    columns=['PassengerId', 'Name', 'SibSp', 'Parch', 'Age'],
    errors='ignore',
)


In [None]:
# Separate the target column from the predictor columns.
y = train_df["Survived"]
X = train_df.drop(columns="Survived")

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Logistic Regression
# fit() returns the estimator itself, so construction and training are chained.
log_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred_log = log_model.predict(X_val)

# Report validation accuracy followed by the per-class metrics.
print("Logistic Regression Accuracy:", accuracy_score(y_true=y_val, y_pred=y_pred_log))
print(classification_report(y_true=y_val, y_pred=y_pred_log))


In [None]:
# Random Forest
# 100 trees with a fixed seed; fit() returns the estimator itself, so
# construction and training are chained.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_val)

# Report validation accuracy followed by the per-class metrics.
print("Random Forest Accuracy:", accuracy_score(y_true=y_val, y_pred=y_pred_rf))
print(classification_report(y_true=y_val, y_pred=y_pred_rf))
