# Titanic Survival Prediction - Model Development

This notebook covers Part A of the project:
1. Loading the Titanic dataset
2. Data Preprocessing
3. Model Training (Random Forest)
4. Evaluation
5. Saving the Model
6. Reloading the Saved Model and Testing a Prediction

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib
import os

## 1. Load Dataset

In [None]:
# Titanic passenger CSV hosted in the datasciencedojo/datasets GitHub repository.
# The file is fetched over HTTP, so this cell needs network access.
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows (rendered as the cell output)
df.head(n=5)

## 2. Data Preprocessing

In [None]:
# Columns used as model inputs and the label column to predict
features = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked']
target = 'Survived'

# Work on a copy so the raw frame `df` is never modified and this cell can be re-run safely
data = df[features + [target]].copy()

# Handling missing values.
# Assign the filled column back instead of `data[col].fillna(..., inplace=True)`:
# the in-place form acts on a temporary Series, raises a FutureWarning on
# pandas 2.x and leaves `data` unchanged once Copy-on-Write is active.
data['Age'] = data['Age'].fillna(data['Age'].median())            # numeric -> median
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])  # categorical -> most frequent value
data['Fare'] = data['Fare'].fillna(data['Fare'].median())         # numeric -> median

# Encoding categorical variables as integer codes.
# The fitted encoders are kept so the same mapping can be reused at prediction time.
le_sex = LabelEncoder()
data['Sex'] = le_sex.fit_transform(data['Sex'])

le_embarked = LabelEncoder()
data['Embarked'] = le_embarked.fit_transform(data['Embarked'])

# Verify no missing values (every count printed below should be 0)
print(data.isnull().sum())

## 3. Model Training

In [None]:
# Separate the feature matrix from the label column
X, y = data[features], data[target]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Random forest with 100 trees, seeded for reproducibility, fitted on the training partition
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

## 4. Evaluation

In [None]:
# Predict on the held-out rows and compare against the true labels
y_pred = model.predict(X_test)
report_text = classification_report(y_test, y_pred)

print("Classification Report:\n")
print(report_text)

## 5. Save Model

In [None]:
# Persist the fitted model with joblib.
# The filename has no directory component, so the file is written to the
# kernel's current working directory.
# NOTE(review): the notebook is expected to live in /model/; the file only ends
# up there if the kernel is started from that directory - confirm.
model_filename = 'titanic_survival_model.pkl'
joblib.dump(value=model, filename=model_filename)

print(f"Model saved as {model_filename}")

## 6. Reload and Test

In [None]:
# Load the model back from the file written in the save step.
# joblib files are pickle-based: only load files that come from a trusted source.
loaded_model = joblib.load(model_filename)

# Test prediction with a dummy passenger:
# Pclass=3, Sex=male, Age=22, Fare=7.25, Embarked=S.
# Categorical values are converted with the encoders fitted during preprocessing,
# so the integer codes always match the training data (previously hard-coded as
# male=1 and S=2).
dummy_input = pd.DataFrame(
    [[
        3,
        le_sex.transform(['male'])[0],
        22,
        7.25,
        le_embarked.transform(['S'])[0],
    ]],
    # Named columns match the frame the model was fitted on; a bare list makes
    # scikit-learn warn about missing feature names and relies on column order.
    columns=features,
)
prediction = loaded_model.predict(dummy_input)
print(f"Prediction for dummy input: {'Survived' if prediction[0] == 1 else 'Did not survive'}")