> # **بسم الله الرحمن الرحيم** 

# Titanic - Survival Prediction 


# Step 1: Import Libraries


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

import re

# Step 2: Load the Data


In [2]:
# Read the train and test CSV files from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Report (rows, columns) of both frames as a quick sanity check.
print(f"Train shape: {train.shape}, Test shape: {test.shape}")


Train shape: (891, 12), Test shape: (418, 11)


# Step 3: Feature Engineering


In [3]:
def extract_title(name):
    """Return the honorific embedded in a passenger name.

    The title is the text between the first comma and the next period,
    e.g. "Braund, Mr. Owen Harris" -> "Mr". A leading article is skipped,
    so "Rothes, the Countess. of (...)" yields "Countess" rather than
    "the Countess".

    Args:
        name: Passenger name. Non-string values (for example NaN coming
            from a missing cell) are accepted.

    Returns:
        The title with surrounding whitespace removed, or "" when the name
        is not a string or contains no ", <title>." pattern.
    """
    # Missing names arrive as float NaN; re.search would raise TypeError.
    if not isinstance(name, str):
        return ""
    match = re.search(r',\s*(?:the\s+)?([^.]+)\.', name)
    return match.group(1).strip() if match else ""

# Derive a Title column in both frames and collapse it to a few categories.
#
# Rare honorifics are merged into 'Rare'. 'the Countess' is listed next to
# 'Countess' to cover the case where the extracted title keeps its leading
# article ("Rothes, the Countess. of ..."); without it that passenger would
# remain in a category of its own instead of 'Rare'.
rare_titles = ['Lady', 'Countess', 'the Countess', 'Capt', 'Col', 'Don', 'Dr',
               'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']

# Alternative spellings that are folded into the common titles.
title_aliases = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}

# Apply the identical mapping to both frames so their categories agree.
for dataset in [train, test]:
    dataset['Title'] = (
        dataset['Name']
        .apply(extract_title)
        .replace(rare_titles, 'Rare')
        .replace(title_aliases)
    )

# Step 4: Fill Missing Values

In [4]:
# Age: fill with the median age of the passenger's Title group, computed
# separately inside each dataset. A Title group whose ages are all missing
# has no median, so anything still missing afterwards falls back to the
# dataset-wide median age instead of staying NaN.
for dataset in [train, test]:
    title_median_age = dataset.groupby('Title')['Age'].transform('median')
    dataset['Age'] = dataset['Age'].fillna(title_median_age)
    dataset['Age'] = dataset['Age'].fillna(dataset['Age'].median())

# Embarked: fill with the most frequent value of the same dataset.
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode()[0])
test['Embarked'] = test['Embarked'].fillna(test['Embarked'].mode()[0])

# Fare: only the test frame is filled here, using its own median fare.
test['Fare'] = test['Fare'].fillna(test['Fare'].median())

# Step 5: Create New Features

In [5]:
# Add FamilySize, IsAlone and Deck to both datasets in a single pass.
for frame in (train, test):
    # FamilySize = SibSp + Parch, plus one for the passenger themself.
    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1

    # IsAlone starts at 1 for every row and is cleared wherever the family
    # has more than one member.
    frame['IsAlone'] = 1
    has_company = frame['FamilySize'] > 1
    frame.loc[has_company, 'IsAlone'] = 0

    # Deck is the first character of Cabin; rows without a cabin get 'Unknown'.
    frame['Deck'] = frame['Cabin'].str[0].fillna('Unknown')



# Step 6: Encoding Categorical Features

In [6]:
# Label-encode 'Sex', 'Embarked', 'Title' and 'Deck'.
#
# Each column gets its own encoder, fitted on the values of train and test
# together, so a category that appears only in test does not make
# transform() raise ValueError. When test contains no extra categories the
# resulting codes are the same as when fitting on train alone.
label_cols = ['Sex', 'Embarked', 'Title', 'Deck']
label_encoders = {}
for col in label_cols:
    le = LabelEncoder()
    le.fit(pd.concat([train[col], test[col]], ignore_index=True))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])
    # Keep the fitted encoder so codes can be mapped back to labels later.
    label_encoders[col] = le

# Step 7: Drop Unnecessary Columns

In [7]:
# Remove the raw text columns from both frames, modifying them in place.
drop_cols = ['Name', 'Ticket', 'Cabin']
for frame in (train, test):
    frame.drop(columns=drop_cols, inplace=True)

# Step 8: Train a Baseline Model and Create an Initial Submission


In [8]:
# Baseline: a 100-tree random forest trained on all remaining columns.
# PassengerId is left out of the features; Survived is the target.
y = train['Survived']
X = train.drop(columns=['Survived', 'PassengerId'])

# Fit the baseline model on the full training data.
best_rf = RandomForestClassifier(n_estimators=100, random_state=42)
best_rf.fit(X, y)

# The test features drop PassengerId the same way the training features do.
X_test_final = test.drop(columns='PassengerId')
predictions = best_rf.predict(X_test_final)

# Write one prediction per test PassengerId to the submission file.
submission = pd.DataFrame(
    {'PassengerId': test['PassengerId'], 'Survived': predictions}
)
submission.to_csv('submission.csv', index=False)


# Step 9: Hyperparameter Tuning with GridSearchCV

In [None]:
# Candidate settings; every combination is evaluated by the grid search.
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [4, 6, 8, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
}

# Base estimator with a fixed seed so repeated runs are comparable.
rf = RandomForestClassifier(random_state=42)

# 5-fold cross-validated search scored on accuracy, using all CPU cores.
grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X, y)

# Report the winning configuration and its mean cross-validated score.
print(f"Best params: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_:.4f}")

# Step 10: Train Final Model with Best Params

In [None]:
# Use the estimator selected by the grid search as the final model.
best_rf = grid_search.best_estimator_

# Optional check: 5-fold cross-validated score of the selected configuration
# on the same data, printed as mean ± standard deviation.
cv_scores = cross_val_score(estimator=best_rf, X=X, y=y, cv=5)
print(f"CV Accuracy: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")

# Fit on the full training data.
# NOTE(review): GridSearchCV refits the best estimator on the whole data by
# default (refit=True), so this call presumably repeats that work — confirm
# before removing it.
best_rf.fit(X, y)

# Step 11: Predict & Create Submission

In [None]:
# Predict with the final model and overwrite the submission file.
predictions = best_rf.predict(X_test_final)

submission = pd.DataFrame(
    data={'PassengerId': test['PassengerId'], 'Survived': predictions},
)
submission.to_csv('submission.csv', index=False)

# Last expression of the cell: shows the first rows of the submission.
submission.head()

