<a href="https://colab.research.google.com/github/Simarjit1303/Data-Science/blob/main/exercises/machine-learning/supervised-learning/titanic_competition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Titanic Competition
You should build an end-to-end machine learning pipeline to predict survivors of the Titanic disaster and participate in the corresponding Kaggle competition. In particular, you should do the following:
- Read the Titanic competition page on [Kaggle](https://www.kaggle.com/competitions/titanic/overview).
- Load the `titanic` dataset using [Pandas](https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html). You can find this dataset in the datasets folder.
- Split the dataset into training and test sets using [Scikit-Learn](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html).
- Build an end-to-end machine learning pipeline, including all necessary steps, to have a running solution with some performance.
- Collaborate with your groupmates to finalize your pipeline by
    - reading the discussion forum to learn from other community members;
    - discussing the bottlenecks of your current solution;
    - running experiments on your pipeline;
    - improving the performance of your pipeline.
- Test the best pipeline on the test set and report various [evaluation metrics](https://scikit-learn.org/stable/modules/model_evaluation.html).
- Present your pipeline.
- Submit your predictions to Kaggle.

In [124]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import accuracy_score
# classifiers
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
# from xgboost import XGBClassifier

# Data Exploration and Preprocessing

In [125]:
# Read the Titanic passenger table from the course repository into a DataFrame.
DATASET_URL = 'https://raw.githubusercontent.com/m-mahdavi/teaching/refs/heads/main/datasets/titanic.csv'
dataset = pd.read_csv(DATASET_URL)
# dataset.head(10)

In [126]:
# dataset.describe()

In [127]:
# checking missing values
# dataset.isnull().sum()

In [128]:
# # Filling missing values in 'Age' with the median
# dataset['Age'].fillna(dataset['Age'].median(), inplace=True)

# # Filling missing values in 'Embarked' with the mode
# dataset['Embarked'].fillna(dataset['Embarked'].mode()[0], inplace=True)

# # Impute missing values in 'Cabin' with 'Unknown'
# dataset['Cabin'].fillna('Unknown', inplace=True)

# # Verifying if there are any remaining missing values
# dataset.isnull().sum()


# Feature Engineering

In [129]:
# Derive additional model features from the raw passenger columns.
# All new columns are added to `dataset` under the same names as before.

# 1. Title extraction: capture the run of letters between a space and a '.' in Name.
#    The pattern is a raw string so that '\.' reaches the regex engine as an escaped
#    dot instead of being an invalid Python string escape (a SyntaxWarning on recent
#    Python versions).
dataset['Title'] = dataset['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

#    Group the listed titles under 'Rare' and map spelling variants onto the
#    corresponding common title.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_aliases = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
dataset['Title'] = dataset['Title'].replace(rare_titles, 'Rare').replace(title_aliases)

# 2. Family size: siblings/spouses + parents/children + the passenger themselves
dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

# 3. IsAlone: 1 when the passenger has no family aboard, otherwise 0
dataset['IsAlone'] = (dataset['FamilySize'] == 1).astype('int64')

# 4. Grouping passengers into categories
#    AgeBin uses right-closed intervals (0, 12], (12, 20], (20, 60], (60, 100];
#    rows with a missing Age get a missing AgeBin.
dataset['AgeBin'] = pd.cut(dataset['Age'], bins=[0, 12, 20, 60, 100], labels=['Child', 'Young Adult', 'Adult', 'Senior'])
#    FareBin splits Fare into quartiles.
#    NOTE(review): the quartile edges are computed on the full dataset, i.e. before
#    the train/test split, so test rows influence the bin boundaries. Consider
#    deriving the edges from the training split only.
dataset['FareBin'] = pd.qcut(dataset['Fare'], 4, labels=['Low', 'Medium', 'High', 'Very High'])
#    CabinDeck is the first character of Cabin; 'U' marks a missing cabin.
dataset['CabinDeck'] = dataset['Cabin'].str[0].fillna('U')


# Feature Scaling

In [130]:
# 1. Defining features and target
# 1. Model inputs, grouped by how they are preprocessed, and the label column
numerical_features = ['Pclass', 'Age', 'Fare', 'FamilySize', 'IsAlone']
categorical_features = ['Sex', 'Embarked', 'Title', 'AgeBin', 'FareBin', 'CabinDeck']
target = 'Survived'

# 2. Split the data into train and test sets (default split size, fixed seed)
feature_columns = [*numerical_features, *categorical_features]
X = dataset[feature_columns]
y = dataset[target]
X_train_data, X_test_data, y_train, y_test = train_test_split(X, y, random_state=42)

# 3. Preprocessing pipelines
# Numeric columns: fill gaps from the 5 nearest neighbours, then standardise.
numerical_steps = [
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', StandardScaler()),
]
numerical_pipeline = Pipeline(steps=numerical_steps)

# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='first')),
]
categorical_pipeline = Pipeline(steps=categorical_steps)

# Route each column group through its own pipeline.
column_transformers = [
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)


# Data Modelling

In [None]:
# Candidate classifiers and the hyperparameter values to search for each one.
# Every entry maps a display name to:
#   'model'      - an unfitted estimator, used as the 'classifier' pipeline step
#   'param_dist' - search space; keys carry the 'classifier__' prefix so they are
#                  routed to that pipeline step
models = {
    'Logistic Regression': {
        'model': LogisticRegression(random_state=42),
        'param_dist': {
            'classifier__C': [0.1, 1, 10]
        }
    },
    'Decision Tree': {
        'model': DecisionTreeClassifier(random_state=42),
        'param_dist': {
            'classifier__max_depth': [3, 5, 7],
            'classifier__min_samples_split': [2, 5, 10]
        }
    },
    'Random Forest': {
        'model': RandomForestClassifier(random_state=42),
        'param_dist': {
            'classifier__n_estimators': [50, 100],
            'classifier__max_depth': [3, 5, 7],
            'classifier__min_samples_split': [5, 10],
            'classifier__min_samples_leaf': [1, 2, 4],
            # 'auto' is intentionally absent: it was an alias of 'sqrt' for
            # classifiers, deprecated in scikit-learn 1.1 and removed in 1.3,
            # where it makes every candidate that samples it fail to fit.
            'classifier__max_features': ['sqrt', 'log2']
        }
    },
    'Support Vector Machine': {
        'model': SVC(random_state=42),
        'param_dist': {
            'classifier__C': [0.1, 1, 10],
            'classifier__kernel': ['linear', 'rbf']
        }
    },
    'K-Nearest Neighbors': {
        'model': KNeighborsClassifier(),
        'param_dist': {
            'classifier__n_neighbors': [3, 5, 7],
            'classifier__weights': ['uniform', 'distance']
        }
    },
    'Naive Bayes': {
        # No hyperparameters are tuned; the search evaluates the default model once.
        'model': GaussianNB(),
        'param_dist': {}
    }
    # Optional (requires the xgboost import above to be enabled); parameter names
    # need the 'classifier__' prefix like every other entry:
    # 'XGBoost': {
    #     'model': XGBClassifier(random_state=42, eval_metric='logloss'),
    #     'param_dist': {
    #         'classifier__n_estimators': [100, 200],
    #         'classifier__learning_rate': [0.01, 0.1],
    #         'classifier__max_depth': [3, 5]
    #     }
    # }
}

# Run a cross-validated randomized search for each candidate model.
#
# The winning model is chosen by its mean cross-validation accuracy on the
# training split. Test-set accuracy is still computed and printed for every
# model, but it is not used for selection: picking the winner by test accuracy
# would make the reported test figure optimistically biased.
max_search_iter = 50

best_model = None
best_model_name = None          # initialised so the final print cannot hit an undefined name
best_accuracy = 0               # test accuracy of the selected model
best_cv_score = float('-inf')   # mean CV accuracy of the selected model

for model_name, model_data in models.items():

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model_data['model'])
    ])

    # Cap n_iter at the number of distinct combinations when the search space is
    # made of finite lists; asking for more only triggers a warning before the
    # search falls back to the full grid. Spaces containing distributions
    # (objects with .rvs) have no finite size, so they keep the full budget.
    param_dist = model_data['param_dist']
    if any(hasattr(values, 'rvs') for values in param_dist.values()):
        n_iter = max_search_iter
    else:
        n_candidates = 1
        for values in param_dist.values():
            n_candidates *= len(values)
        n_iter = min(max_search_iter, n_candidates)

    random_search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param_dist,
        n_iter=n_iter,
        scoring='accuracy',
        cv=5,
        random_state=42,
        n_jobs=-1
    )
    random_search.fit(X_train_data, y_train)

    # Mean CV accuracy of the best candidate: the model-selection criterion
    cv_score = random_search.best_score_

    # Test-set accuracy: reported only, never used for selection
    y_pred = random_search.predict(X_test_data)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"{model_name} Accuracy (CV): {cv_score}")
    print(f"{model_name} Accuracy (Test): {accuracy}")
    print(f"{model_name} Best Hyperparameters: {random_search.best_params_}\n")

    if cv_score > best_cv_score:
        best_cv_score = cv_score
        best_accuracy = accuracy
        best_model = random_search.best_estimator_
        best_model_name = model_name

# Print the selected model, its test accuracy, and the CV accuracy it was chosen on
print(f"The best model is: {best_model_name} with accuracy: {best_accuracy * 100:.2f}%")
print(f"Selected by cross-validation accuracy: {best_cv_score * 100:.2f}%")



Logistic Regression Accuracy (Test): 0.8116591928251121
Logistic Regression Best Hyperparameters: {'classifier__C': 1}





Decision Tree Accuracy (Test): 0.820627802690583
Decision Tree Best Hyperparameters: {'classifier__min_samples_split': 2, 'classifier__max_depth': 3}

