In [4]:
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Load the raw passenger table.
file_path = "titanic.csv"
df = pd.read_csv(file_path)

# Drop the columns that are not used as model inputs.
df = df.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'])

# Separate the target from the predictors.
X = df.drop('Survived', axis=1)
y = df['Survived']

# Split the predictors by dtype. Selecting "numeric" vs. "everything else"
# makes the two lists exhaustive, so no column is left out of both lists
# and then dropped by the ColumnTransformer below.
num_features = X.select_dtypes(include='number').columns.tolist()
cat_features = X.select_dtypes(exclude='number').columns.tolist()

# Missing values are deliberately NOT filled on the full DataFrame.
# Computing the median / mode before the train-test split (and before
# cross-validation) would let test and validation rows influence the
# imputed values. Putting the imputers inside the pipeline means they are
# fitted only on the data the model is being trained on.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first')),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_features),
        ('cat', categorical_transformer, cat_features),
    ]
)

# Full model: preprocessing followed by a random forest with a fixed seed.
# The step name 'classifier' is what the 'classifier__*' grid keys refer to.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Hold out 20% of the rows for the final evaluation; stratifying on `y`
# keeps the class ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Deliberately small search space so the grid search finishes quickly.
# Keys are routed to the pipeline step named 'classifier'.
param_grid = dict(
    classifier__n_estimators=[50, 100],
    classifier__max_depth=[5, 10],
    classifier__min_samples_split=[2, 5],
    classifier__min_samples_leaf=[1, 2],
    classifier__bootstrap=[True],
)

# Only 3 CV folds (kept low for speed); candidates are ranked by F1 and
# evaluated in parallel (n_jobs=-1).
grid_search = GridSearchCV(pipeline, param_grid, scoring='f1', cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Report the parameter combination the grid search selected.
print("\n Best Hyperparameters:")
print(grid_search.best_params_)

# Predict on the held-out rows with the estimator the search exposes as best.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Score the same predictions with all four metrics in one pass.
acc, prec, rec, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("\n Evaluation of Tuned Model:")
print(f"Accuracy: {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall: {rec:.4f}")
print(f"F1-score: {f1:.4f}")

# Baseline for comparison: the same preprocessing followed by a
# RandomForestClassifier with default hyperparameters (same seed).
# The preprocessor is cloned so that fitting the baseline does not refit,
# in place, the ColumnTransformer instance that `pipeline` also holds.
default_pipeline = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('classifier', RandomForestClassifier(random_state=42)),
])
default_pipeline.fit(X_train, y_train)
y_pred_default = default_pipeline.predict(X_test)

# Accuracy is computed for inspection; only the F1 scores are printed below.
acc_def = accuracy_score(y_test, y_pred_default)
f1_def = f1_score(y_test, y_pred_default)

print("\n Default Model vs Tuned Model:")
print(f"Default F1-score: {f1_def:.4f}, Tuned F1-score: {f1:.4f}")



 Best Hyperparameters:
{'classifier__bootstrap': True, 'classifier__max_depth': 10, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 50}

 Evaluation of Tuned Model:
Accuracy: 0.7989
Precision: 0.7895
Recall: 0.6522
F1-score: 0.7143

 Default Model vs Tuned Model:
Default F1-score: 0.7519, Tuned F1-score: 0.7143


# Task 16: Hyperparameter Tuning using GridSearchCV (Titanic Dataset)

**Tools:** Python, Pandas, Scikit-learn  

## Steps:
1. **Load Dataset:** Import CSV and inspect data.  
2. **Preprocessing:**  
   - Drop irrelevant columns (`Name`, `Ticket`, `Cabin`, `PassengerId`)  
   - Fill missing values (`Age` median, `Embarked` mode)  
   - Encode categorical features using `OneHotEncoder`  
   - Scale numerical features using `StandardScaler`  
3. **Train-Test Split:** 80-20 split, stratified on `Survived`.  
4. **Pipeline Creation:**  
   - Use `ColumnTransformer` for preprocessing  
   - Combine with `RandomForestClassifier` in a `Pipeline`  
5. **Hyperparameter Tuning:**  
   - Define parameter grid for Random Forest  
   - Apply `GridSearchCV` with 3-fold cross-validation (reduced for faster execution), scored by F1  
6. **Best Parameters & Evaluation:**  
   - Extract best hyperparameters  
   - Predict and evaluate on test set (Accuracy, Precision, Recall, F1-score)  
7. **Comparison:** Compare tuned model vs default model performance  

## Deliverables:
- Notebook with GridSearchCV implementation  
- Best parameters output  
- Performance comparison (default vs tuned model)
