In [1]:
# Question 2: Feature Engineering & Hyperparameter Tuning on the Titanic Dataset

# Step 1: Load the Titanic dataset (assume you have a file named titanic.csv).
# Step 2: Create features and handle missing values.
# Step 3: Train a pipeline using a Random Forest with GridSearchCV.
# Step 4: Evaluate the tuned model with cross-validation.
# Step 1: Load the Titanic dataset (a small inline Titanic-like sample stands in for titanic.csv here)

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Small in-memory stand-in for the Titanic data: ten passengers, one of whom
# (position 5) has no recorded Age.
data = {
    'Pclass': [3, 1, 3, 1, 3, 3, 2, 1, 3, 2],
    'Sex': [
        'male', 'female', 'female', 'female', 'male',
        'male', 'female', 'male', 'female', 'female',
    ],
    'Age': [22, 38, 26, 35, 35, None, 27, 54, 2, 30],
    'SibSp': [1, 1, 0, 1, 0, 0, 0, 0, 4, 0],
    'Parch': [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
    'Fare': [
        7.25, 71.2833, 7.925, 53.1, 8.05,
        8.4583, 13.0, 51.8625, 21.075, 13.0,
    ],
    'Embarked': ['S', 'C', 'S', 'S', 'S', 'Q', 'S', 'S', 'C', 'S'],
    'Survived': [0, 1, 1, 1, 0, 0, 1, 0, 1, 1],
}

# Tabular view of the sample
df = pd.DataFrame(data)

# Label vector is the 'Survived' column; every other column is a predictor
y = df['Survived']
X = df.drop(columns='Survived')

# Columns grouped by the preprocessing they receive
numerical_cols = ['Age', 'SibSp', 'Parch', 'Fare', 'Pclass']
categorical_cols = ['Sex', 'Embarked']

# Preprocessing
# Numeric columns: fill gaps with the column median, then standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; handle_unknown='ignore' keeps transform from failing on categories
# that were absent when the encoder was fitted.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Model pipeline: preprocessing followed by a random forest whose seed is
# fixed so repeated runs build the same trees.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Hyperparameter tuning
# Keys use the "<step name>__<parameter>" form so the values reach the
# 'classifier' step inside the pipeline.
param_grid = {
    'classifier__n_estimators': [10, 50],
    'classifier__max_depth': [3, 5, None],
}

# Exhaustive search over the 2 x 3 grid, ranking candidates by 3-fold accuracy.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
)
grid_search.fit(X, y)

# Cross-validation evaluation
# Nested cross-validation: the whole grid search is re-run inside each of the
# three outer folds, so every held-out fold is scored by a model whose
# hyperparameters were chosen without seeing that fold. Scoring
# grid_search.best_estimator_ on the same X, y and the same cv=3 that selected
# it would largely replay the selection scores and report an optimistically
# biased accuracy.
# NOTE: "Best Parameters" below comes from the search fitted on all of X, y;
# the searches run inside the outer folds may settle on different values.
cv_scores = cross_val_score(grid_search, X, y, cv=3, scoring='accuracy')

print("Best Parameters:", grid_search.best_params_)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy: {:.2f}".format(cv_scores.mean()))





Best Parameters: {'classifier__max_depth': 3, 'classifier__n_estimators': 50}
Cross-Validation Scores: [1. 1. 1.]
Mean CV Accuracy: 1.00
