# **TASK 3: Train a Model with Cross Validation**

In [None]:
# Step 1: Import libraries
import pandas as pd
from sklearn.model_selection import cross_validate, KFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Step 2: Load the Titanic Dataset
# This cell only mounts Google Drive at /content/drive; the CSV itself is read
# in the next cell from a path underneath that mount point.
# NOTE(review): google.colab presumably exists only inside a Colab runtime —
# this cell will not run elsewhere without replacing the mount step.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the training CSV from the mounted Drive folder into a DataFrame.
train_csv_path = '/content/drive/MyDrive/Colab Notebooks/train_data.csv'
df = pd.read_csv(train_csv_path)

In [None]:
# List the column labels so the target and feature columns can be identified
# by name in the following steps.
df.columns

Index(['Unnamed: 0', 'PassengerId', 'Survived', 'Sex', 'Age', 'Fare',
       'Pclass_1', 'Pclass_2', 'Pclass_3', 'Family_size', 'Title_1', 'Title_2',
       'Title_3', 'Title_4', 'Emb_1', 'Emb_2', 'Emb_3'],
      dtype='object')

In [None]:
# Step 3: Select features and target variable
# The label is the 'Survived' column. Selecting "the last column" by position
# picked 'Emb_3' instead, with 'Survived' and the sibling 'Emb_1'/'Emb_2'
# dummies left in the features -- that leakage likely explains near-perfect
# scores. Select by name so column order cannot change the task.
TARGET_COL = "Survived"

# Row identifiers carry no signal about survival and let a tree memorise rows.
# 'Unnamed: 0' is presumably a saved index column -- TODO confirm.
ID_COLS = ["Unnamed: 0", "PassengerId"]

y = df[TARGET_COL]  # raises KeyError early if the label column is missing
# errors="ignore" keeps this cell working when an identifier column is absent.
X = df.drop(columns=[TARGET_COL, *ID_COLS], errors="ignore")

In [None]:
# Step 4: Identify numeric and categorical columns
# "number" matches every numeric width (int32, uint8, float32, ...), not only
# int64/float64, so narrower numeric columns are no longer left out of both
# lists. Columns of any other dtype (e.g. bool) still appear in neither list.
numeric_cols = X.select_dtypes(include="number").columns.tolist()
categorical_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()

In [None]:
# Step 5: Create preprocessing pipelines

# Numeric branch: fill gaps with the column median, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
cat_pipeline = Pipeline(steps=categorical_steps)

# Route each column group to its own branch.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, numeric_cols),
        ("cat", cat_pipeline, categorical_cols),
    ]
)

In [None]:
# Step 6: Build full pipeline with classifier
# Preprocessing and the classifier live in one Pipeline so both are fitted
# together on whatever data the pipeline is given.
model_pipeline = Pipeline([
    ("preprocess", preprocessor),
    # A fixed seed makes the tree reproducible: scikit-learn permutes features
    # at each split, so an unseeded tree can differ from run to run.
    ("model", DecisionTreeClassifier(random_state=42)),
])

In [None]:
# Step 7: Define k-fold cross-validation
# Five shuffled folds; the fixed seed keeps the fold assignment repeatable.
kf = KFold(
    random_state=42,
    shuffle=True,
    n_splits=5,
)

In [None]:
# Step 8: Perform cross-validation and collect scores
# The pipeline is evaluated once per fold of `kf`; only test-fold scores are
# kept (train scores are switched off).
metric_names = ["accuracy", "precision", "recall"]
scores = cross_validate(
    estimator=model_pipeline,
    X=X,
    y=y,
    scoring=metric_names,
    cv=kf,
    return_train_score=False,
)

In [None]:
# Step 9: Display average results
# Each metric array in `scores` holds one value per fold, so its length is the
# number of folds actually evaluated; deriving it keeps the header in sync
# with the splitter instead of repeating a hard-coded 5.
n_folds = len(scores['test_accuracy'])
print(f"\nCross-Validation Results ({n_folds}-fold):")
print(f"Accuracy : {scores['test_accuracy'].mean():.3f}")
print(f"Precision: {scores['test_precision'].mean():.3f}")
print(f"Recall   : {scores['test_recall'].mean():.3f}")


Cross-Validation Results (5-fold):
Accuracy : 0.996
Precision: 0.998
Recall   : 0.997
