In [6]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import numpy as np
import warnings
import time

warnings.filterwarnings('ignore')
start_time = time.time()
print("--- STARTING MODEL EXECUTION ---")


# Load data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
print("Stage 1: Data Loaded.")

# Combine data for consistent imputation across sets
full_data = pd.concat([train_data.drop('Survived', axis=1), test_data], ignore_index=True)
train_len = len(train_data)
Y_train = train_data['Survived']

# --- Imputation ---
full_data['Age'].fillna(full_data['Age'].median(), inplace=True)
full_data['Fare'].fillna(full_data['Fare'].median(), inplace=True)
full_data['Embarked'].fillna(full_data['Embarked'].mode()[0], inplace=True)

# --- Simple Feature Engineering (Family Size) ---
full_data['FamilySize'] = full_data['SibSp'] + full_data['Parch']
full_data['IsAlone'] = (full_data['FamilySize'] == 0).astype(int)

# Drop irrelevant features
X_full = full_data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

# Separate back into training and test sets
X_train = X_full.iloc[:train_len]
X_test = X_full.iloc[train_len:]
print("Stage 2: Feature Engineering and Imputation Complete.")


# Define column types for the ColumnTransformer
numerical_features = ['Age', 'Fare', 'FamilySize', 'SibSp', 'Parch']
categorical_features = ['Pclass', 'Sex', 'Embarked', 'IsAlone']

# Create preprocessor for automatic scaling and encoding
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'
)

# Create the full pipeline: Preprocessing -> SVM Classifier
svm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', SVC(random_state=42))
])

# Refined parameter grid
param_grid_refined = {
    'classifier__kernel': ['rbf', 'poly'],
    'classifier__C': [0.5, 1, 2, 5],
    'classifier__gamma': [0.05, 0.1, 0.2, 0.5]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform the Grid Search
grid_search_refined = GridSearchCV(
    svm_pipeline,
    param_grid_refined,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
    verbose=0
)

grid_search_refined.fit(X_train, Y_train)
print(" Stage 3: Grid Search CV Completed.")

# --- Evaluation Prints ---
best_model_refined = grid_search_refined.best_estimator_
Y_train_pred = best_model_refined.predict(X_train)

# Calculate metrics on the training set
train_accuracy = accuracy_score(Y_train, Y_train_pred)
train_f1 = f1_score(Y_train, Y_train_pred)
conf_matrix = confusion_matrix(Y_train, Y_train_pred)

print("\n" + "="*50)
print(f"BEST MODEL (Train Set) METRICS:")
print(f"  Accuracy: {train_accuracy:.4f}")
print(f"  F1 Score: {train_f1:.4f}")
print("  Confusion Matrix:")
print(conf_matrix)
print("="*50)

# Generate test predictions with the best model
Y_test_pred_refined = best_model_refined.predict(X_test)

# Create and save the new submission file
submission_refined = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': Y_test_pred_refined
})

submission_file_name = 'svm_gridsearch_refined_submission.csv'
submission_refined.to_csv(submission_file_name, index=False)

end_time = time.time()
print(f" Stage 4: Submission File Generated: {submission_file_name}")
print(f"--- EXECUTION FINISHED ")

--- STARTING MODEL EXECUTION ---
Stage 1: Data Loaded.
Stage 2: Feature Engineering and Imputation Complete.
 Stage 3: Grid Search CV Completed.

BEST MODEL (Train Set) METRICS:
  Accuracy: 0.8328
  F1 Score: 0.7704
  Confusion Matrix:
[[492  57]
 [ 92 250]]
 Stage 4: Submission File Generated: svm_gridsearch_refined_submission.csv
--- EXECUTION FINISHED 
