# In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np
import pandas as pd
from scipy.stats import loguniform
import time

# NOTE: This code assumes the following variables have been correctly defined 
# and populated from previous steps:
# X_train_fs, X_test_fs (Feature-selected scaled features)
# y_train, y_test (Target variable arrays, already ravelled and cast to int)

# =========================================================================
# 1. SETUP AND BASELINE PERFORMANCE
# =========================================================================

print("--- 1. Hyperparameter Tuning for Best Model (SVM) ---")

# --- Label preparation ----------------------------------------------------
# Flatten both target vectors to 1-D, then force every value into {0, 1}:
# round to the nearest integer, clip to the [0, 1] range, cast to int.
# NOTE(review): this coerces any out-of-range label instead of rejecting it;
# confirm upstream data really is binary.
y_train = np.ravel(y_train)
y_test = np.ravel(y_test)

y_train_clean = np.clip(np.round(y_train), 0, 1).astype(int)
y_test_clean = np.clip(np.round(y_test), 0, 1).astype(int)

# --- Baseline model -------------------------------------------------------
# Linear-kernel SVM with default C. probability=True is required so that
# predict_proba is available after fitting.
baseline_model = SVC(kernel='linear', random_state=42, probability=True)
baseline_model.fit(X_train_fs, y_train_clean)

# Hard class predictions feed the threshold-based metrics below.
y_pred_baseline = baseline_model.predict(X_test_fs)

# predict_proba returns one column per class; roc_auc_score for a binary
# problem wants a 1-D score for the positive class, i.e. column index 1.
baseline_proba = baseline_model.predict_proba(X_test_fs)
baseline_scores_p1 = baseline_proba[:, 1]
baseline_auc = roc_auc_score(y_test_clean, baseline_scores_p1)

# --- Baseline metrics -----------------------------------------------------
# Precision / recall / F1 all use the same support-weighted averaging, so
# they are filled in from a single table of scorer functions.
baseline_metrics = {'Accuracy': accuracy_score(y_test_clean, y_pred_baseline)}
baseline_metrics.update({
    metric_name: metric_fn(y_test_clean, y_pred_baseline, average='weighted')
    for metric_name, metric_fn in (
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1-Score', f1_score),
    )
})
baseline_metrics['AUC'] = baseline_auc

print(f"Baseline SVM AUC Score: {baseline_auc:.4f}")


# =========================================================================
# 2. RANDOMIZED SEARCH (Quick Exploration)
# =========================================================================

# Estimator that both searches clone and refit.
# NOTE(review): probability=True makes every fit more expensive; it is kept
# here because the refitted best estimator is later asked for predict_proba.
svc_tune = SVC(random_state=42, probability=True)

# Search space: C (regularization strength) and gamma (kernel coefficient)
# are drawn on a log scale; both the 'linear' and the more flexible 'rbf'
# kernel are candidates. gamma has no effect on the linear kernel, so for
# those draws only C matters.
param_dist = dict(
    C=loguniform(1e-1, 1e2),         # 0.1 .. 100
    kernel=['linear', 'rbf'],
    gamma=loguniform(1e-4, 1e-1),    # 0.0001 .. 0.1
)

# 50 sampled candidates, each scored by ROC AUC under 5-fold CV, using all
# available cores. random_state pins the sampled candidates.
random_search = RandomizedSearchCV(
    svc_tune,
    param_dist,
    n_iter=50,
    scoring='roc_auc',
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=42,
)

print("\n--- 2. Starting RandomizedSearchCV (50 iterations) ---")
start_time_rand = time.time()
random_search.fit(X_train_fs, y_train_clean)  # labels cleaned in section 1
end_time_rand = time.time()

print(f"Randomized Search completed in {end_time_rand - start_time_rand:.2f} seconds.")
print(f"Best AUC from Randomized Search: {random_search.best_score_:.4f}")
print(f"Best Parameters: {random_search.best_params_}")

# Kept under its own name: the grid search below centres its ranges on it.
best_params_rand = random_search.best_params_


# =========================================================================
# 3. GRID SEARCH (Exhaustive Optimization)
# =========================================================================

# The grid is centred on the randomized-search winner: the winning kernel is
# fixed and C (plus gamma for rbf) is swept linearly from 0.5x to 1.5x of the
# best value found.
C_best = best_params_rand['C']

if best_params_rand['kernel'] != 'linear':
    # rbf: 3 x 3 grid over C and gamma.
    gamma_best = best_params_rand['gamma']
    grid_param = {
        'C': np.linspace(C_best * 0.5, C_best * 1.5, 3),
        'gamma': np.linspace(gamma_best * 0.5, gamma_best * 1.5, 3),
        'kernel': ['rbf'],
    }
else:
    # linear: gamma is irrelevant, so spend all 5 grid points on C.
    grid_param = {
        'C': np.linspace(C_best * 0.5, C_best * 1.5, 5),
        'kernel': ['linear'],
    }

# Same estimator, scorer and CV scheme as the randomized search so the two
# best_score_ values are comparable.
grid_search = GridSearchCV(
    svc_tune,
    grid_param,
    scoring='roc_auc',
    cv=5,
    verbose=1,
    n_jobs=-1,
)

print("\n--- 3. Starting GridSearchCV (Exhaustive Search) ---")
start_time_grid = time.time()
grid_search.fit(X_train_fs, y_train_clean)  # labels cleaned in section 1
end_time_grid = time.time()

print(f"Grid Search completed in {end_time_grid - start_time_grid:.2f} seconds.")
print(f"Best AUC from GridSearchCV: {grid_search.best_score_:.4f}")
print(f"Best Parameters: {grid_search.best_params_}")

# best_estimator_ is the winning configuration refit on the full training set.
best_svc_model = grid_search.best_estimator_


# =========================================================================
# 4. FINAL EVALUATION AND COMPARISON
# =========================================================================

# Hard class predictions of the tuned model on the held-out test set.
y_pred_optimized = best_svc_model.predict(X_test_fs)

# predict_proba returns one column per class (ordered as in classes_). For a
# binary problem roc_auc_score needs a 1-D positive-class score, which is
# column index 1 when the labels are {0, 1}.
optimized_proba = best_svc_model.predict_proba(X_test_fs)
optimized_scores_p1 = optimized_proba[:, 1]

optimized_auc = roc_auc_score(
    y_test_clean,
    optimized_scores_p1
)

# Same metric set (and same support-weighted averaging) as the baseline so
# the two rows of the comparison table line up.
optimized_metrics = {
    'Accuracy': accuracy_score(y_test_clean, y_pred_optimized),
    'Precision': precision_score(y_test_clean, y_pred_optimized, average='weighted'),
    'Recall': recall_score(y_test_clean, y_pred_optimized, average='weighted'),
    'F1-Score': f1_score(y_test_clean, y_pred_optimized, average='weighted'),
    'AUC': optimized_auc
}

# One row per model, one column per metric.
metrics_df = pd.DataFrame({
    'Baseline SVM (Linear)': baseline_metrics,
    'Optimized SVM': optimized_metrics
}).T

# Display the final comparison.
# Format column by column so the metric names stay as column headers; a
# row-wise apply that returns an un-indexed Series would relabel the columns
# 0..4 and drop the metric names from the printed table.
print("\n--- 4. Final Hyperparameter Tuning Comparison ---")
print(metrics_df.apply(lambda col: col.map('{:.4f}'.format)))

print(f"\nFinal Best Model (SVM) Hyperparameters: {grid_search.best_params_}")

# Only claim an improvement when the test AUC actually went up.
if optimized_auc > baseline_auc:
    print(f"Optimization improved AUC from {baseline_auc:.4f} to {optimized_auc:.4f}")
else:
    print(
        f"Optimization did not improve AUC: baseline {baseline_auc:.4f}, "
        f"optimized {optimized_auc:.4f}"
    )


# In [None]:
import joblib
import numpy as np
import os
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.svm import SVC

# =========================================================================
# ASSUMPTIONS & ARTIFACTS
# =========================================================================

# This script assumes 'best_svc_model' has been generated by the
# hyperparameter tuning step (section 2.6).

try:
    # ---------------------------------------------------------------------
    # PRECONDITION: 'best_svc_model' must exist AND be fitted.
    # Referencing the name raises NameError when the tuning step has not
    # been run in this session, and check_is_fitted raises NotFittedError
    # for an untrained estimator. Both are reported by the handler below.
    # No placeholder model is fabricated: pickling an unfitted estimator as
    # the "final model" would only fail later, at prediction time.
    # ---------------------------------------------------------------------
    from sklearn.utils.validation import check_is_fitted  # local: only needed here

    check_is_fitted(best_svc_model)

    # =========================================================================
    # 1. CREATE PIPELINE
    # =========================================================================

    # Single-step pipeline wrapping the already-fitted SVM. Scaling and
    # feature selection are NOT part of it, so whoever loads the file must
    # apply the same preprocessing before calling predict.
    final_pipeline = Pipeline([
        ('optimized_svm', best_svc_model)
    ])

    print("Pipeline constructed (Model only).")

    # =========================================================================
    # 2. EXPORT THE TRAINED MODEL (Pipeline)
    # =========================================================================

    MODEL_FILENAME = 'models/final_model.pkl'
    MODEL_DIR = os.path.dirname(MODEL_FILENAME)

    # Create the target directory when it is missing. The MODEL_DIR guard
    # covers a bare filename (empty dirname); exist_ok avoids failing if the
    # directory appears between the check and the call.
    if MODEL_DIR and not os.path.isdir(MODEL_DIR):
        os.makedirs(MODEL_DIR, exist_ok=True)
        print(f"Created directory: {MODEL_DIR}")

    # Serialize the entire pipeline object with joblib.
    joblib.dump(final_pipeline, MODEL_FILENAME)

    print("-" * 50)
    print(f"✔️ Model Exported successfully to: {MODEL_FILENAME}")
    print(f"Model Type: {type(final_pipeline).__name__} containing {type(best_svc_model).__name__}")
    print("-" * 50)

except Exception as e:
    # Top-level boundary of the export cell: report the failure instead of
    # aborting the notebook run.
    print(f"An error occurred during model export: {e}")
    print("If running this as a standalone script, ensure 'best_svc_model' is defined or loaded.")
