In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, cross_validate
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
import pickle

In [None]:
# Locations of the train/test CSV files, relative to this notebook.
TRAIN_CSV = '../Dataset/train_cleaned.csv'
TEST_CSV = '../Dataset/test_cleaned.csv'

train_df = pd.read_csv(TRAIN_CSV)

# Every column except the 'satisfaction' target is used as a feature.
X = train_df.drop(columns=['satisfaction'])
y = train_df['satisfaction']
# Only these columns are standardized; all remaining columns pass through unchanged.
num_cols = ['Age', 'Flight Distance', 'Departure Delay in Minutes']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols)
    ],
    remainder='passthrough'
)

# probability=True makes SVC expose predict_proba, which the 'neg_log_loss'
# scorer used during cross-validation requires (it also makes fitting slower).
svm_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', SVC(probability=True, random_state=42))
])

# Shuffled 5-fold split with a fixed seed so the folds are reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

C_values = np.logspace(-2, 2, 5)  # [0.01, 0.1, 1, 10, 100]
# gamma only affects the 'rbf', 'poly' and 'sigmoid' kernels; with the linear
# kernel below it has no effect, so a single placeholder value is enough.
gamma_values = ['scale']
kernel = 'linear'  # Linear kernel

# Per-parameter-combination results, filled in by the grid-search loop.
mean_val_accuracies = []
mean_val_losses = []
mean_train_accuracies = []
mean_train_losses = []
params = []

In [None]:
# Start from empty accumulators every time this cell runs. Without the reset,
# re-running the cell would append a second copy of every result to the lists
# created in the setup cell, corrupting the best-parameter lookup and the plots.
mean_val_accuracies = []
mean_val_losses = []
mean_train_accuracies = []
mean_train_losses = []
params = []

# Manual grid search: cross-validate the pipeline once per (C, gamma) pair and
# record the fold-averaged accuracy and log loss on both train and validation folds.
for C in C_values:
    for gamma in gamma_values:
        print(f"\nTraining with C={C}, gamma={gamma}...")

        svm_pipeline.set_params(classifier__C=C,
                               classifier__gamma=gamma,
                               classifier__kernel=kernel)

        scores = cross_validate(svm_pipeline, X, y, cv=kf,
                              scoring=['accuracy', 'neg_log_loss'],
                              return_train_score=True)

        # 'neg_log_loss' is reported negated (higher is better), so flip the
        # sign to get an ordinary log loss.
        val_accuracy = scores['test_accuracy'].mean()
        val_loss = -scores['test_neg_log_loss'].mean()
        train_accuracy = scores['train_accuracy'].mean()
        train_loss = -scores['train_neg_log_loss'].mean()

        mean_val_accuracies.append(val_accuracy)
        mean_val_losses.append(val_loss)
        mean_train_accuracies.append(train_accuracy)
        mean_train_losses.append(train_loss)
        params.append((C, gamma))

        print(f"Validation accuracy: {val_accuracy:.4f}, Train accuracy: {train_accuracy:.4f}")

# Pick the parameter combination with the highest mean validation accuracy
# (np.argmax returns the first index when several combinations tie).
best_idx = np.argmax(mean_val_accuracies)
best_C, best_gamma = params[best_idx]
best_val_accuracy = mean_val_accuracies[best_idx]

print(f"\nBest parameters: C={best_C}, gamma={best_gamma}")
print(f"Best validation accuracy: {best_val_accuracy:.4f}")

# 2x2 overview of the grid-search results: one panel per recorded metric,
# each plotted against the parameter combinations in the order they were tried.
fig, axs = plt.subplots(2, 2, figsize=(16, 12))
param_labels = [f"C={combo[0]}, γ={combo[1]}" for combo in params]
tick_positions = range(len(params))

# (axes, series, line colour, y-axis label, title), listed in drawing order.
panel_specs = [
    (axs[0, 0], mean_val_losses, 'blue',
     'Validation Log Loss', 'SVM: Parameters vs Validation Loss'),
    (axs[0, 1], mean_train_losses, 'blue',
     'Training Log Loss', 'SVM: Parameters vs Training Loss'),
    (axs[1, 0], mean_train_accuracies, 'green',
     'Training Accuracy', 'SVM: Parameters vs Training Accuracy'),
    (axs[1, 1], mean_val_accuracies, 'red',
     'Validation Accuracy', 'SVM: Parameters vs Validation Accuracy'),
]

for panel_ax, series, line_colour, y_label, panel_title in panel_specs:
    panel_ax.plot(tick_positions, series, marker='o', color=line_colour)
    panel_ax.set_xticks(tick_positions)
    panel_ax.set_xticklabels(param_labels, rotation=45)
    panel_ax.set_ylabel(y_label)
    panel_ax.set_title(panel_title)
    panel_ax.grid(True)

plt.tight_layout()
plt.show()



Training with C=0.01, gamma=scale...


In [None]:

# Rebuild the pipeline with the selected hyper-parameters and fit it on the
# complete training set (no hold-out this time).
final_classifier = SVC(C=best_C, gamma=best_gamma, kernel=kernel,
                       probability=True, random_state=42)
final_svm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', final_classifier),
])
final_svm_pipeline.fit(X, y)

# Metrics on the same data the model was fitted on: useful as a reference
# point next to the test metrics, not as an estimate of generalization.
y_pred_train = final_svm_pipeline.predict(X)
print("\nTraining Metrics:", "=================", "Classification Report:", sep="\n")
print(classification_report(y, y_pred_train))

# Confusion matrix heatmap; sklearn puts true labels on rows and predictions on columns.
cm_train = confusion_matrix(y, y_pred_train)
fig_train, ax_train = plt.subplots(figsize=(8, 6))
sns.heatmap(cm_train, annot=True, fmt='d', cmap='Blues', ax=ax_train)
ax_train.set_title('Training Data Confusion Matrix')
ax_train.set_xlabel('Predicted Labels')
ax_train.set_ylabel('True Labels')
plt.show()

In [None]:
# Load the test CSV and split it into features and the 'satisfaction' target,
# mirroring how the training frame was split.
test_df = pd.read_csv(TEST_CSV)
y_test = test_df['satisfaction']
X_test = test_df.drop(columns=['satisfaction'])

# Score the fitted pipeline on the test data.
y_pred_test = final_svm_pipeline.predict(X_test)
print("\nTest Metrics:", "=============", "Classification Report:", sep="\n")
print(classification_report(y_test, y_pred_test))

# Confusion matrix heatmap; sklearn puts true labels on rows and predictions on columns.
cm_test = confusion_matrix(y_test, y_pred_test)
fig_test, ax_test = plt.subplots(figsize=(8, 6))
sns.heatmap(cm_test, annot=True, fmt='d', cmap='Blues', ax=ax_test)
ax_test.set_title('Test Data Confusion Matrix')
ax_test.set_xlabel('Predicted Labels')
ax_test.set_ylabel('True Labels')
plt.show()

In [None]:
# Persist the fitted pipeline (preprocessing + classifier) to disk.
# The context manager closes the file handle even if pickling raises, instead
# of leaving that to garbage collection.
# NOTE: only unpickle this file from a trusted location; pickle.load can
# execute arbitrary code.
with open('../Models/svm_model.pkl', 'wb') as model_file:
    pickle.dump(final_svm_pipeline, model_file)