# Gaze Guard - Model Training

## 1. Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# In the full project these helpers would live in a shared data/model utilities module.
# They are defined inline here, with simulated data, so the notebook is self-contained.
def load_simulated_gaze_data(n_samples=1000, n_features=2, random_state=42):
    """Build a synthetic, shuffled gaze-feature dataset.

    Two classes are drawn from uniform distributions: "looking" rows
    (label 1) have every feature in [0.3, 0.8) and "away" rows (label 0)
    have every feature in [0.0, 0.4), so the classes overlap on [0.3, 0.4).

    Args:
        n_samples: Total number of rows to generate. When odd, the extra
            row goes to the "away" class so that exactly ``n_samples``
            rows are returned.
        n_features: Number of feature columns per row.
        random_state: Seed for the function's private random generator.
            ``None`` seeds it unpredictably.

    Returns:
        A DataFrame with columns ``feature_1`` .. ``feature_<n_features>``
        plus an integer ``label`` column (1: looking, 0: away).

    Raises:
        ValueError: If ``n_samples`` is negative.
    """
    if n_samples < 0:
        raise ValueError(f"n_samples must be non-negative, got {n_samples}")

    # A private RandomState yields the same stream that np.random.seed()
    # followed by np.random.* calls would, without reseeding the
    # process-wide generator behind the caller's back.
    rng = np.random.RandomState(random_state)

    # Split so the two class sizes always add up to n_samples (odd totals
    # previously produced one row too few and crashed while shuffling).
    n_looking = n_samples // 2
    n_away = n_samples - n_looking

    # Simulated features (e.g., normalized eye aspect ratio, head pose angle)
    X_looking = rng.rand(n_looking, n_features) * 0.5 + 0.3  # uniform on [0.3, 0.8)
    X_away = rng.rand(n_away, n_features) * 0.4  # uniform on [0.0, 0.4)

    X = np.vstack((X_looking, X_away))
    # np.repeat keeps an integer dtype even when a class is empty.
    y = np.repeat([1, 0], [n_looking, n_away])  # 1: Looking, 0: Away

    # Shuffle rows and labels with the same permutation
    indices = np.arange(n_samples)
    rng.shuffle(indices)
    X = X[indices]
    y = y[indices]

    # Wrap in a DataFrame for easier downstream handling
    feature_names = [f'feature_{i+1}' for i in range(n_features)]
    df = pd.DataFrame(X, columns=feature_names)
    df['label'] = y

    print(f"Simulated data loaded: {len(df)} samples")
    print(f"Class distribution:\n{df['label'].value_counts()}")
    return df

def save_model(model, filename='gaze_classifier.joblib'):
    """Serialize a trained model under the ``models`` directory.

    Args:
        model: The fitted estimator (or any object joblib can serialize).
        filename: Output path relative to ``models``. It may contain
            subdirectories (e.g. ``'v2/clf.joblib'``); they are created
            as needed.
    """
    filepath = os.path.join('models', filename)

    # Create the file's actual parent directory rather than only 'models',
    # so a nested filename does not fail on a missing folder at dump time.
    parent_dir = os.path.dirname(filepath) or os.curdir
    os.makedirs(parent_dir, exist_ok=True)

    joblib.dump(model, filepath)
    print(f"Model saved to {filepath}")

def load_model(filename='gaze_classifier.joblib'):
    """Load a model previously stored under the ``models`` directory.

    Args:
        filename: Name of the file inside ``models``.

    Returns:
        The deserialized model, or ``None`` when the file does not exist.

    Note:
        ``joblib.load`` unpickles the file, so only load model files that
        come from a trusted source.
    """
    filepath = os.path.join('models', filename)

    # Guard clause: report the missing file and bail out early.
    if not os.path.exists(filepath):
        print(f"Model file not found at {filepath}")
        return None

    model = joblib.load(filepath)
    print(f"Model loaded from {filepath}")
    return model

## 2. Data Loading and Preparation

In [None]:
# Generate the simulated gaze dataset (2000 rows, 2 feature columns)
data_df = load_simulated_gaze_data(n_samples=2000, n_features=2)

# Separate the feature matrix (X) from the target vector (y)
X = data_df.drop(columns='label').to_numpy()
y = data_df['label'].to_numpy()

# Hold out 20% of the rows for testing; stratify so both splits keep the
# same class balance as the full dataset
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"\nTraining set shape: X={X_train.shape}, y={y_train.shape}")
print(f"Testing set shape: X={X_test.shape}, y={y_test.shape}")

## 3. Model Definition

In [None]:
# Model pipeline: feature standardization followed by an SVM classifier.
# Keeping both steps in one Pipeline means the scaler is fitted together with
# the classifier, so scaling is handled correctly inside cross-validation.
scaler_step = ('scaler', StandardScaler())
# probability=True is kept for potential future use of class probabilities
svm_step = ('svm', SVC(probability=True, random_state=42))
pipeline = Pipeline(steps=[scaler_step, svm_step])

## 4. Hyperparameter Tuning

In [None]:
# Define the parameter grid for GridSearchCV.
# gamma only affects the 'rbf' kernel here; the 'linear' kernel ignores it.
# Crossing gamma with 'linear' would therefore refit identical linear models
# four times per C value, so each kernel gets its own sub-grid
# (20 candidates instead of 32).
c_values = [0.1, 1, 10, 100]  # Regularization parameter
param_grid = [
    {
        'svm__kernel': ['rbf'],
        'svm__C': c_values,
        'svm__gamma': ['scale', 'auto', 0.1, 1],  # Kernel coefficient for 'rbf'
    },
    {
        'svm__kernel': ['linear'],
        'svm__C': c_values,
    },
]

# Setup GridSearchCV
# cv=5 means 5-fold cross-validation
# n_jobs=-1 uses all available CPU cores
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=1, scoring='accuracy')

# Perform the grid search on the training data
print("Starting Grid Search for hyperparameter tuning...")
grid_search.fit(X_train, y_train)

# Print the best parameters and best score found.
# NOTE: when a linear kernel wins, best_params_ has no 'svm__gamma' entry.
print(f"\nBest parameters found: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")

# Get the best estimator (the pipeline with the best parameters)
best_model = grid_search.best_estimator_

## 5. Model Training (Using Best Parameters)

In [None]:
# No separate fit step is needed: the grid search above already refit the
# winning configuration on the full training set, so 'best_model' can be
# evaluated as-is.
training_note = "Best model is already trained through GridSearchCV."
print(training_note)

## 6. Model Evaluation

In [None]:
# Predict labels for the held-out test set with the tuned pipeline
y_pred = best_model.predict(X_test)

# Overall fraction of correct predictions
accuracy = accuracy_score(y_test, y_pred)
print(f"\nTest Set Accuracy: {accuracy:.4f}")

# Per-class precision, recall and f1-score
# (names follow the label order: 0 -> Away, 1 -> Looking)
class_names = ['Away', 'Looking']
print("\nClassification Report:")
report = classification_report(y_test, y_pred, target_names=class_names)
print(report)

# Confusion matrix: rows are actual labels, columns are predicted labels
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

## 7. Results Visualization

In [None]:
# Draw the confusion matrix as an annotated heatmap
predicted_ticks = ['Predicted Away', 'Predicted Looking']
actual_ticks = ['Actual Away', 'Actual Looking']

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=predicted_ticks,
    yticklabels=actual_ticks,
)
plt.title('Confusion Matrix')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')
plt.show()

# Visualize the decision boundary (a 2-D plot, so exactly 2 features are needed)
if X_train.shape[1] != 2:
    print("\nSkipping decision boundary visualization (requires exactly 2 features).")
else:
    print("\nVisualizing Decision Boundary (requires 2 features)...")

    # Pull the fitted steps out of the pipeline so the SVM can be queried
    # directly in the scaled feature space
    scaler = best_model.named_steps['scaler']
    svm_model = best_model.named_steps['svm']

    # Plot limits: extent of all data in scaled space, padded by 1 per side
    X_scaled = scaler.transform(X)
    x_min, y_min = X_scaled.min(axis=0) - 1
    x_max, y_max = X_scaled.max(axis=0) + 1

    # Mesh grid covering the plot area at a 0.02 step
    grid_step = 0.02
    xx, yy = np.meshgrid(
        np.arange(x_min, x_max, grid_step),
        np.arange(y_min, y_max, grid_step),
    )

    # Classify every grid point. The grid is already in scaled space, so the
    # SVM step is used on its own (no inverse transform needed).
    grid_points = np.column_stack((xx.ravel(), yy.ravel()))
    Z = svm_model.predict(grid_points).reshape(xx.shape)

    plt.figure(figsize=(10, 8))
    plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.8)

    # Overlay the training points, scaled the same way as the grid
    X_train_scaled = scaler.transform(X_train)
    scatter = plt.scatter(
        X_train_scaled[:, 0],
        X_train_scaled[:, 1],
        c=y_train,
        cmap=plt.cm.coolwarm,
        edgecolors='k',
    )

    plt.xlabel('Feature 1 (Scaled)')
    plt.ylabel('Feature 2 (Scaled)')
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.xticks(())
    plt.yticks(())
    plt.title('SVM Decision Boundary on Scaled Training Data')

    legend_handles = scatter.legend_elements()[0]
    plt.legend(handles=legend_handles, labels=['Away', 'Looking'])
    plt.show()

## 8. Save the Trained Model

In [None]:
# Save the best model pipeline (including the scaler)
save_model(best_model, filename='gaze_svm_classifier.joblib')

# Example of loading the model back
loaded_model = load_model(filename='gaze_svm_classifier.joblib')

# load_model signals a missing file with None, so test for None explicitly
# instead of relying on the truthiness of the loaded estimator object.
if loaded_model is not None:
    print("\nModel loaded successfully. Verifying by predicting first 5 test samples:")
    print(f"Predictions: {loaded_model.predict(X_test[:5])}")
    print(f"Actual:      {y_test[:5]}")