In [2]:
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, train_test_split
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [3]:
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto']
}

In [4]:
def run_grid_search_svc(X_train, y_train, cv_folds, param_grid):
    svc = SVC(random_state=42)

    grid_search = GridSearchCV(estimator=svc, param_grid=param_grid, cv=cv_folds, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    return grid_search.best_estimator_, grid_search.best_params_, grid_search.best_score_

def run_svc(majors, columns_to_include, depression_threshold, k_folds=5, param_grid=param_grid):
    results = []
    print(f"For depression threshold of {depression_threshold}, with {k_folds}-cross validation.")
    for major in majors:
        print(f"Processing major: {major}")

        # Load data
        data = pd.read_csv(f'../Data/clean_df_{major}.csv')

        # Select only the columns to include
        data = data[[col for col in columns_to_include if col in data.columns]]

        # Convert Depression_Score to binary (1: Depressed, 0: Not Depressed)
        data['Depression_Score'] = data['Depression_Score'].apply(lambda x: 1 if x > depression_threshold else 0)

        # Split into features (X) and target (y)
        y = data['Depression_Score']
        X = data.drop(columns=['Depression_Score'])
        X = X.apply(pd.to_numeric, errors='coerce').dropna()
        y = y[X.index]

        # Split into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

        # Scaling
        scaler = MinMaxScaler()
        X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns)
        X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

        # Run grid search
        best_model, best_params, best_score = run_grid_search_svc(X_train, y_train, k_folds, param_grid)

        # Perform k-fold cross-validation for evaluation
        kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
        cross_val_scores = cross_val_score(best_model, X_train, y_train, cv=kf, scoring='accuracy')
        mean_cv_score = round(cross_val_scores.mean(), 5)

        # Append results
        results.append({
            'major': major,
            'best_params': best_params,
            'grid_search_score': best_score,
            'mean_cv_score': mean_cv_score
        })

    return results

def print_results(results):
    print("\nModel Performance Summary:")
    for result in results:
        print(f"Major: {result['major']}")
        print(f"Best Parameters: {result['best_params']}")
        print(f"Grid Search Score: {result['grid_search_score']:.5f}")
        print(f"Mean Cross-Validation Score: {result['mean_cv_score']:.5f}")
        print("-" * 50)

In [20]:
majors = ['main', 'business', 'compsci', 'engineering', 'law', 'medical', 'others']

columns_to_include = [
    'Age', 'CGPA', 'Stress_Level', 'Depression_Score', 
    'Anxiety_Score', 'Financial_Stress', 'Semester_Credit_Load'
]

depression_threshold = 4
k_folds = 5
results = run_svc(majors, columns_to_include, depression_threshold, k_folds)
print_results(results)


For depression threshold of 4, with 5-cross validation.
Processing major: main
Processing major: business
Processing major: compsci
Processing major: engineering
Processing major: law
Processing major: medical
Processing major: others

Model Performance Summary:
Major: main
Best Parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Grid Search Score: 0.87974
Mean Cross-Validation Score: 0.87974
--------------------------------------------------
Major: business
Best Parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Grid Search Score: 0.89510
Mean Cross-Validation Score: 0.89504
--------------------------------------------------
Major: compsci
Best Parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Grid Search Score: 0.74174
Mean Cross-Validation Score: 0.74176
--------------------------------------------------
Major: engineering
Best Parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Grid Search Score: 0.90739
Mean Cross-Validation Score: 0.90740
--

In [21]:
majors = ['main', 'business', 'compsci', 'engineering', 'law', 'medical', 'others']

columns_to_include = [
    'Age', 'CGPA', 'Stress_Level', 'Depression_Score', 
    'Anxiety_Score', 'Financial_Stress', 'Semester_Credit_Load'
]

depression_threshold = 3
k_folds = 5
results = run_svc(majors, columns_to_include, depression_threshold, k_folds)
print_results(results)


For depression threshold of 3, with 5-cross validation.
Processing major: main
Processing major: business
Processing major: compsci
Processing major: engineering
Processing major: law
Processing major: medical
Processing major: others

Model Performance Summary:
Major: main
Best Parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Grid Search Score: 0.75429
Mean Cross-Validation Score: 0.75429
--------------------------------------------------
Major: business
Best Parameters: {'C': 10, 'gamma': 'auto', 'kernel': 'sigmoid'}
Grid Search Score: 0.80420
Mean Cross-Validation Score: 0.80244
--------------------------------------------------
Major: compsci
Best Parameters: {'C': 10, 'gamma': 'auto', 'kernel': 'poly'}
Grid Search Score: 0.55452
Mean Cross-Validation Score: 0.52625
--------------------------------------------------
Major: engineering
Best Parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Grid Search Score: 0.80422
Mean Cross-Validation Score: 0.80419
-------

# Alternate with only 4 features instead of 6

columns_to_include = ['Age', 'CGPA', 'Semester_Credit_Load', 'Anxiety_Score']

In [22]:
majors = ['main', 'business', 'compsci', 'engineering', 'law', 'medical', 'others']

columns_to_include = ['Age', 'CGPA', 'Semester_Credit_Load', 'Anxiety_Score','Depression_Score']

depression_threshold = 4
k_folds = 5
results = run_svc(majors, columns_to_include, depression_threshold, k_folds)
print_results(results)


For depression threshold of 4, with 5-cross validation.
Processing major: main
Processing major: business
Processing major: compsci
Processing major: engineering
Processing major: law
Processing major: medical
Processing major: others

Model Performance Summary:
Major: main
Best Parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Grid Search Score: 0.87974
Mean Cross-Validation Score: 0.87974
--------------------------------------------------
Major: business
Best Parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Grid Search Score: 0.89510
Mean Cross-Validation Score: 0.89504
--------------------------------------------------
Major: compsci
Best Parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Grid Search Score: 0.74174
Mean Cross-Validation Score: 0.74176
--------------------------------------------------
Major: engineering
Best Parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Grid Search Score: 0.90739
Mean Cross-Validation Score: 0.90740
--

In [5]:
majors = ['business', 'engineering', 'law', 'medical', 'others']

columns_to_include = ['Age', 'CGPA', 'Semester_Credit_Load', 'Depression_Score']

depression_threshold = 3
k_folds = 5
results = run_svc(majors, columns_to_include, depression_threshold, k_folds)
print_results(results)


For depression threshold of 3, with 5-cross validation.
Processing major: business
Processing major: engineering
Processing major: law
Processing major: medical
Processing major: others

Model Performance Summary:
Major: business
Best Parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Grid Search Score: 0.80246
Mean Cross-Validation Score: 0.80244
--------------------------------------------------
Major: engineering
Best Parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Grid Search Score: 0.80422
Mean Cross-Validation Score: 0.80419
--------------------------------------------------
Major: law
Best Parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Grid Search Score: 0.79982
Mean Cross-Validation Score: 0.79978
--------------------------------------------------
Major: medical
Best Parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Grid Search Score: 0.80549
Mean Cross-Validation Score: 0.80549
--------------------------------------------------
M

In [19]:
majors = ['business', 'compsci', 'engineering']

columns_to_include = [
    'Age', 'CGPA', 'Stress_Level', 'Depression_Score', 
    'Anxiety_Score', 'Financial_Stress', 'Semester_Credit_Load'
]

depression_threshold = 3
k_folds = 5
results = run_svc(majors, columns_to_include, depression_threshold, k_folds)
print_results(results)


For depression threshold of 3, with 5-cross validation.
Processing major: business
Processing major: compsci
Processing major: engineering

Model Performance Summary:
Major: business
Best Parameters: {'C': 10, 'gamma': 'auto', 'kernel': 'sigmoid'}
Grid Search Score: 0.80420
Mean Cross-Validation Score: 0.80244
--------------------------------------------------
Major: compsci
Best Parameters: {'C': 10, 'gamma': 'auto', 'kernel': 'poly'}
Grid Search Score: 0.55452
Mean Cross-Validation Score: 0.52625
--------------------------------------------------
Major: engineering
Best Parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Grid Search Score: 0.80422
Mean Cross-Validation Score: 0.80419
--------------------------------------------------
