In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd


import warnings
warnings.filterwarnings('ignore')
from joblib import dump

# Majors to model; each name is substituted into the per-major CSV path
# ('clean_df_<major>.csv') and into the saved model/scaler file names.
majors = ['business', 'engineering', 'law', 'medical', 'others']

# List of columns to include in the analysis
columns_to_include = [
    'Age', 'CGPA', 'Depression_Score', 'Anxiety_Score', 'Semester_Credit_Load'
]

_C_VALUES = [0.01, 0.1, 1, 10, 100]  # Inverse of regularization strength
_MAX_ITER_VALUES = [100, 200, 500]

# Hyper-parameter search space for LogisticRegression.
#
# Not every penalty is supported by every solver, so the space is written as
# a list of sub-grids that only contain valid combinations.  A single
# cross-product dict would also generate e.g. lbfgs+l1 or liblinear+elasticnet,
# whose fits can only fail (silently here, because warnings are suppressed)
# and waste a large share of the search budget.
param_grid = [
    # liblinear: l1 or l2 only.
    {
        'solver': ['liblinear'],
        'penalty': ['l1', 'l2'],
        'C': _C_VALUES,
        'max_iter': _MAX_ITER_VALUES,
    },
    # lbfgs: l2 only (the unpenalized case is handled below).
    {
        'solver': ['lbfgs'],
        'penalty': ['l2'],
        'C': _C_VALUES,
        'max_iter': _MAX_ITER_VALUES,
    },
    # saga: l1 or l2.
    {
        'solver': ['saga'],
        'penalty': ['l1', 'l2'],
        'C': _C_VALUES,
        'max_iter': _MAX_ITER_VALUES,
    },
    # Elastic net is only available with saga and needs an explicit l1_ratio.
    {
        'solver': ['saga'],
        'penalty': ['elasticnet'],
        'l1_ratio': [0.25, 0.5, 0.75],
        'C': _C_VALUES,
        'max_iter': _MAX_ITER_VALUES,
    },
    # Unpenalized fit.  Current scikit-learn spells this as None (the string
    # 'none' was deprecated and later removed).  C has no effect without a
    # penalty, so it is not varied here.
    {
        'solver': ['lbfgs', 'saga'],
        'penalty': [None],
        'max_iter': _MAX_ITER_VALUES,
    },
]

def run_grid_search(X_train, y_train, cv_folds, param_grid):
    """Tune a logistic-regression classifier with an exhaustive grid search.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        cv_folds: Cross-validation setting, forwarded unchanged as
            GridSearchCV's ``cv`` argument.
        param_grid: Hyper-parameter grid, forwarded unchanged as
            GridSearchCV's ``param_grid`` argument.

    Returns:
        A ``(best_estimator, best_params, best_score)`` tuple taken from the
        fitted search; candidates are ranked by accuracy.
    """
    search = GridSearchCV(
        estimator=LogisticRegression(random_state=42),  # fixed seed for repeatability
        param_grid=param_grid,
        cv=cv_folds,
        scoring='accuracy',
        n_jobs=-1,  # use every available core
    )
    search.fit(X_train, y_train)

    winner = search.best_estimator_
    winning_params = search.best_params_
    winning_score = search.best_score_
    return winner, winning_params, winning_score

def _prepare_features_and_target(data, columns_to_include, depression_threshold):
    """Select the analysis columns and split them into features and a binary target."""
    selected = data[[col for col in columns_to_include if col in data.columns]]

    # Coerce every selected column -- the target included -- to numeric and
    # drop incomplete rows BEFORE binarizing.  Binarizing first would turn a
    # missing Depression_Score into class 0 (NaN > threshold is False) and
    # would raise on non-numeric values.
    numeric = selected.apply(pd.to_numeric, errors='coerce').dropna()

    # Convert Depression_Score to binary (1: Depressed, 0: Not Depressed)
    y = (numeric['Depression_Score'] > depression_threshold).astype(int)
    X = numeric.drop(columns=['Depression_Score'])
    return X, y


def run_logistic_regression_save(majors, columns_to_include, depression_threshold, k_folds=5, param_grid=None):
    """Train, tune, persist and evaluate one logistic-regression model per major.

    For each major this reads ``../Data/clean_df_<major>.csv``, builds a binary
    depression target, makes a stratified 80/20 train/test split, min-max
    scales the features, tunes the model with ``run_grid_search`` and writes
    the best model and the fitted scaler to ``../Frontend/`` with joblib.

    Args:
        majors: Iterable of major names used to build the input/output paths.
        columns_to_include: Column names to keep; names absent from the CSV
            are skipped.  ``'Depression_Score'`` must be present.
        depression_threshold: Scores strictly greater than this value are
            labelled 1 (depressed); all others 0.
        k_folds: Number of folds for the grid search and for the follow-up
            cross-validation.
        param_grid: Grid passed to the search.  ``None`` means "no tuning":
            a single candidate with the estimator's default parameters.

    Returns:
        A list with one dict per major containing ``major``, ``best_params``,
        ``grid_search_score``, ``mean_cv_score`` and ``test_accuracy``
        (accuracy on the held-out 20% split).

    Raises:
        KeyError: If ``'Depression_Score'`` is not among the selected columns.
    """
    if param_grid is None:
        # GridSearchCV rejects None; an empty grid yields exactly one
        # candidate that uses the estimator's defaults.
        param_grid = {}

    results = []
    print(f"For depression threshold of {depression_threshold}, with {k_folds}-cross validation.")
    for major in majors:
        print(f"Processing major: {major}")

        data = pd.read_csv(f'../Data/clean_df_{major}.csv')
        X, y = _prepare_features_and_target(data, columns_to_include, depression_threshold)

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

        # Scaling: fit on the training split only, then reuse for the test split.
        # Keep the original row index so the frames stay aligned with y_train/y_test.
        # NOTE: the scaler is fitted on the whole training split before
        # cross-validation, so every CV fold has already seen the min/max of
        # its own validation rows.
        scaler = MinMaxScaler()
        X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns, index=X_train.index)
        X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns, index=X_test.index)

        best_model, best_params, best_score = run_grid_search(X_train, y_train, k_folds, param_grid)

        # Save the model using joblib
        model_path = f"../Frontend/logistic_regression_{major}_model.pkl"
        dump(best_model, model_path)
        scaler_path = f"../Frontend/logistic_regression_{major}_scaler.pkl"
        dump(scaler, scaler_path)

        # Perform k-fold cross-validation for evaluation
        kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
        cross_val_scores = cross_val_score(best_model, X_train, y_train, cv=kf, scoring='accuracy')
        mean_cv_score = round(cross_val_scores.mean(), 5)

        # Score the tuned model on the held-out split, which was never used
        # for fitting or model selection.
        test_accuracy = round(accuracy_score(y_test, best_model.predict(X_test)), 5)

        results.append({
            'major': major,
            'best_params': best_params,
            'grid_search_score': best_score,
            'mean_cv_score': mean_cv_score,
            'test_accuracy': test_accuracy,
        })

    return results


def print_results(results):
    """Print a short report for every per-major result.

    Args:
        results: Iterable of dicts, each holding the keys ``'major'``,
            ``'best_params'``, ``'grid_search_score'`` and ``'mean_cv_score'``.

    Each entry is written as four labelled lines followed by a rule of
    50 dashes.
    """
    report_fields = (
        ("Major", 'major'),
        ("Best Parameters", 'best_params'),
        ("Grid Search Score", 'grid_search_score'),
        ("Mean Cross-Validation Score", 'mean_cv_score'),
    )
    rule = "-" * 50

    for entry in results:
        for label, key in report_fields:
            print(f"{label}: {entry[key]}")
        print(rule)

# Scores strictly above this cut-off are labelled as depressed (class 1).
depression_threshold = 3

# Train, tune and save one model per major, then print the summary.
results = run_logistic_regression_save(
    majors,
    columns_to_include,
    depression_threshold,
    k_folds=5,
    param_grid=param_grid,
)
print_results(results)

For depression threshold of 3, with 5-cross validation.
Processing major: business
Processing major: engineering
Processing major: law
Processing major: medical
Processing major: others
Major: business
Best Parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Grid Search Score: 0.8024561403508772
Mean Cross-Validation Score: 0.80244
--------------------------------------------------
Major: engineering
Best Parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Grid Search Score: 0.8042242862057103
Mean Cross-Validation Score: 0.80419
--------------------------------------------------
Major: law
Best Parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Grid Search Score: 0.7998190045248869
Mean Cross-Validation Score: 0.79978
--------------------------------------------------
Major: medical
Best Parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Grid Search Score: 0.8054904051