# Logistic Regression

Run a grid search over logistic regression hyperparameters at several depression-score thresholds.

In [17]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd


import warnings
warnings.filterwarnings('ignore')

In [28]:
# Dataset variants to evaluate; each name is substituted into the
# `clean_df_{major}.csv` file name when the data is loaded.
majors = [
    'main',
    'business',
    'compsci',
    'engineering',
    'law',
    'medical',
    'others',
]

# Columns kept from each dataset. 'Depression_Score' is turned into the binary
# target; every other column listed here is used as a feature.
columns_to_include = [
    'Age',
    'CGPA',
    'Stress_Level',
    'Depression_Score',
    'Anxiety_Score',
    'Financial_Stress',
    'Semester_Credit_Load',
]

# Hyperparameter grid handed to GridSearchCV (full Cartesian product).
# NOTE(review): some penalty/solver pairings in this grid look unsupported by
# scikit-learn (e.g. 'l1' with 'lbfgs', or 'elasticnet' without an l1_ratio) --
# confirm how GridSearchCV scores those candidates, since warnings are silenced.
param_grid = dict(
    C=[0.01, 0.1, 1, 10, 100],
    penalty=['l1', 'l2', 'elasticnet', 'none'],
    solver=['lbfgs', 'liblinear', 'saga'],
    max_iter=[100, 200, 500],
)

In [29]:
def run_grid_search(X, y, cv_folds, param_grid=param_grid):
    """Tune a logistic regression classifier with an exhaustive grid search.

    Parameters
    ----------
    X : features passed to ``GridSearchCV.fit``.
    y : target labels passed to ``GridSearchCV.fit``.
    cv_folds : forwarded unchanged as the ``cv`` argument of ``GridSearchCV``.
    param_grid : hyperparameter grid to search; defaults to the module-level
        ``param_grid``.

    Returns
    -------
    tuple
        ``(best_estimator_, best_params_, best_score_)`` of the fitted search,
        where candidates are scored by accuracy.
    """
    search = GridSearchCV(
        estimator=LogisticRegression(random_state=42, multi_class='multinomial'),
        param_grid=param_grid,
        cv=cv_folds,
        scoring='accuracy',
        n_jobs=-1,  # use every available core
    )
    search.fit(X, y)

    winner = search.best_estimator_
    winning_params = search.best_params_
    winning_score = search.best_score_
    return winner, winning_params, winning_score

def _build_features_and_target(data, columns_to_include, depression_threshold):
    """Return the numeric feature frame and binary target for one dataset.

    Only the columns of ``columns_to_include`` that exist in ``data`` are used.
    All of them are coerced to numeric, and any row with a missing or
    non-numeric value (in a feature *or* in ``Depression_Score``) is dropped,
    so a missing score is never silently labelled as "not depressed".

    Raises
    ------
    KeyError
        If ``'Depression_Score'`` is not among the selected columns.
    """
    selected = [col for col in columns_to_include if col in data.columns]
    if 'Depression_Score' not in selected:
        raise KeyError(
            "'Depression_Score' must be listed in columns_to_include "
            "and present in the data"
        )

    # Column selection followed by apply() yields a new frame, so nothing
    # below writes into a view of the caller's DataFrame.
    numeric = data[selected].apply(pd.to_numeric, errors='coerce').dropna()

    # 1: Depressed (score strictly above the threshold), 0: Not Depressed.
    y = (numeric['Depression_Score'] > depression_threshold).astype(int)
    X = numeric.drop(columns=['Depression_Score'])

    # Give X and y the same clean 0..n-1 index so they stay label-aligned
    # even after rows were dropped.
    return X.reset_index(drop=True), y.reset_index(drop=True)


def run_logistic_regression(majors, columns_to_include, depression_threshold, k_folds=5,
                            data_dir='../Data'):
    """Grid-search and cross-validate a logistic regression per dataset.

    For every entry of ``majors`` the file ``{data_dir}/clean_df_{major}.csv``
    is loaded, ``Depression_Score`` is binarised with ``depression_threshold``,
    the remaining columns are min-max scaled, hyperparameters are tuned with
    ``run_grid_search`` and the best estimator is re-scored with a shuffled
    K-fold cross-validation.

    Parameters
    ----------
    majors : iterable of dataset names used to build the CSV file names.
    columns_to_include : column names to keep; must contain
        ``'Depression_Score'``. Names absent from a file are ignored.
    depression_threshold : scores strictly greater than this value are
        labelled 1 (depressed), all others 0.
    k_folds : number of folds for both the grid search and the final
        cross-validation.
    data_dir : directory holding the ``clean_df_*.csv`` files.

    Returns
    -------
    list of dict
        One dict per major with the keys ``'major'``, ``'best_params'``,
        ``'grid_search_score'`` and ``'mean_cv_score'`` (rounded to 5 places).

    Raises
    ------
    KeyError
        If a dataset has no usable ``'Depression_Score'`` column.
    """
    results = []
    for major in majors:
        print(f"Processing major: {major}")

        # Load data
        data = pd.read_csv(f'{data_dir}/clean_df_{major}.csv')

        # Select columns, coerce to numeric, drop incomplete rows, binarise target
        X, y = _build_features_and_target(data, columns_to_include, depression_threshold)

        # Scaling (index is preserved so X stays aligned with y).
        # NOTE: the scaler is fitted on all rows before cross-validation, so
        # each validation fold has contributed to the min/max statistics.
        scaler = MinMaxScaler()
        X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

        # Run grid search
        best_model, best_params, best_score = run_grid_search(X, y, k_folds)

        # Perform k-fold cross-validation for evaluation
        kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
        cross_val_scores = cross_val_score(best_model, X, y, cv=kf, scoring='accuracy')
        mean_cv_score = round(cross_val_scores.mean(), 5)

        # Append results
        results.append({
            'major': major,
            'best_params': best_params,
            'grid_search_score': best_score,
            'mean_cv_score': mean_cv_score
        })

    return results



## Threshold of 4

5 = Depressed, 1,2,3,4 = Not Depressed

In [None]:
# Scores strictly above 4 are labelled depressed for this run.
results = run_logistic_regression(
    majors,
    columns_to_include,
    depression_threshold=4,
    k_folds=5,
)

In [31]:
# Print one summary block per major, followed by a 50-dash separator line.
for result in results:
    print(
        f"Major: {result['major']}",
        f"Best Parameters: {result['best_params']}",
        f"Grid Search Score: {result['grid_search_score']}",
        f"Mean Cross-Validation Score: {result['mean_cv_score']}",
        "-" * 50,
        sep="\n",
    )

Major: main
Best Parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l1', 'solver': 'saga'}
Grid Search Score: 0.8797712651894211
Mean Cross-Validation Score: 0.87977
--------------------------------------------------
Major: business
Best Parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l1', 'solver': 'saga'}
Grid Search Score: 0.8951048951048952
Mean Cross-Validation Score: 0.8951
--------------------------------------------------
Major: compsci
Best Parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l1', 'solver': 'saga'}
Grid Search Score: 0.7416834050693448
Mean Cross-Validation Score: 0.74173
--------------------------------------------------
Major: engineering
Best Parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l1', 'solver': 'saga'}
Grid Search Score: 0.9072177613970427
Mean Cross-Validation Score: 0.90728
--------------------------------------------------
Major: law
Best Parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l1', 'solver': 'saga'}
Grid Search Score: 

## Threshold of 3

4,5 = Depressed, 1,2,3 = Not Depressed

In [None]:
# Scores strictly above 3 are labelled depressed for this run.
results = run_logistic_regression(
    majors,
    columns_to_include,
    depression_threshold=3,
    k_folds=5,
)

In [33]:
# Print one summary block per major, followed by a 50-dash separator line.
for result in results:
    print(
        f"Major: {result['major']}",
        f"Best Parameters: {result['best_params']}",
        f"Grid Search Score: {result['grid_search_score']}",
        f"Mean Cross-Validation Score: {result['mean_cv_score']}",
        "-" * 50,
        sep="\n",
    )

Major: main
Best Parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l1', 'solver': 'saga'}
Grid Search Score: 0.7542530378842031
Mean Cross-Validation Score: 0.75425
--------------------------------------------------
Major: business
Best Parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l1', 'solver': 'saga'}
Grid Search Score: 0.8027972027972028
Mean Cross-Validation Score: 0.8028
--------------------------------------------------
Major: compsci
Best Parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
Grid Search Score: 0.527403156384505
Mean Cross-Validation Score: 0.52152
--------------------------------------------------
Major: engineering
Best Parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l1', 'solver': 'saga'}
Grid Search Score: 0.8041244350840244
Mean Cross-Validation Score: 0.80418
--------------------------------------------------
Major: law
Best Parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l1', 'solver': 'saga'}
Grid Search Score: 

## Threshold of 2

3,4,5 = Depressed, 1,2 = Not Depressed

In [None]:
# Scores strictly above 2 are labelled depressed for this run.
results = run_logistic_regression(
    majors,
    columns_to_include,
    depression_threshold=2,
    k_folds=5,
)

In [35]:
# Print one summary block per major, followed by a 50-dash separator line.
for result in results:
    print(
        f"Major: {result['major']}",
        f"Best Parameters: {result['best_params']}",
        f"Grid Search Score: {result['grid_search_score']}",
        f"Mean Cross-Validation Score: {result['mean_cv_score']}",
        "-" * 50,
        sep="\n",
    )

Major: main
Best Parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l1', 'solver': 'saga'}
Grid Search Score: 0.5562544674767691
Mean Cross-Validation Score: 0.55625
--------------------------------------------------
Major: business
Best Parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l1', 'solver': 'saga'}
Grid Search Score: 0.5958041958041959
Mean Cross-Validation Score: 0.5958
--------------------------------------------------
Major: compsci
Best Parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l1', 'solver': 'saga'}
Grid Search Score: 0.7162410329985653
Mean Cross-Validation Score: 0.71621
--------------------------------------------------
Major: engineering
Best Parameters: {'C': 10, 'max_iter': 100, 'penalty': 'l1', 'solver': 'saga'}
Grid Search Score: 0.6054407441533939
Mean Cross-Validation Score: 0.60645
--------------------------------------------------
Major: law
Best Parameters: {'C': 1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
Grid Search Score: 0.59