# Logistic Regression

Run a grid search over logistic regression hyperparameters for each major, repeating the analysis with different depression-score thresholds.

In [39]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd


# NOTE(review): this installs a blanket "ignore" filter for every warning
# category. It presumably also hides scikit-learn's convergence and
# failed-fit warnings raised during the grid search below — consider
# narrowing the filter to specific categories once those are confirmed benign.
import warnings
warnings.filterwarnings('ignore')

In [40]:
# Dataset keys to iterate over; each one is substituted into the
# '../Data/clean_df_{major}.csv' path when the data is loaded.
majors = [
    'main',
    'business',
    'compsci',
    'engineering',
    'law',
    'medical',
    'others',
]

# Columns kept for modelling. 'Depression_Score' is the target (binarised
# later against a threshold); the remaining columns are the features.
columns_to_include = [
    'Age',
    'CGPA',
    'Stress_Level',
    'Depression_Score',
    'Anxiety_Score',
    'Financial_Stress',
    'Semester_Credit_Load',
]

# Hyperparameter search space handed to GridSearchCV (full Cartesian product).
# NOTE(review): LogisticRegression does not accept every penalty/solver pairing
# listed here (and 'elasticnet' normally also needs an l1_ratio) — confirm that
# the rejected combinations are meant to be tried and simply score as failures.
param_grid = dict(
    C=[0.01, 0.1, 1, 10, 100],  # inverse of regularization strength
    penalty=['l1', 'l2', 'elasticnet', 'none'],
    solver=['lbfgs', 'liblinear', 'saga'],
    max_iter=[100, 200, 500],
)

In [60]:
def run_grid_search(X_train, y_train, cv_folds, param_grid):
    """Tune a logistic regression on the training data with an exhaustive grid search.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed unchanged to ``GridSearchCV.fit``.
    cv_folds
        Forwarded as the ``cv`` argument of ``GridSearchCV``.
    param_grid
        Forwarded as the ``param_grid`` argument of ``GridSearchCV``.

    Returns
    -------
    tuple
        ``(best_estimator_, best_params_, best_score_)`` read from the fitted search.
    """
    # Accuracy-scored search over a seeded LogisticRegression, using all cores.
    search_options = {
        'estimator': LogisticRegression(random_state=42),
        'param_grid': param_grid,
        'cv': cv_folds,
        'scoring': 'accuracy',
        'n_jobs': -1,
    }
    searcher = GridSearchCV(**search_options)
    searcher.fit(X_train, y_train)

    winning_attrs = ('best_estimator_', 'best_params_', 'best_score_')
    return tuple(getattr(searcher, attr) for attr in winning_attrs)

def run_logistic_regression(majors, columns_to_include, depression_threshold, k_folds=5, param_grid=param_grid):
    """Tune and evaluate a logistic-regression depression classifier for each major.

    For every major the CSV at ``../Data/clean_df_{major}.csv`` is loaded and
    restricted to ``columns_to_include``. ``Depression_Score`` is binarised
    (1 when strictly greater than ``depression_threshold``, else 0), rows with
    non-numeric or missing features are dropped, and the data is split 80/20
    with stratification. Features are min-max scaled using statistics from the
    training split only. The model is tuned with ``run_grid_search``, re-scored
    with shuffled K-fold cross-validation on the training split, and finally
    scored once on the held-out test split.

    Parameters
    ----------
    majors : iterable of str
        Dataset keys substituted into the CSV path.
    columns_to_include : list of str
        Columns to keep; names missing from a file are skipped. Must contain
        ``'Depression_Score'``, which is used as the target.
    depression_threshold : number
        Scores strictly above this value are labelled 1 (depressed).
    k_folds : int, default 5
        Number of folds for both the grid search and the follow-up K-fold CV.
    param_grid : dict or list of dict
        Search space for ``GridSearchCV``. The default is the module-level
        ``param_grid`` as it was when this function was defined; re-run this
        cell after editing the grid, or pass the grid explicitly.

    Returns
    -------
    list of dict
        One entry per major with keys ``'major'``, ``'best_params'``,
        ``'grid_search_score'``, ``'mean_cv_score'`` and ``'test_accuracy'``.

    Raises
    ------
    KeyError
        If a file does not provide a ``'Depression_Score'`` column.
    """
    results = []
    print(f"For depression threshold of {depression_threshold}, with {k_folds}-cross validation.")
    for major in majors:
        print(f"Processing major: {major}")

        # Load data
        data = pd.read_csv(f'../Data/clean_df_{major}.csv')

        # Keep only the requested columns that exist. Copy so the assignment
        # below writes to an independent frame, not a slice of the loaded one.
        data = data[[col for col in columns_to_include if col in data.columns]].copy()

        if 'Depression_Score' not in data.columns:
            raise KeyError(
                f"'Depression_Score' column not available for major '{major}'; "
                "it must be in columns_to_include and present in the CSV."
            )

        # Convert Depression_Score to binary (1: Depressed, 0: Not Depressed)
        data['Depression_Score'] = (data['Depression_Score'] > depression_threshold).astype(int)

        # Split into features (X) and target (y); drop rows whose features
        # cannot be parsed as numbers and keep the matching labels.
        y = data['Depression_Score']
        X = data.drop(columns=['Depression_Score'])
        X = X.apply(pd.to_numeric, errors='coerce').dropna()
        y = y.loc[X.index]

        # Split into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

        # Scaling: fit on the training split only. Preserve the original row
        # index so the scaled frames stay label-aligned with y_train / y_test.
        scaler = MinMaxScaler()
        X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns, index=X_train.index)
        X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns, index=X_test.index)

        # Run grid search
        best_model, best_params, best_score = run_grid_search(X_train, y_train, k_folds, param_grid)

        # Perform k-fold cross-validation for evaluation (still training data only)
        kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
        cross_val_scores = cross_val_score(best_model, X_train, y_train, cv=kf, scoring='accuracy')
        mean_cv_score = round(cross_val_scores.mean(), 5)

        # Score the tuned model once on the held-out split. This is the only
        # figure not influenced by hyperparameter selection. best_model is the
        # grid search's best estimator, which GridSearchCV refits on the whole
        # training split by default.
        test_accuracy = round(accuracy_score(y_test, best_model.predict(X_test)), 5)

        # Append results
        results.append({
            'major': major,
            'best_params': best_params,
            'grid_search_score': best_score,
            'mean_cv_score': mean_cv_score,
            'test_accuracy': test_accuracy
        })

    return results


## Threshold of 4

5 = Depressed, 1,2,3,4 = Not Depressed

In [61]:
# Scores strictly above 4 are labelled as depressed for this run.
results = run_logistic_regression(
    majors=majors,
    columns_to_include=columns_to_include,
    depression_threshold=4,
    k_folds=5,
)

For depression threshold of 4, with 5-cross validation.
Processing major: main


Processing major: business
Processing major: compsci
Processing major: engineering
Processing major: law
Processing major: medical
Processing major: others


In [62]:
# Print one summary block per major, followed by a 50-dash separator.
for result in results:
    report_lines = (
        f"Major: {result['major']}",
        f"Best Parameters: {result['best_params']}",
        f"Grid Search Score: {result['grid_search_score']}",
        f"Mean Cross-Validation Score: {result['mean_cv_score']}",
        "-" * 50,
    )
    print("\n".join(report_lines))

Major: main
Best Parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Grid Search Score: 0.8797355738542066
Mean Cross-Validation Score: 0.87974
--------------------------------------------------
Major: business
Best Parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Grid Search Score: 0.8951029748283753
Mean Cross-Validation Score: 0.89504
--------------------------------------------------
Major: compsci
Best Parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Grid Search Score: 0.7417402364207691
Mean Cross-Validation Score: 0.74176
--------------------------------------------------
Major: engineering
Best Parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Grid Search Score: 0.9073890608875128
Mean Cross-Validation Score: 0.9074
--------------------------------------------------
Major: law
Best Parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblin

## Threshold of 3

4,5 = Depressed, 1,2,3 = Not Depressed

In [63]:
# Scores strictly above 3 are labelled as depressed for this run.
results = run_logistic_regression(
    majors=majors,
    columns_to_include=columns_to_include,
    depression_threshold=3,
    k_folds=5,
)

For depression threshold of 3, with 5-cross validation.
Processing major: main
Processing major: business
Processing major: compsci
Processing major: engineering
Processing major: law
Processing major: medical
Processing major: others


In [64]:
# Print one summary block per major, followed by a 50-dash separator.
for result in results:
    report_lines = (
        f"Major: {result['major']}",
        f"Best Parameters: {result['best_params']}",
        f"Grid Search Score: {result['grid_search_score']}",
        f"Mean Cross-Validation Score: {result['mean_cv_score']}",
        "-" * 50,
    )
    print("\n".join(report_lines))

Major: main
Best Parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Grid Search Score: 0.7542887463296312
Mean Cross-Validation Score: 0.75429
--------------------------------------------------
Major: business
Best Parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Grid Search Score: 0.8024561403508772
Mean Cross-Validation Score: 0.80244
--------------------------------------------------
Major: compsci
Best Parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
Grid Search Score: 0.5238665270088284
Mean Cross-Validation Score: 0.52265
--------------------------------------------------
Major: engineering
Best Parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Grid Search Score: 0.8042242862057103
Mean Cross-Validation Score: 0.80419
--------------------------------------------------
Major: law
Best Parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear

## Threshold of 2

3,4,5 = Depressed, 1,2 = Not Depressed

In [65]:
# Scores strictly above 2 are labelled as depressed for this run.
results = run_logistic_regression(
    majors=majors,
    columns_to_include=columns_to_include,
    depression_threshold=2,
    k_folds=5,
)

For depression threshold of 2, with 5-cross validation.
Processing major: main
Processing major: business
Processing major: compsci
Processing major: engineering
Processing major: law
Processing major: medical
Processing major: others


In [66]:
# Print one summary block per major, followed by a 50-dash separator.
for result in results:
    report_lines = (
        f"Major: {result['major']}",
        f"Best Parameters: {result['best_params']}",
        f"Grid Search Score: {result['grid_search_score']}",
        f"Mean Cross-Validation Score: {result['mean_cv_score']}",
        "-" * 50,
    )
    print("\n".join(report_lines))

Major: main
Best Parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Grid Search Score: 0.5562902144772118
Mean Cross-Validation Score: 0.55629
--------------------------------------------------
Major: business
Best Parameters: {'C': 1, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Grid Search Score: 0.5979099923722349
Mean Cross-Validation Score: 0.59092
--------------------------------------------------
Major: compsci
Best Parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Grid Search Score: 0.7160407002843033
Mean Cross-Validation Score: 0.71604
--------------------------------------------------
Major: engineering
Best Parameters: {'C': 0.01, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Grid Search Score: 0.6049260405916753
Mean Cross-Validation Score: 0.60497
--------------------------------------------------
Major: law
Best Parameters: {'C': 1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
Gri