In [10]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [15]:
features = ['Age', 'CGPA', 'Semester_Credit_Load', 'Anxiety_Score', 'Financial_Stress', 'Stress_Level']
majors = ['business','compsci','engineering','law','medical','others']
depression_threshold = 4

for major in majors:
    print("Major: ", major)
    data = pd.read_csv(f'../Data/clean_df_{major}.csv')
    data.drop(['Course'], axis=1, inplace=True)
    data['Depression_Score'] = data['Depression_Score'].apply(lambda x: 1 if x > depression_threshold else 0)

    X = data[features]
    y = data['Depression_Score']

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Define the SGDClassifier
    sgd = SGDClassifier(random_state=42)

    # Define the hyperparameter grid to search over
    param_grid = {
        'loss': ['hinge', 'log_loss', 'squared_hinge', 'perceptron'],  # Types of loss functions
        'penalty': ['l2', 'l1', 'elasticnet'],  # Regularization types
        'alpha': [0.0001, 0.001, 0.01],  # Regularization strength
        'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],  # Learning rate schedule
        'eta0': [0.01, 0.1, 1.0],
        'max_iter': [1000, 2000, 3000],  # Max iterations
    }

    # Create the GridSearchCV object
    grid_search = GridSearchCV(estimator=sgd, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

    # Fit the grid search
    grid_search.fit(X_train, y_train)

    # Print the best parameters and the corresponding score
    print("Best parameters found: ", grid_search.best_params_)
    print("Best cross-validation accuracy: ", grid_search.best_score_)

    # Evaluate the best model on the test set
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    print("Test set accuracy: ", accuracy_score(y_test, y_pred))
    print()


Major:  business
Best parameters found:  {'alpha': 0.0001, 'eta0': 0.01, 'learning_rate': 'constant', 'loss': 'hinge', 'max_iter': 1000, 'penalty': 'l2'}
Best cross-validation accuracy:  0.8933638443935926
Test set accuracy:  0.9020979020979021

Major:  compsci
Best parameters found:  {'alpha': 0.0001, 'eta0': 0.01, 'learning_rate': 'constant', 'loss': 'hinge', 'max_iter': 1000, 'penalty': 'l2'}
Best cross-validation accuracy:  0.747860242406105
Test set accuracy:  0.7170731707317073

Major:  engineering
Best parameters found:  {'alpha': 0.0001, 'eta0': 0.01, 'learning_rate': 'constant', 'loss': 'hinge', 'max_iter': 1000, 'penalty': 'l2'}
Best cross-validation accuracy:  0.9155968352253181
Test set accuracy:  0.8738317757009346

Major:  law
Best parameters found:  {'alpha': 0.0001, 'eta0': 0.01, 'learning_rate': 'constant', 'loss': 'hinge', 'max_iter': 1000, 'penalty': 'l2'}
Best cross-validation accuracy:  0.9021760592348829
Test set accuracy:  0.9097472924187726

Major:  medical
Best