In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# Training pipeline: load the CSV, label-encode categoricals and target, split,
# scale numeric columns, fit a random forest, print hold-out metrics, and
# persist every fitted artifact with joblib.

# 1. Load dataset
df = pd.read_csv('mental_health_dataset.csv')

# 2. Handle categorical variables
categorical_cols = ['gender', 'employment_status', 'work_environment',
                    'mental_health_history', 'seeks_treatment']

# Encode categorical variables using LabelEncoder (one encoder per column).
# NOTE: this overwrites the raw string columns of `df` in place.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Save encoders for future use

# Encode the target column; `target_encoder` maps the integer classes back to
# the original risk labels via inverse_transform.
target_encoder = LabelEncoder()
df['mental_health_risk'] = target_encoder.fit_transform(df['mental_health_risk'])

# 3. Split features and target
X = df.drop('mental_health_risk', axis=1)
y = df['mental_health_risk']

numerical_cols = ['age', 'stress_level', 'sleep_hours',
                  'physical_activity_days', 'depression_score',
                  'anxiety_score', 'social_support_score', 'productivity_score']

# 4. Train-test split.
# Done BEFORE scaling so the scaler never sees the test rows. The rows chosen
# depend only on the number of samples and random_state, so the partition is
# the same one a split on the scaled frame would produce.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Explicit copies so the column assignments below do not write into views.
X_train = X_train.copy()
X_test = X_test.copy()

# 5. Scale numerical features.
# Fit on the training partition only; the test partition is transformed with
# the training statistics, which avoids leaking test data into preprocessing.
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])
# Keep the full feature frame scaled as well, using the same training-fitted
# scaler, so that `X` remains a scaled frame for any later cell that reads it.
X[numerical_cols] = scaler.transform(X[numerical_cols])

# 6. Model training
model = RandomForestClassifier(n_estimators=150, random_state=42)
model.fit(X_train, y_train)

# 7. Model evaluation
y_pred = model.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# 8. Save model and encoders
joblib.dump(model, 'mental_health_risk_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(label_encoders, 'encoders.pkl')
joblib.dump(target_encoder, 'target_encoder.pkl')
# `X_encoded` is kept for any later cell that reads it. X is already fully
# numeric at this point (the forest was fitted on it), so get_dummies is
# expected to leave the columns unchanged.
X_encoded = pd.get_dummies(X)
# Persist the exact column names/order the model was fitted on.
joblib.dump(X_train.columns.tolist(), "model_features.pkl")

# 9. Predict for a new input
def predict_mental_health(input_data: dict):
    """Predict the mental-health risk label for one respondent.

    Uses the module-level objects built by the training code:
    ``categorical_cols``, ``numerical_cols``, ``label_encoders``, ``scaler``,
    ``model`` and ``target_encoder`` (plus ``X`` as a fallback source of the
    feature order when the model does not expose ``feature_names_in_``).

    Args:
        input_data: Mapping of feature name to raw (un-encoded, un-scaled)
            value. Key order does not matter; keys that are not model
            features are ignored.

    Returns:
        A ``(label, confidence)`` tuple: the decoded risk label and the
        probability of that class as a percentage rounded to 2 decimals.

    Raises:
        KeyError: If a feature the model was fitted on is absent.
        ValueError: If a categorical value was not seen when its encoder
            was fitted.
    """
    input_df = pd.DataFrame([input_data])

    # Align columns to the order used at fit time. A dict supplied in a
    # different key order would otherwise feed values into the wrong features
    # (or be rejected by scikit-learn's feature-name check).
    feature_order = getattr(model, "feature_names_in_", None)
    if feature_order is None:
        feature_order = X.columns
    feature_order = list(feature_order)

    missing = [name for name in feature_order if name not in input_df.columns]
    if missing:
        raise KeyError(f"input_data is missing required feature(s): {missing}")
    input_df = input_df[feature_order].copy()

    # Encode categorical features with the encoders fitted during training.
    for col in categorical_cols:
        try:
            input_df[col] = label_encoders[col].transform(input_df[col])
        except ValueError as exc:
            raise ValueError(
                f"Unknown value {input_data[col]!r} for feature {col!r}"
            ) from exc

    # Scale numeric features with the fitted scaler.
    input_df[numerical_cols] = scaler.transform(input_df[numerical_cols])

    # Evaluate the model once: the predicted class is the one with the highest
    # probability, so both outputs come from a single predict_proba call.
    class_probabilities = model.predict_proba(input_df)[0]
    best_index = int(class_probabilities.argmax())
    prediction = model.classes_[best_index]
    prediction_label = target_encoder.inverse_transform([prediction])[0]
    prediction_proba = float(class_probabilities[best_index]) * 100  # Confidence %

    return prediction_label, round(prediction_proba, 2)

# Example usage
# One hand-written respondent, given as raw (un-encoded, un-scaled) values.
sample_input = dict(
    age=19,
    gender='Female',
    employment_status='Unemployed',
    work_environment='On-site',
    mental_health_history='No',
    seeks_treatment='No',
    stress_level=1,
    sleep_hours=8.0,
    physical_activity_days=5,
    depression_score=10,
    anxiety_score=1,
    social_support_score=8,
    productivity_score=9.0,
)

# Run the prediction helper and report the decoded label with its confidence.
label, confidence = predict_mental_health(sample_input)
print(f"Predicted Risk: {label} ({confidence}%)")


Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.97      0.98       475
           1       0.97      0.96      0.97       359
           2       0.97      0.99      0.98      1166

    accuracy                           0.98      2000
   macro avg       0.98      0.97      0.98      2000
weighted avg       0.98      0.98      0.98      2000

Confusion Matrix:
 [[ 460    0   15]
 [   0  344   15]
 [   0    9 1157]]
Predicted Risk: Medium (58.0%)


In [2]:
from sklearn.preprocessing import LabelEncoder
import joblib

# Encode labels.
# `y` was already integer-encoded by `target_encoder` in the previous cell, so
# fitting a fresh LabelEncoder directly on `y` would learn the classes
# [0, 1, 2] and could never map predictions back to the risk names. Fit on the
# original class names instead so inverse_transform returns real labels.
label_encoder = LabelEncoder()
label_encoder.fit(target_encoder.classes_)

# Same integer codes as `y`: both encoders sort the same set of class names.
y_encoded = label_encoder.transform(target_encoder.inverse_transform(y))

# Save label_encoder object (not its classes)
joblib.dump(label_encoder, 'label_encoder.pkl')


['label_encoder.pkl']