In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.utils import resample
import joblib

# Step 1: Load Dataset
df = pd.read_csv("dataset-pp.csv")

# Step 2: Balance the Dataset
high = df[df['MentalHealthRisk'] == 'High']
moderate = df[df['MentalHealthRisk'] == 'Moderate']
low = df[df['MentalHealthRisk'] == 'Low']

low_upsampled = resample(low, replace=True, n_samples=108, random_state=42)
moderate_upsampled = resample(moderate, replace=True, n_samples=108, random_state=42)

df_balanced = pd.concat([high, moderate_upsampled, low_upsampled])
df_balanced = df_balanced.sample(frac=1).reset_index(drop=True)

# Step 3: Encode Categorical Features
gender_encoder = LabelEncoder()
occupation_encoder = LabelEncoder()
location_encoder = LabelEncoder()
stress_encoder = LabelEncoder()

df_balanced['Gender'] = gender_encoder.fit_transform(df_balanced['Gender'])
df_balanced['Occupation'] = occupation_encoder.fit_transform(df_balanced['Occupation'])
df_balanced['Location'] = location_encoder.fit_transform(df_balanced['Location'])
df_balanced['StressLevel'] = stress_encoder.fit_transform(df_balanced['StressLevel'])

# Step 4: Prepare Data
features = ['Age', 'Gender', 'Occupation', 'Location', 'Sleep_Hours',
            'Screen_Time', 'Physical_Activity', 'Stress_Level',
            'PHQ_Score', 'GAD_Score']
X = df_balanced[features]
y = df_balanced['MentalHealthRisk']

# Normalize
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 5: Split & Train
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Step 6: Evaluate
y_pred = model.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Step 7: Save Model + Encoders + Scaler
joblib.dump(model, "model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(gender_encoder, "gender_encoder.pkl")
joblib.dump(occupation_encoder, "occupation_encoder.pkl")
joblib.dump(location_encoder, "location_encoder.pkl")
joblib.dump(stress_encoder, "stress_encoder.pkl")

print("✅ Model and preprocessing tools saved successfully!")


KeyError: "['Sleep_Hours', 'Screen_Time', 'Physical_Activity', 'Stress_Level', 'PHQ_Score', 'GAD_Score'] not in index"

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
import joblib

# Load dataset
df = pd.read_csv("dataset-pp.csv")

# Drop 'Name' column (not used for prediction)
df = df.drop(columns=["Name"])

# Separate features and target
X = df.drop("MentalHealthRisk", axis=1)
y = df["MentalHealthRisk"]

# Encode categorical columns
gender_encoder = LabelEncoder()
occupation_encoder = LabelEncoder()
location_encoder = LabelEncoder()
stress_encoder = LabelEncoder()
target_encoder = LabelEncoder()

X["Gender"] = gender_encoder.fit_transform(X["Gender"])
X["Occupation"] = occupation_encoder.fit_transform(X["Occupation"])
X["Location"] = location_encoder.fit_transform(X["Location"])
X["StressLevel"] = stress_encoder.fit_transform(X["StressLevel"])
y_encoded = target_encoder.fit_transform(y)

# Save target encoder in case you need to decode predictions
joblib.dump(target_encoder, "target_encoder.pkl")

# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=42)

# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Save model and encoders
joblib.dump(model, "model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(gender_encoder, "gender_encoder.pkl")
joblib.dump(occupation_encoder, "occupation_encoder.pkl")
joblib.dump(location_encoder, "location_encoder.pkl")
joblib.dump(stress_encoder, "stress_encoder.pkl")

print("Model training complete and files saved.")


Model training complete and files saved.


In [1]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Predict on test data
y_pred = model.predict(X_test)

# Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

# Classification report
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Confusion matrix
conf_mat = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", conf_mat)

# Visualize confusion matrix
plt.figure(figsize=(6, 4))
sns.heatmap(conf_mat, annot=True, fmt='d', cmap='Blues', xticklabels=model.classes_, yticklabels=model.classes_)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

# Optional regression-style metrics (only if labels are numeric)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_test_encoded = le.fit_transform(y_test)
y_pred_encoded = le.transform(y_pred)

print("\nMean Squared Error:", mean_squared_error(y_test_encoded, y_pred_encoded))
print("Mean Absolute Error:", mean_absolute_error(y_test_encoded, y_pred_encoded))
print("R² Score:", r2_score(y_test_encoded, y_pred_encoded))


NameError: name 'model' is not defined