In [None]:
#import libraries
import joblib
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Read the raw survey responses from disk into a DataFrame
file_path = '/content/survey.csv'  # Update this path if needed
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Narrow the frame to the survey fields used downstream: the predictors,
# the free-text "comments" column and the "treatment" target.
columns_to_keep = [
    "Age",
    "Gender",
    "family_history",
    "work_interfere",
    "no_employees",
    "leave",
    "mental_health_consequence",
    "phys_health_consequence",
    "supervisor",
    "mental_vs_physical",
    "obs_consequence",
    "comments",
    "treatment",
]
df = df.loc[:, columns_to_keep]

In [None]:
# Clean gender column
def clean_gender(gender):
    """Collapse a free-text gender answer into 'Male', 'Female' or 'Other'.

    Matching is case-insensitive and ignores surrounding whitespace, so
    answers such as "Male " or " f" are recognised instead of falling
    through to 'Other'.

    Args:
        gender: Raw survey answer. Anything that is not a string (for example
            None or a float NaN from an empty cell) is treated as unrecognised.

    Returns:
        'Male' for "male"/"m", 'Female' for "female"/"f", otherwise 'Other'.
    """
    # Non-string input has no .strip()/.lower(); classify it rather than crash.
    if not isinstance(gender, str):
        return 'Other'

    normalized = gender.strip().lower()
    if normalized in ('male', 'm'):
        return 'Male'
    if normalized in ('female', 'f'):
        return 'Female'
    return 'Other'


In [None]:
# Normalise the free-text gender answers. astype(str) turns a missing value
# into the string "nan", which clean_gender maps to 'Other'.
df['Gender'] = df['Gender'].astype(str).apply(clean_gender)

# Remove rows with a missing target ('treatment') BEFORE filling anything.
# If the fill ran first, a missing 'treatment' would already have become
# "Unknown", dropna would find nothing to remove, and the placeholder would
# end up in the target column.
df = df.dropna(subset=['treatment'])

# Fill the remaining missing values with "Unknown". Plain assignment is used
# instead of inplace=True so the cell does not mutate a frame in place.
df = df.fillna("Unknown")

In [None]:
# Encode categorical variables
label_cols = ["Gender", "family_history", "work_interfere", "no_employees", "leave",
              "mental_health_consequence", "phys_health_consequence", "supervisor",
              "mental_vs_physical", "obs_consequence"]

# Fit one encoder per column and keep each of them. Refitting a single shared
# LabelEncoder discards every mapping except the last column's, which makes it
# impossible to encode new input the same way the training data was encoded.
label_encoders = {}
for col in label_cols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder

# Persist the fitted encoders (keyed by column name) so inference code can
# apply the exact category -> integer mapping used for training.
joblib.dump(label_encoders, "mental_health_label_encoders.joblib")

In [None]:
# Convert target variable "treatment" to binary (Yes -> 1, No -> 0)
treatment_codes = {'Yes': 1, 'No': 0}
treatment_encoded = df['treatment'].map(treatment_codes)

# Series.map turns every value that is not a key of the mapping into NaN.
# Fail here with a clear message instead of passing NaN labels on to training
# (this also catches re-running the cell on an already-encoded column).
unmapped_values = df.loc[treatment_encoded.isna(), 'treatment'].unique()
if len(unmapped_values) > 0:
    raise ValueError(
        f"Unexpected 'treatment' values (expected 'Yes' or 'No'): {list(unmapped_values)}"
    )

df['treatment'] = treatment_encoded.astype('int64')

In [None]:
# Separate the target from the predictors; the free-text 'comments' column
# is dropped along with the target so it is not used as a feature.
y = df['treatment']
X = df.drop(['treatment', 'comments'], axis=1)

In [None]:
# Standardizing numerical features
#
# The scaler is fitted on the training rows only. Fitting it on every row
# would let the mean/variance of the held-out test rows influence the
# preprocessing (data leakage) and make the reported accuracy optimistic.
#
# The training rows are found by splitting the row positions with the SAME
# test_size and random_state as the train/test split cell below. For a given
# number of rows and integer random_state the shuffle is reproducible, so both
# calls pick the same rows.
# NOTE: keep these two arguments in sync with that split; if they diverge the
# scaler will again see some test rows.
train_positions, _ = train_test_split(np.arange(len(X)), test_size=0.2, random_state=42)

scaler = StandardScaler()
scaler.fit(X.iloc[train_positions])

# Transform every row with the train-only statistics; the later split then
# slices this array into train and test parts.
X_scaled = scaler.transform(X)

In [None]:
# Hold out 20% of the rows for testing and train on the remaining 80%;
# the fixed random_state keeps the partition reproducible.
split_parts = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Candidate classifiers to compare, keyed by the name used in reports
models = dict(
    RandomForest=RandomForestClassifier(n_estimators=100, random_state=42),
    KNN=KNeighborsClassifier(),
    LogisticRegression=LogisticRegression(),
)

In [None]:
# Running record of the top scorer so far: its name, its accuracy and the
# estimator object itself. Nothing has been evaluated yet.
best_model_name, best_accuracy, best_model = None, 0, None

In [None]:
# Fit every candidate on the training split, score it on the held-out split,
# and remember whichever candidate scores highest.
for name, model in models.items():
    # fit() returns the estimator itself, so prediction can be chained onto it
    y_pred = model.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.2f}")

    # Only a strictly higher score replaces the current leader
    if not accuracy > best_accuracy:
        continue
    best_model_name, best_model, best_accuracy = name, model, accuracy

RandomForest Accuracy: 0.80
KNN Accuracy: 0.69
LogisticRegression Accuracy: 0.69


In [None]:
# Persist the winning estimator (and only that one) so it can be reloaded later
joblib.dump(value=best_model, filename="best_mental_health_model.joblib")

['best_mental_health_model.joblib']

In [None]:
# Persist the fitted scaler so the backend can standardise new user input
# the same way the training features were standardised
joblib.dump(value=scaler, filename="mental_health_scaler.joblib")


['mental_health_scaler.joblib']

In [None]:
# Record the winning model's name and accuracy in a small text file
performance_report = f"Best Model: {best_model_name}\nAccuracy: {best_accuracy:.2f}\n"
with open("best_model_performance.txt", "w") as log_file:
    log_file.write(performance_report)

In [None]:
# Tell the user which model was saved and how it scored
confirmation_message = f"✅ Best model ({best_model_name}) saved successfully with accuracy: {best_accuracy:.2f}"
print(confirmation_message)

✅ Best model (RandomForest) saved successfully with accuracy: 0.80
