# Data loading, cleaning, label encoding, model training, and export


In [1]:
import pandas as pd

# Load the raw survey responses.
df = pd.read_csv("survey.csv")

# Keep only the predictor columns used by the model plus the target
# ("treatment", listed last).
columns_to_keep = [
    "Age", "Gender", "self_employed", "family_history", "work_interfere",
    "no_employees", "remote_work", "tech_company", "benefits",
    "care_options", "wellness_program", "seek_help", "anonymity", "leave",
    "mental_health_consequence", "phys_health_consequence",
    "coworkers", "supervisor", "mental_health_interview",
    "phys_health_interview", "mental_vs_physical", "obs_consequence",
    "treatment",
]
df = df[columns_to_keep]

# Drop every row that has a missing value in any of the kept columns.
df = df.dropna()

# Binary-encode the target. Series.map turns any label that is not a key of
# the mapping into NaN, which would silently reintroduce missing values after
# the dropna above, so fail loudly on unexpected labels instead.
target_codes = {"Yes": 1, "No": 0}
unexpected_labels = set(df["treatment"].unique()) - set(target_codes)
if unexpected_labels:
    raise ValueError(
        f"Unexpected 'treatment' labels: {sorted(unexpected_labels, key=str)}"
    )

# assign() returns a new frame (column order is preserved) rather than
# writing into the filtered frame in place.
df = df.assign(treatment=df["treatment"].map(target_codes))

In [2]:
from sklearn.preprocessing import LabelEncoder

# Encode all categorical (object-dtype) columns as integer codes.
#
# A separate LabelEncoder is fitted for each column and kept in `encoders`
# (column name -> fitted encoder). Refitting one shared encoder would leave
# only the last column's mapping available, making it impossible to encode
# new raw answers the same way later or to inverse_transform the codes.
# Persist `encoders` (e.g. with joblib) if inference code needs them.
encoders = {}
for col in df.columns:
    if df[col].dtype == 'object':
        # `le` stays bound to the most recently fitted encoder, as before.
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        encoders[col] = le

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib

# One seed shared by every stochastic step, so a fresh Restart & Run All
# reproduces the same split, the same forest and the same score.
RANDOM_STATE = 42

# Split data: every column except the target is a feature; 20% of the rows
# are held out for evaluation.
X = df.drop("treatment", axis=1)
y = df["treatment"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Train model. Random forests are randomized (bootstrap samples and feature
# sub-sampling), so the estimator is seeded as well; leaving it unseeded
# makes the reported accuracy change from run to run.
model = RandomForestClassifier(random_state=RANDOM_STATE)
model.fit(X_train, y_train)

# Accuracy on the held-out rows
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.8163265306122449


In [4]:
# Persist the fitted classifier to disk.
joblib.dump(model, "model.pkl")

# Persist the feature column names, in training order, so the Streamlit app
# can build its inputs with the same columns the model was fitted on.
feature_names = X.columns.tolist()
joblib.dump(feature_names, "features.pkl")

['features.pkl']