# Data loading + Cleaning + Label encoding + Model training


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import joblib
import os

# Load the raw survey responses.
df = pd.read_csv('survey.csv')

# Normalise the header (trim whitespace, lower-case) so the column lookups
# below do not depend on the exact spelling used in the CSV.
df.columns = df.columns.str.strip().str.lower()

# Model inputs; 'treatment' is the prediction target.
features = [
    'age', 'gender', 'family_history', 'benefits',
    'care_options', 'anonymity', 'leave', 'work_interfere'
]

# Take an explicit copy of the column subset: the assignments and the in-place
# dropna below would otherwise run on a frame derived from a selection, which
# raises SettingWithCopyWarning on pandas versions without copy-on-write.
df = df[features + ['treatment']].copy()

# Clean gender: trim and lower-case the free-text answers, collapse the known
# spellings onto 'male' / 'female', and bucket everything else as 'other'.
# Stripping first keeps values such as "Male " from landing in 'other'.
# NOTE: a missing gender is not in the allowed set either, so it also becomes
# 'other' here and therefore survives the dropna below.
gender = df['gender'].str.strip().str.lower()
gender = gender.replace(['male', 'm', 'man'], 'male')
gender = gender.replace(['female', 'f', 'woman'], 'female')
df['gender'] = gender.where(gender.isin(['male', 'female']), 'other')

# Drop rows that still have a missing value in any selected column.
df.dropna(inplace=True)

# Encode every non-numeric column (features and the target alike) as integer
# codes, keeping the fitted encoder for each column in `label_encoders`.
# Testing for "not numeric" rather than `dtype == 'object'` also catches text
# columns that pandas stores with its dedicated string dtype or as
# categoricals, which would otherwise reach the model unencoded.
label_encoders = {}
for col in df.columns:
    if pd.api.types.is_numeric_dtype(df[col]):
        continue
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Save one encoder file per encoded column under encoders/, so the same
# value-to-code mapping can be reloaded later.
os.makedirs('encoders', exist_ok=True)
for col, le in label_encoders.items():
    joblib.dump(le, f'encoders/{col}_encoder.pkl')

# Split data: hold out 20% of the rows; the fixed seed makes the split
# repeatable across runs.
X = df[features]
y = df['treatment']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train model. The forest is seeded with the same value as the split so that
# re-running this cell on the same data produces the same model.pkl; without
# a seed the bootstrap samples and feature subsets differ on every run.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Save the fitted model together with the ordered feature list it was
# trained on.
joblib.dump(model, 'model.pkl')
joblib.dump(features, 'features.pkl')

['features.pkl']