In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import pickle

# Training script: loads the dataset, label-encodes the categorical columns,
# trains a RandomForestRegressor to predict 'TimeLeftHome_Minutes', prints
# hold-out metrics, and pickles the model together with the preprocessing
# objects (scaler, label encoders, feature column order).

# Load dataset
df = pd.read_csv("dataset(3).csv")

# Encode categorical variables.
# astype(str) turns missing values into the literal string "nan", so they are
# encoded as their own category instead of raising inside LabelEncoder.
categorical_cols = ['Gender', 'EducationLevel']
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

# Drop unwanted or non-numeric columns (errors='ignore' tolerates their absence)
df.drop(columns=['TimeLeftHome', 'DoctorInCharge'], errors='ignore', inplace=True)

# Drop rows with missing target
df.dropna(subset=['TimeLeftHome_Minutes'], inplace=True)

# Features and target
X = df.drop(columns=['TimeLeftHome_Minutes'])
y = df['TimeLeftHome_Minutes']

# Train-test split FIRST, on the raw features. Imputation values and scaling
# statistics must be learned from the training rows only; computing them on
# the full data (as before) leaks test-set information into preprocessing.
# The split depends only on the row count and random_state, so the same rows
# end up in train/test as when the split was done after scaling.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fill remaining missing values with medians learned from the training split
train_medians = X_train.median(numeric_only=True)
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Normalize numerical features: fit on train, apply the same transform to test.
# index= keeps the feature frames aligned with y_train / y_test.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X.columns, index=X_train.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test), columns=X.columns, index=X_test.index
)

# Keep the full imputed / scaled frames available under their original names
# for any later cell that references them. Only transform() is used here, so
# nothing is re-fitted on held-out rows.
X = X.fillna(train_medians)
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)

# Train Random Forest
model = RandomForestRegressor(n_estimators=200, max_depth=12, random_state=42)
model.fit(X_train, y_train)

# Evaluate on the held-out split
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"MAE: {mae:.2f}, R²: {r2:.4f}")

# Save model, scaler, encoders, and feature columns.
# NOTE(review): the imputation medians (train_medians) are not persisted, so
# inference code cannot reproduce the missing-value fill — consider saving
# them alongside these artifacts.
feature_columns = X.columns.tolist()
artifacts = {
    'model.pkl': model,
    'scaler.pkl': scaler,
    'encoder.pkl': label_encoders,
    'features.pkl': feature_columns,
}
for path, obj in artifacts.items():
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

print("✅ Model, scaler, encoders, and feature columns saved.")


MAE: 98.73, R²: 0.7304
✅ Model, scaler, encoders, and feature columns saved.
