In [7]:
import pickle

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Read the final dataset, keeping only rows without any missing value
df = pd.read_csv("dataset_1400.csv").dropna()

# Feature groups: the listed columns are one-hot encoded; every remaining
# column other than the target is treated as numerical and standardized.
categorical = ['Brand', 'Model', 'Fuel_Type', 'Transmission']
non_numerical = set(categorical) | {'Resale_Value'}
numerical = [name for name in df.columns if name not in non_numerical]

# Split the frame into the feature matrix and the target column
y = df["Resale_Value"]
X = df.drop(columns="Resale_Value")

# Preprocessing: one transformer per feature group
encode_step = ("onehot", OneHotEncoder(handle_unknown="ignore"), categorical)
scale_step = ("scale", StandardScaler(), numerical)
preprocessor = ColumnTransformer([encode_step, scale_step])

# Define K-Fold
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# R² has no lower bound, so start from -inf rather than -1; otherwise a run
# where every fold scores below -1 would never select a pipeline.
best_r2 = -np.inf
best_pipeline = None

# Train over folds
for fold, (train_idx, test_idx) in enumerate(kf.split(X, y), 1):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Pipeline does not copy its steps, so give every fold its own unfitted
    # clone of the preprocessor. Sharing one instance would refit it on each
    # fold, leaving the pipeline kept from an earlier fold with a transformer
    # that no longer matches the forest it was trained with.
    pipeline = Pipeline([
        ("preprocessor", clone(preprocessor)),
        ("model", RandomForestRegressor(n_estimators=200, random_state=42))
    ])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Held-out metrics for this fold
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    print(f"📁 Fold {fold} — R²: {r2:.4f} | MAE: ₹{int(mae)} | RMSE: ₹{int(rmse)}")

    # Keep the pipeline with the highest held-out R² seen so far
    if r2 > best_r2:
        best_r2 = r2
        best_pipeline = pipeline

# Refuse to write a useless artifact if no fold produced a comparable score
# (for example when R² came out as NaN for every fold).
if best_pipeline is None:
    raise RuntimeError("No fold produced a valid R² score; nothing to pickle.")

# ✅ Save the best pipeline
with open("random_forest_final.pkl", "wb") as f:
    pickle.dump(best_pipeline, f)

print("✅ Best model pickled as 'random_forest_final.pkl'")


📁 Fold 1 — R²: 0.9559 | MAE: ₹50131 | RMSE: ₹110824
📁 Fold 2 — R²: 0.9538 | MAE: ₹48336 | RMSE: ₹103501
📁 Fold 3 — R²: 0.9350 | MAE: ₹62778 | RMSE: ₹145333
📁 Fold 4 — R²: 0.9260 | MAE: ₹51363 | RMSE: ₹123309
📁 Fold 5 — R²: 0.9414 | MAE: ₹62423 | RMSE: ₹127477
✅ Best model pickled as 'random_forest_final.pkl'


In [8]:
import pickle

# Save final feature names for Streamlit chart: these are the output column
# names reported by the fitted preprocessor of the selected pipeline.
fitted_preprocessor = best_pipeline.named_steps["preprocessor"]
final_feature_names = fitted_preprocessor.get_feature_names_out()

with open("feature_names.pkl", "wb") as f:
    pickle.dump(final_feature_names, f)


In [9]:
# Save categories seen by OneHotEncoder
encoder = pipeline.named_steps['preprocessor'].named_transformers_['onehot']
with open("category_levels.pkl", "wb") as f:
    pickle.dump(encoder.categories_, f)