In [3]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_squared_error

# ---------- CONFIG ----------
CSV_PATH = "CarDekho_Dataset1.csv"   # change if needed
TARGET = "selling_price"             # set your target column
OUT_FILE = "best_model.pkl"
# -----------------------------

# Load & basic clean
df = pd.read_csv(CSV_PATH)
df = df.drop_duplicates()
df = df.dropna(subset=[TARGET])      # drop rows missing target
df=df.drop('name',axis=1)
X = df.drop(columns=[TARGET])
y = df[TARGET]

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Column types
num_cols = [c for c in X.columns if np.issubdtype(X[c].dtype, np.number)]
cat_cols = [c for c in X.columns if c not in num_cols]

# Preprocessor
preprocess = ColumnTransformer([
    ("num", Pipeline([
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler())
    ]), num_cols),
    ("cat", Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ]), cat_cols)
])

# Models
models = {
    "LinearRegression": LinearRegression(),
    "RandomForest": RandomForestRegressor(n_estimators=200, random_state=42),
    "SVR": SVR()
}

# Train & evaluate
best_model, best_score = None, -999
for name, model in models.items():
    pipe = Pipeline([("prep", preprocess), ("model", model)])
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    r2 = r2_score(y_test, preds)
    rmse = mean_squared_error(y_test, preds) ** 0.5   # fixed here
    print(f"{name}: R2={r2:.3f}, RMSE={rmse:.2f}")
    if r2 > best_score:
        best_model, best_score = pipe, r2

# Save best
with open(OUT_FILE, "wb") as f:
    pickle.dump(best_model, f)

print(f"✅ Saved best model to {OUT_FILE}")

LinearRegression: R2=0.790, RMSE=18466.31
RandomForest: R2=0.936, RMSE=10195.15
SVR: R2=-0.052, RMSE=41371.70
✅ Saved best model to best_model.pkl
