# In [2]:  (Jupyter notebook cell prompt, kept as a comment so the file parses as Python)
# Car Price Prediction Project by __ SHanmugapandi
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# Step 2: Read cardekho_dataset.csv into a DataFrame and take a first look
df = pd.read_csv("cardekho_dataset.csv")
print("Dataset Shape:", df.shape)
preview_rows = df.head()
print(preview_rows)

# Step 3: (Optional) Count the missing entries in every column
missing_per_column = df.isnull().sum()
print("\nMissing values:\n", missing_per_column)

# Step 4: Handling outliers -- keep only rows strictly below the 99th
# percentile of `selling_price`, then of `km_driven`.
q_price = df["selling_price"].quantile(0.99)
price_below_cutoff = df["selling_price"].lt(q_price)
df = df.loc[price_below_cutoff]

# The km cutoff is taken from the rows that survived the price filter.
q_km = df["km_driven"].quantile(0.99)
km_below_cutoff = df["km_driven"].lt(q_km)
df = df.loc[km_below_cutoff]

# Step 5: Feature Engineering -- two derived columns appended to the frame:
#   power_per_engine      = max_power / engine
#   mileage_engine_factor = mileage * engine
df = df.assign(
    power_per_engine=df["max_power"].div(df["engine"]),
    mileage_engine_factor=df["mileage"].mul(df["engine"]),
)

# Remove the columns that are not used as features before encoding.
df = df.drop(columns=["Unnamed: 0", "car_name"])

# Step 6: One-hot encode the categorical columns (first level of each is
# dropped). The two groups are encoded one after the other so the dummy
# columns are appended in the order: seller/fuel/transmission, then
# brand/model.
categorical_groups = (
    ["seller_type", "fuel_type", "transmission_type"],
    ["brand", "model"],
)
for categorical_group in categorical_groups:
    df = pd.get_dummies(df, columns=categorical_group, drop_first=True)

# Step 7: Separate the feature matrix from the target column
target_prices = df["selling_price"]
X = df.drop(columns=["selling_price"])
y = np.log(target_prices)   # train on the natural log of the price

# Step 8: Hold out 20% of the rows for testing (fixed seed for repeatability)
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
x_train, x_test, y_train, y_test = split_parts

# Steps 9-11: fit three regressors on the training split and print each
# one's R2 on the held-out split. The target `y` is the log of the price,
# so every score below is measured on log prices.

# Step 9: Linear Regression (baseline)
lr = LinearRegression()
lr.fit(x_train, y_train)
y_pred_lr = lr.predict(x_test)
lr_r2 = r2_score(y_test, y_pred_lr)
print("Linear Regression R2:", lr_r2)

# Step 10: Random Forest
rf_params = dict(n_estimators=300, max_depth=15, random_state=42)
rf_model = RandomForestRegressor(**rf_params)
rf_model.fit(x_train, y_train)
y_pred_rf = rf_model.predict(x_test)
rf_r2 = r2_score(y_test, y_pred_rf)
print("Random Forest R2:", rf_r2)

# Step 11: XGBoost (usually best)
xgb_params = dict(n_estimators=500, learning_rate=0.1, max_depth=8, random_state=42)
xgb_model = XGBRegressor(**xgb_params)
xgb_model.fit(x_train, y_train)
y_pred_xgb = xgb_model.predict(x_test)
xgb_r2 = r2_score(y_test, y_pred_xgb)
print("XGBoost R2:", xgb_r2)

# Step 12: Evaluation -- RMSE for the two tree-based models.
# NOTE: y_test and the predictions are log prices, so these errors are in
# log units, not in currency units.
rmse_reports = (
    ("Random Forest RMSE:", y_pred_rf),
    ("XGBoost RMSE:", y_pred_xgb),
)
for report_label, predictions in rmse_reports:
    squared_error = mean_squared_error(y_test, predictions)
    print(report_label, np.sqrt(squared_error))

# Step 13: Choose best model (most likely XGBoost)
# Undo the log transform so both series are prices again
actual = np.exp(y_test)
final_pred = np.exp(y_pred_xgb)

# Step 14: Visualization
# Scatter of actual vs. predicted price on a square figure
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(actual, final_pred, alpha=0.5, color="blue")
ax.set_xlabel("Actual Price")
ax.set_ylabel("Predicted Price")
ax.set_title("Actual vs Predicted Car Price")
plt.show()

# Histogram (with KDE curve) of the selling prices left after filtering
hist_ax = sns.histplot(df["selling_price"], bins=40, kde=True)
hist_ax.set_title("Selling Price Distribution")
plt.show()
# Step 15: Save best model (XGBoost in this case)
# Three artifacts are written to the working directory:
#   best_model.pkl     - the fitted model, serialized with pickle
#   model.pkl          - the same model, serialized with joblib
#   feature_names.pkl  - the ordered list of training column names
import pickle
import joblib

# Save with pickle. This is written exactly once, inside a context manager,
# so the file handle is flushed and closed even if dumping raises.
with open("best_model.pkl", "wb") as f:
    pickle.dump(xgb_model, f)

# Save with joblib (recommended for sklearn/xgboost)
joblib.dump(xgb_model, "model.pkl")

# Save the training feature names (important for prediction alignment later):
# new data must be given the same columns in the same order as X.
joblib.dump(X.columns.tolist(), "feature_names.pkl")

print("Best model saved as best_model.pkl")


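# Notebook output when this cell was run:
#   ModuleNotFoundError: No module named 'xgboost'
# The xgboost package must be installed (e.g. `pip install xgboost`) before running this script.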