In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error
from datetime import datetime

### Load and Initial Cleaning of Data

In [None]:
print("Loading and Initial Cleaning")
# Read the raw listings. Every later cell depends on `df`, so a missing file
# must stop execution here: print the hint, then re-raise instead of falling
# through to a confusing NameError on the first use of `df`.
try:
    df = pd.read_csv('cardekho_dataset.csv')
except FileNotFoundError:
    print("Error: Make sure 'cardekho_dataset.csv' is in the same directory.")
    raise
else:
    print("✅ 'cardekho_dataset.csv' loaded successfully!")

# Drop 'Unnamed: 0' when present (presumably an index column written out by
# an earlier DataFrame.to_csv call — confirm against the CSV).
if 'Unnamed: 0' in df.columns:
    df = df.drop(columns='Unnamed: 0')

# The notebook treats 'model' as redundant and removes it before training.
if 'model' in df.columns:
    df = df.drop(columns='model')
    print("👍 Redundant 'model' column removed.")

### Full Data Preparation

In [None]:
print("\nPreparing All Columns")
# Map the raw dataset headers onto the names used by the rest of the notebook.
# Reassigning (instead of inplace=True) keeps the step explicit and chainable.
df = df.rename(columns={
    'brand': 'company', 'car_name': 'name', 'selling_price': 'price', 'km_driven': 'kms_driven',
    'transmission_type': 'transmission', 'mileage': 'mileage_kmpl',
    'engine': 'engine_cc', 'max_power': 'max_power_bhp'
})

# Derive a 'year' column from the vehicle's age.
# NOTE(review): this uses the year in which the notebook is executed, so the
# derived values shift when the notebook is re-run in a later year. Consider
# pinning the dataset's collection year as a constant — confirm with the data source.
df['year'] = datetime.now().year - df['vehicle_age']

# Strip unit suffixes (if any) and coerce to numbers; unparseable values become NaN.
# regex=False makes it explicit that the suffixes are literal text, not patterns.
df['engine_cc'] = pd.to_numeric(
    df['engine_cc'].astype(str).str.replace(' CC', '', regex=False), errors='coerce'
)
df['max_power_bhp'] = pd.to_numeric(
    df['max_power_bhp'].astype(str).str.replace(' bhp', '', regex=False), errors='coerce'
)
# Mileage keeps only the token before the first space.
df['mileage_kmpl'] = pd.to_numeric(df['mileage_kmpl'].astype(str).str.split(' ').str[0], errors='coerce')

# Fill missing numeric values with each column's median.
# Assign the result back: `df[col].fillna(..., inplace=True)` is chained
# assignment, which emits a FutureWarning on pandas 2.x and does not modify
# `df` at all once Copy-on-Write is active.
for col in ['engine_cc', 'max_power_bhp', 'mileage_kmpl', 'seats']:
    df[col] = df[col].fillna(df[col].median())

# Keep only the columns used for modelling; .copy() makes final_df independent of df.
final_df = df[['company', 'name', 'year', 'price', 'kms_driven', 'fuel_type', 'transmission', 'seller_type', 'mileage_kmpl', 'engine_cc', 'max_power_bhp', 'seats']].copy()
print("Data cleaning complete.")

### Feature Engineering (One-Hot Encoding)

In [None]:
print("\nPerforming One-Hot Encoding")
# Expand each listed categorical column into indicator columns; all other
# columns of final_df are carried over unchanged.
categorical_cols = ['company', 'name', 'fuel_type', 'transmission', 'seller_type']
model_ready_df = pd.get_dummies(final_df, columns=categorical_cols)
print("One-Hot Encoding complete.")

### Split Data into Training and Testing Sets

In [None]:
print("\nSplitting Data")
# 'price' is the regression target; every remaining column is a feature.
y = model_ready_df['price']
X = model_ready_df.drop(columns=['price'])
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(f"Data split complete")

### Train the XGBoost Model

In [None]:
print("\nTraining the Model")
print("Starting model training")
# Hyperparameters are gathered in one mapping so they can be read and tuned in one place.
xgb_params = dict(
    device='cpu',
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=5,
    random_state=42,  # fixed seed for repeatable training
    n_jobs=-1,
)
model = xgb.XGBRegressor(**xgb_params)
# Fit on the training partition only; the test partition is kept for evaluation.
model.fit(X_train, y_train)
print("Model training complete!")

### Evaluate Model Performance

In [None]:
print("\nEvaluating Model Performance")
# Score the held-out rows and compare the predictions with the true prices.
predictions = model.predict(X_test)
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
# Report R2 first, then MAE formatted as a rupee amount.
print(f"R-squared (R2): {r2:.3f}")
print(f"Mean Absolute Error (MAE): ₹{mae:,.2f}")

### Save the Trained Model

In [None]:
print("\nSaving the Model")
# Serialise the fitted regressor to disk with joblib.
# NOTE(review): joblib/pickle artefacts are tied to the library versions that
# produced them; if this file must outlive an xgboost upgrade, consider also
# exporting with the model's native save_model — confirm with the consumer.
model_path = 'cardekho_dealer_model.joblib'
joblib.dump(model, model_path)
print("\nTrained dealer model has been saved as 'cardekho_dealer_model.joblib'")