In [None]:
#  Step 1: Import Libraries
import pandas as pd
import numpy as np
import os                         
import joblib
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score



#  Step 2: Load Dataset
file_path = '../data/CAR DETAILS FROM CAR DEKHO.csv'
df = pd.read_csv(file_path)
df.head()


In [None]:
# Step 3: Data Overview
df.info()
df.describe()
df.isnull().sum()
df.duplicated().sum()


In [None]:
#  Step 4: Data Cleaning
df = df.drop_duplicates()
df = df.dropna()

# Standardize column names
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

# Rename if typos exist
if 'seeler_type' in df.columns:
    df.rename(columns={'seeler_type': 'seller_type'}, inplace=True)
if 'owner' in df.columns:
    df.rename(columns={'owner': 'owners'}, inplace=True)


In [None]:
#  Step 5: Feature Engineering
df['car_age'] = 2025 - df['year']
df.drop(['year', 'name'], axis=1, inplace=True)

#  Map string labels in 'owners' column to integers
owner_mapping = {
    'Test Drive Car': 0,
    'First Owner': 1,
    'Second Owner': 2,
    'Third Owner': 3,
    'Fourth & Above Owner': 4
}
df['owners'] = df['owners'].map(owner_mapping)

# 🧹 Drop rows with unexpected/unmapped owner values
df = df[df['owners'].notnull()]
df['owners'] = df['owners'].astype(int)


In [None]:
#  Step 6: Encode Categorical Variables
df = pd.get_dummies(df, columns=['fuel', 'seller_type', 'transmission'], drop_first=True)
df.head()


In [None]:
#  Step 7: Feature & Label Split
X = df.drop('selling_price', axis=1)
y = df['selling_price']

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
#  Step 8: Train Linear Regression Model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict and Evaluate
y_pred = model.predict(X_test)
print(f"R2 Score: {r2_score(y_test, y_pred):.4f}")


In [None]:
#  Step 9: Save Model and Columns
os.makedirs('../model', exist_ok=True)


joblib.dump(model, '../model/car_price_model2.pkl')

with open('../model/columns.pkl', 'wb') as f:
    pickle.dump(X.columns.tolist(), f)

print("✅ Model and columns saved successfully.")


NameError: name 'model' is not defined

In [1]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import os
import joblib
import pickle

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor  # Changed to RF
from sklearn.metrics import r2_score

# Step 2: Load Dataset
file_path = '../data/CAR DETAILS FROM CAR DEKHO.csv'
df = pd.read_csv(file_path)

# Step 3: Data Cleaning & Standardization
df = df.drop_duplicates()
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

# Step 4: Feature Engineering
# Extract brand from name
df['brand'] = df['name'].apply(lambda x: x.split()[0])
df['car_age'] = 2025 - df['year']

# Log transform skewed variables
df['km_driven'] = np.log1p(df['km_driven'])
df['selling_price'] = np.log1p(df['selling_price'])

# Clean up columns
df.drop(['year', 'name'], axis=1, inplace=True)

# Owner mapping
owner_mapping = {
    'Test Drive Car': 0,
    'First Owner': 1,
    'Second Owner': 2,
    'Third Owner': 3,
    'Fourth & Above Owner': 4
}
df['owner'] = df['owner'].map(owner_mapping)
df = df[df['owner'].notnull()]
df['owner'] = df['owner'].astype(int)

# Step 5: One-Hot Encoding (with drop_first to avoid multicollinearity)
df = pd.get_dummies(df, columns=['fuel', 'seller_type', 'transmission', 'brand'], drop_first=True)

# Step 6: Feature-Label Split
X = df.drop('selling_price', axis=1)
y = df['selling_price']

# Step 7: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 8: Train Model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 9: Evaluate Model
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f"✅ R2 Score: {r2:.4f} ({r2*100:.2f}%)")

# Step 10: Save Model and Columns
os.makedirs('../model', exist_ok=True)
joblib.dump(model, '../model/car_price_model2.pkl')

with open('../model/columns.pkl', 'wb') as f:
    pickle.dump(X.columns.tolist(), f)

print("✅ Model and columns saved successfully.")



✅ R2 Score: 0.7598 (75.98%)
✅ Model and columns saved successfully.


In [2]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import os
import joblib
import pickle
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import r2_score
from xgboost import XGBRegressor

# Step 2: Load Dataset
file_path = '../data/CAR DETAILS FROM CAR DEKHO.csv'
df = pd.read_csv(file_path)

# Step 3: Data Cleaning
df.drop_duplicates(inplace=True)
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

# Step 4: Feature Engineering
df['brand'] = df['name'].apply(lambda x: x.split()[0])
df['car_age'] = 2025 - df['year']

# Remove outliers in price
q1 = df['selling_price'].quantile(0.25)
q3 = df['selling_price'].quantile(0.75)
iqr = q3 - q1
df = df[(df['selling_price'] >= q1 - 1.5 * iqr) & (df['selling_price'] <= q3 + 1.5 * iqr)]

# Log transform target and driven km
df['selling_price'] = np.log1p(df['selling_price'])
df['km_driven'] = np.log1p(df['km_driven'])

# Drop irrelevant
df.drop(['name', 'year'], axis=1, inplace=True)

# Owner mapping
owner_map = {
    'Test Drive Car': 0,
    'First Owner': 1,
    'Second Owner': 2,
    'Third Owner': 3,
    'Fourth & Above Owner': 4
}
df['owner'] = df['owner'].map(owner_map)
df = df[df['owner'].notnull()]
df['owner'] = df['owner'].astype(int)

# Group rare brands
brand_counts = df['brand'].value_counts()
rare_brands = brand_counts[brand_counts < 10].index
df['brand'] = df['brand'].apply(lambda x: 'Other' if x in rare_brands else x)

# New interaction features
df['price_per_km'] = np.expm1(df['selling_price']) / np.expm1(df['km_driven'])
df['age_km'] = df['car_age'] * df['km_driven']

# One-hot encoding
df = pd.get_dummies(df, columns=['fuel', 'seller_type', 'transmission', 'brand'], drop_first=True)

# Step 5: Feature-Label Split
X = df.drop('selling_price', axis=1)
y = df['selling_price']

# Step 6: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Hyperparameter Tuning (Randomized Search)
xgb = XGBRegressor(random_state=42)
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.8, 1.0]
}

search = RandomizedSearchCV(xgb, param_grid, n_iter=15, cv=3, scoring='r2', random_state=42, verbose=0)
search.fit(X_train, y_train)
best_model = search.best_estimator_

# Step 8: Evaluate Model
y_pred = best_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f"\n✅ Final R² Score: {r2:.4f} ({r2*100:.2f}%)")
print("🎯 Best Hyperparameters:", search.best_params_)

# Step 9: Save Model
os.makedirs('../model', exist_ok=True)
joblib.dump(best_model, '../model/car_price_model_optimized.pkl')

with open('../model/columns.pkl', 'wb') as f:
    pickle.dump(X.columns.tolist(), f)

print("\n✅ Optimized model and feature columns saved successfully.")



✅ Final R² Score: 0.9882 (98.82%)
🎯 Best Hyperparameters: {'subsample': 0.8, 'n_estimators': 500, 'max_depth': 5, 'learning_rate': 0.1}

✅ Optimized model and feature columns saved successfully.
