In [None]:
# 📌 Step 1: Import Libraries
import pandas as pd
import numpy as np
import os                          # ✅ Add this line
import joblib
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score



# 📌 Step 2: Load Dataset
# The CSV location is given relative to the notebook's working directory.
file_path = '../data/CAR DETAILS FROM CAR DEKHO.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows as the cell output.
df.head(n=5)


In [None]:
# 📌 Step 3: Data Overview
# A notebook cell only renders the value of its final expression, so the
# intermediate summaries are shown explicitly; as bare expressions they were
# computed and then silently discarded.
# `display` is injected into the namespace by the Jupyter/IPython kernel.

# Column dtypes and non-null counts (info() prints its own report).
df.info()

# Summary statistics of the numeric columns.
display(df.describe())

# Number of missing values per column.
display(df.isnull().sum())

# Number of fully duplicated rows (rendered as the cell's output).
df.duplicated().sum()


In [None]:
# 📌 Step 4: Data Cleaning
# Remove exact duplicate rows first, then every row that still has a missing value.
df = df.drop_duplicates().dropna()

# Standardize column names: trim whitespace, lower-case, spaces -> underscores.
normalized_names = df.columns.str.strip().str.lower().str.replace(" ", "_")
df.columns = normalized_names

# Fix a known header typo and pluralize the owner column.
# rename() skips mapping keys that are not present, so no membership checks are needed.
column_fixes = {'seeler_type': 'seller_type', 'owner': 'owners'}
df = df.rename(columns=column_fixes)


In [None]:
# 📌 Step 5: Feature Engineering
# Year that car ages are measured against. Kept at the original value so the
# derived feature is unchanged; any code that feeds the saved model presumably
# has to use the same reference year -- TODO confirm against the consumer.
REFERENCE_YEAR = 2025

# Fallback codes for textual ownership labels, used only when the column
# cannot be cast to int directly.
# NOTE(review): the labels and their numeric codes are an assumption about the
# CSV contents -- verify against the data and against what the consumer of the
# saved model expects.
OWNER_LABEL_TO_COUNT = {
    'test drive car': 0,
    'first owner': 1,
    'second owner': 2,
    'third owner': 3,
    'fourth & above owner': 4,
}

# Guarded so the cell can be re-run: after the first run 'year' is gone and
# 'car_age' already exists. On a first run a missing 'year' still raises KeyError.
if 'car_age' not in df.columns:
    df['car_age'] = REFERENCE_YEAR - df['year']

# errors='ignore' keeps a second run from failing on already-dropped columns.
df = df.drop(columns=['year', 'name'], errors='ignore')

try:
    # Original behavior: works when the column is numeric or holds integer strings.
    df['owners'] = df['owners'].astype(int)
except (ValueError, TypeError):
    # Text labels: normalize and translate them through the lookup table.
    owner_labels = df['owners'].astype(str).str.strip().str.lower()
    owner_codes = owner_labels.map(OWNER_LABEL_TO_COUNT)
    unmapped = owner_codes.isna()
    if unmapped.any():
        # Fail loudly instead of training on NaN / partially converted data.
        unknown_labels = sorted(owner_labels[unmapped].unique())
        raise ValueError(
            f"Unrecognized values in 'owners' column: {unknown_labels}"
        )
    df['owners'] = owner_codes.astype(int)


In [None]:
# 📌 Step 6: Encode Categorical Variables
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each column from the generated indicator columns.
categorical_columns = ['fuel', 'seller_type', 'transmission']
df = pd.get_dummies(data=df, columns=categorical_columns, drop_first=True)

# Preview the encoded frame.
df.head(n=5)


In [None]:
# 📌 Step 7: Feature & Label Split
# The label is the selling price; every remaining column is a feature.
y = df['selling_price']
X = df.drop(columns='selling_price')

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# 📌 Step 8: Train Linear Regression Model
# fit() returns the estimator itself, so construction and training are chained.
model = LinearRegression().fit(X_train, y_train)

# Score the held-out rows and report R² to four decimals.
y_pred = model.predict(X_test)
test_r2 = r2_score(y_test, y_pred)
print(f"R2 Score: {test_r2:.4f}")


In [None]:
# 📌 Step 9: Save Model and Columns
# Create the output directory if it does not exist yet.
os.makedirs('../model', exist_ok=True)

# The fitted estimator is serialized with joblib.
joblib.dump(model, '../model/car_price_model2.pkl')

# The column names of X are pickled separately as a plain list.
feature_columns = X.columns.tolist()
with open('../model/columns.pkl', 'wb') as columns_file:
    pickle.dump(feature_columns, columns_file)

print("✅ Model and columns saved successfully.")


NameError: name 'os' is not defined