<a href="https://colab.research.google.com/github/WaleedAhmed565/Ai-lab1/blob/main/new_fyp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q xgboost lightgbm catboost scikit-learn pandas joblib


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge
from sklearn.compose import TransformedTargetRegressor

import xgboost as xgb
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import joblib

RND = 42
CURRENT_YEAR = 2025
TARGET = 'price'

In [3]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [4]:

DATA_PATH = "/content/drive/MyDrive/pakwheels_pakistan_automobile_dataset.csv" # <- change to your uploaded filename
assert Path(DATA_PATH).exists(), f"Data file not found: {DATA_PATH}"


df = pd.read_csv(DATA_PATH)
print('Loaded:', df.shape)
print(df.head())

Loaded: (48189, 16)
                 manufacturer     price       city  model  mileage fuel_type  \
0    Honda N One Premium 2014   2650000     Lahore   2014    82000    Petrol   
1            Nissan Note 2020   5400000     Lahore   2020    59000    Hybrid   
2  Suzuki Vitara GLX 1.6 2017         0    Karachi   2017    67000    Petrol   
3     Toyota Yaris Cross 2021   7850000     Lahore   2021    41000    Hybrid   
4       BMW X1 sDrive18i 2017  10700000  Islamabad   2017    37000    Petrol   

  transmission     registered   color  assembly  engine_capacity post_date  \
0    Automatic         Lahore    Blue  Imported              660  5/4/2024   
1    Automatic  Un-Registered  Silver  Imported             1200  5/4/2024   
2    Automatic        Karachi    Grey  Imported             1600  5/4/2024   
3    Automatic  Un-Registered   Beige  Imported             1500  5/4/2024   
4    Automatic      Islamabad   White  Imported             1500  5/4/2024   

  price_category price_categor

In [5]:
print(df.dtypes)
print(df.isnull().sum())


manufacturer        object
price                int64
city                object
model                int64
mileage              int64
fuel_type           object
transmission        object
registered          object
color               object
assembly            object
engine_capacity      int64
post_date           object
price_category      object
price_category.1    object
price_category.2    object
price_category.3     int64
dtype: object
manufacturer        0
price               0
city                0
model               0
mileage             0
fuel_type           0
transmission        0
registered          0
color               0
assembly            0
engine_capacity     0
post_date           0
price_category      0
price_category.1    0
price_category.2    0
price_category.3    0
dtype: int64


In [6]:
# Cell 5 — Clean Data

# Check missing values
df.isnull().sum()

# Drop rows with missing values
df = df.dropna()

# Remove unrealistic prices
df = df[df['price'] > 1000]

# --- FIX: model ko string bana dete hain ---
df['model'] = df['model'].astype(str)

# Extract Year from 'model' column (e.g., "Civic 2018")
df['year'] = df['model'].str.extract(r'(\d{4})')  # extract 4-digit year

# Convert year to numeric
df['year'] = pd.to_numeric(df['year'], errors='coerce')

# Remove rows where year couldn't be extracted
df = df.dropna(subset=['year'])

# Convert year to int
df['year'] = df['year'].astype(int)

# Remove unrealistic years
df = df[(df['year'] > 1980) & (df['year'] <= CURRENT_YEAR)]

# Feature engineering: Car age
df['car_age'] = CURRENT_YEAR - df['year']

df.head()


Unnamed: 0,manufacturer,price,city,model,mileage,fuel_type,transmission,registered,color,assembly,engine_capacity,post_date,price_category,price_category.1,price_category.2,price_category.3,year,car_age
0,Honda N One Premium 2014,2650000,Lahore,2014,82000,Petrol,Automatic,Lahore,Blue,Imported,660,5/4/2024,Medium,Medium,Saturday,10,2014,11
1,Nissan Note 2020,5400000,Lahore,2020,59000,Hybrid,Automatic,Un-Registered,Silver,Imported,1200,5/4/2024,High,Medium,Saturday,4,2020,5
3,Toyota Yaris Cross 2021,7850000,Lahore,2021,41000,Hybrid,Automatic,Un-Registered,Beige,Imported,1500,5/4/2024,High,Low,Saturday,3,2021,4
4,BMW X1 sDrive18i 2017,10700000,Islamabad,2017,37000,Petrol,Automatic,Islamabad,White,Imported,1500,5/4/2024,High,Low,Saturday,7,2017,8
5,Honda City Aspire Prosmatec 1.5 i-VTEC 2016,3600000,Karachi,2016,45000,Petrol,Automatic,Karachi,Unlisted,Local,1500,5/4/2024,Medium,Low,Saturday,8,2016,9


In [7]:
if 'price' not in df.columns:
    raise ValueError('Dataset must contain `price` column')

# Remove non-positive prices
df = df[df['price'] > 0].copy()

# Log-transform the target to stabilize variance and help models learn multiplicative depreciation
# We'll use TransformedTargetRegressor later so here's just a check for outliers before transform
q_low, q_high = df['price'].quantile(0.01), df['price'].quantile(0.99)
df = df[(df['price'] >= q_low) & (df['price'] <= q_high)]
print('After outlier trim:', df.shape)


After outlier trim: (46784, 18)


In [8]:
#Cell 7: Train/test split
TARGET = 'price'
X = df.drop(columns=[TARGET])
y = df[TARGET]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=RND)
print('Train shape:', X_train.shape, 'Test shape:', X_test.shape)

Train shape: (37427, 17) Test shape: (9357, 17)


In [9]:
# Cell 8: Identify numerical & categorical features (customize as per your dataset)
# Keep 'year' as numeric if you want, but car_age is most important
numeric_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
# remove any numeric that are actually identifiers
numeric_features = [c for c in numeric_features if c not in ['some_id_column_if_any']]


categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()


print('Numeric:', numeric_features)
print('Categorical:', categorical_features)

Numeric: ['mileage', 'engine_capacity', 'price_category.3', 'year', 'car_age']
Categorical: ['manufacturer', 'city', 'model', 'fuel_type', 'transmission', 'registered', 'color', 'assembly', 'post_date', 'price_category', 'price_category.1', 'price_category.2']


In [10]:
# Cell 9: Preprocessing pipelines
from sklearn.preprocessing import OneHotEncoder

# Numeric features: impute missing values and scale
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical features: impute missing values and one-hot encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))  # use sparse_output for sklearn >=1.2
])

# Combine numeric and categorical transformers
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
], remainder='drop')


In [16]:
# Cell 10: Model Definition
xgb_reg = xgb.XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    random_state=RND,
    n_jobs=4,
    tree_method='hist',  # CPU-safe
    verbosity=0
)

model_pipeline = Pipeline([
    ('preproc', preprocessor),
    ('regressor', TransformedTargetRegressor(
        regressor=xgb_reg,
        func=np.log,
        inverse_func=np.exp
    ))
])

In [17]:
print("Training started... this may take several minutes depending on dataset size.")
model_pipeline.fit(X_train, y_train)
print("Training completed.")


Training started... this may take several minutes depending on dataset size.
Training completed.


In [18]:
from sklearn.metrics import mean_squared_error, r2_score

y_train_pred = model_pipeline.predict(X_train)
y_test_pred = model_pipeline.predict(X_test)

mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)

r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)

print(f"Train RMSE: {np.sqrt(mse_train):.2f}, R²: {r2_train:.3f}")
print(f"Test RMSE: {np.sqrt(mse_test):.2f}, R²: {r2_test:.3f}")


Train RMSE: 1124546.84, R²: 0.905
Test RMSE: 1097128.43, R²: 0.898


In [19]:
import joblib
ARTIFACT_DIR = Path('artifacts')
ARTIFACT_DIR.mkdir(exist_ok=True)
joblib.dump(model_pipeline, ARTIFACT_DIR / 'car_price_model.joblib')
print("Model saved to artifacts/car_price_model.joblib")


Model saved to artifacts/car_price_model.joblib
