In [7]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

In [8]:
file_path = "pakwheels_cars_final.csv"

In [9]:
data = pd.read_csv(file_path)
df = pd.DataFrame(data)
df.head(10)

Unnamed: 0,title,price,year,mileage,fuel_type,engine_capacity,transmission,link
0,Nissan Clipper 2022 for sale in Karachi,"PKR 2,625,000",2022.0,"101,000 km",Petrol,660 cc,Automatic,https://www.pakwheels.com/used-cars/nissan-cli...
1,KIA Sportage 2023 for sale in Lahore,"PKR 6,295,000",2023.0,"77,000 km",Petrol,2000 cc,Automatic,https://www.pakwheels.com/used-cars/kia-sporta...
2,Smart Smart Fortwo 2007 for sale in Lahore,"PKR 2,700,000",2007.0,"53,829 km",Petrol,1000 cc,Automatic,https://www.pakwheels.com/used-cars/smart-smar...
3,Toyota Passo 2014 for sale in Karachi,"PKR 2,685,000",2014.0,"83,000 km",Petrol,1000 cc,Automatic,https://www.pakwheels.com/used-cars/toyota-pas...
4,Toyota Corolla 2016 for sale in Karachi,"PKR 3,425,000",2016.0,"69,000 km",Petrol,1300 cc,Manual,https://www.pakwheels.com/used-cars/toyota-cor...
5,Suzuki Wagon R 2017 for sale in Islamabad,"PKR 2,130,000",2017.0,"83,500 km",Petrol,1000 cc,Manual,https://www.pakwheels.com/used-cars/suzuki-wag...
6,Daihatsu Rocky 2020 for sale in Lahore,"PKR 5,970,000",2020.0,"24,000 km",Petrol,1000 cc,Automatic,https://www.pakwheels.com/used-cars/daihatsu-r...
7,Toyota Raize 2020 for sale in Lahore,"PKR 5,970,000",2020.0,"25,000 km",Petrol,1000 cc,Automatic,https://www.pakwheels.com/used-cars/toyota-rai...
8,MG HS 2020 for sale in Rawalpindi,"PKR 6,200,000",2020.0,"57,581 km",Petrol,1500 cc,Automatic,https://www.pakwheels.com/used-cars/mg-hs-2020...
9,Toyota Passo 2021 for sale in Karachi,"PKR 4,075,000",2021.0,"18,700 km",Petrol,1000 cc,Automatic,https://www.pakwheels.com/used-cars/toyota-pas...


In [10]:
df.shape
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59668 entries, 0 to 59667
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   title            58476 non-null  object 
 1   price            58476 non-null  object 
 2   year             58476 non-null  float64
 3   mileage          59312 non-null  object 
 4   fuel_type        59312 non-null  object 
 5   engine_capacity  59312 non-null  object 
 6   transmission     59312 non-null  object 
 7   link             58476 non-null  object 
dtypes: float64(1), object(7)
memory usage: 3.6+ MB


Unnamed: 0,year
count,58476.0
mean,2014.103
std,9.02643
min,1900.0
25%,2009.0
50%,2017.0
75%,2021.0
max,2026.0


In [11]:
df = df.dropna(subset=['price'])
df = df[df['fuel_type'] != "Electric"]

In [12]:
df.shape

(57703, 8)

In [13]:
df['price'] = (
    df['price']
    .str.replace('PKR', "", regex=False)
    .str.replace(',', "", regex=False)
    .str.strip()
    .astype(float)
)
df['price'] = pd.to_numeric(df['price'], errors='coerce')

df["mileage"] = (
    df["mileage"]
    .str.replace("km", "", regex=False)
    .str.replace(",", "", regex=False)
    .str.strip()
    .astype(float)
                 )
df["mileage"] = pd.to_numeric(df["mileage"], errors="coerce")

df = df[~df['engine_capacity'].astype(str).str.contains("kWh", case=False, na=False)]
df["engine_capacity"] = (
    df["engine_capacity"]
    .str.replace("cc", "", regex=False)
    .str.replace(",", "", regex=False)
    .str.strip()
    .astype(float)
)
df["engine_capacity"] = pd.to_numeric(df["engine_capacity"], errors="coerce")

In [14]:
df.drop("link", axis=1, inplace=True)

In [15]:
df.head()

Unnamed: 0,title,price,year,mileage,fuel_type,engine_capacity,transmission
0,Nissan Clipper 2022 for sale in Karachi,2625000.0,2022.0,101000.0,Petrol,660.0,Automatic
1,KIA Sportage 2023 for sale in Lahore,6295000.0,2023.0,77000.0,Petrol,2000.0,Automatic
2,Smart Smart Fortwo 2007 for sale in Lahore,2700000.0,2007.0,53829.0,Petrol,1000.0,Automatic
3,Toyota Passo 2014 for sale in Karachi,2685000.0,2014.0,83000.0,Petrol,1000.0,Automatic
4,Toyota Corolla 2016 for sale in Karachi,3425000.0,2016.0,69000.0,Petrol,1300.0,Manual


In [16]:
df["title"] = (
    df["title"]
    .str.strip()
)
df["city"] = (
    df["title"]
    .str.extract(r"for sale in\s+(.+)$", expand=False)
)
df["title"] = (
    df["title"]
    .str.replace(r"\sfor sale in\s+.+$", "", regex=True)
    .str.strip()
)

df["title"] = (
    df["title"]
    .str.replace(r"\b(19|20)\d{2}\b", "", regex=True)
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)

In [17]:
df.head(10)

Unnamed: 0,title,price,year,mileage,fuel_type,engine_capacity,transmission,city
0,Nissan Clipper,2625000.0,2022.0,101000.0,Petrol,660.0,Automatic,Karachi
1,KIA Sportage,6295000.0,2023.0,77000.0,Petrol,2000.0,Automatic,Lahore
2,Smart Smart Fortwo,2700000.0,2007.0,53829.0,Petrol,1000.0,Automatic,Lahore
3,Toyota Passo,2685000.0,2014.0,83000.0,Petrol,1000.0,Automatic,Karachi
4,Toyota Corolla,3425000.0,2016.0,69000.0,Petrol,1300.0,Manual,Karachi
5,Suzuki Wagon R,2130000.0,2017.0,83500.0,Petrol,1000.0,Manual,Islamabad
6,Daihatsu Rocky,5970000.0,2020.0,24000.0,Petrol,1000.0,Automatic,Lahore
7,Toyota Raize,5970000.0,2020.0,25000.0,Petrol,1000.0,Automatic,Lahore
8,MG HS,6200000.0,2020.0,57581.0,Petrol,1500.0,Automatic,Rawalpindi
9,Toyota Passo,4075000.0,2021.0,18700.0,Petrol,1000.0,Automatic,Karachi


In [18]:
X = df[[
    "title",
    "year",
    "mileage",
    "engine_capacity",
    "city",
    "transmission",
    "fuel_type",
]]

y = df["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
cat_col = [
    "title",
    "fuel_type",
    "city",
    "transmission"
]
num_col = ["mileage", "engine_capacity", "year"]

preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(), num_col),
        ("num", "passthrough", num_col),
    ]
)

In [27]:
model = XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    tree_method='hist',
    device='cuda:0'

)
pipe = Pipeline(steps=[
    ("prep", preprocess),
    ("model", model)
])
pipe.fit(X_train, y_train)
print("Your model is ready!")

Your model is ready!


In [28]:
name = "Honda Civic Oriel Prosmatec UG"
city ="Punjab"
mileage =float(96000)
engine_capacity=float(1800)
year = float(2016)
fule = "Petrol"
transmission ="Automatic"

car = pd.DataFrame([{
    "title": name,
    "year": year,
    "mileage": mileage,
    "engine_capacity": engine_capacity,
    "city": city,
    "transmission": transmission,
    "fuel_type": fule,
}])
pred_price = pipe.predict(car)[0]
print(pred_price)

4970062.5
