## Car Price Prediction data Pre_processing 


In [39]:
# Importing the libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

In [40]:
# Loading the data
df = pd.read_csv('./DVDataset/vehicles.csv')

In [41]:
df.head()

Unnamed: 0,id,url,region,region_url,price,year,manufacturer,model,condition,cylinders,...,size,type,paint_color,image_url,description,county,state,lat,long,posting_date
0,7222695916,https://prescott.craigslist.org/cto/d/prescott...,prescott,https://prescott.craigslist.org,6000,,,,,,...,,,,,,,az,,,
1,7218891961,https://fayar.craigslist.org/ctd/d/bentonville...,fayetteville,https://fayar.craigslist.org,11900,,,,,,...,,,,,,,ar,,,
2,7221797935,https://keys.craigslist.org/cto/d/summerland-k...,florida keys,https://keys.craigslist.org,21000,,,,,,...,,,,,,,fl,,,
3,7222270760,https://worcester.craigslist.org/cto/d/west-br...,worcester / central MA,https://worcester.craigslist.org,1500,,,,,,...,,,,,,,ma,,,
4,7210384030,https://greensboro.craigslist.org/cto/d/trinit...,greensboro,https://greensboro.craigslist.org,4900,,,,,,...,,,,,,,nc,,,


In [42]:
# checking the data info
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 426880 entries, 0 to 426879
Data columns (total 26 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            426880 non-null  int64  
 1   url           426880 non-null  object 
 2   region        426880 non-null  object 
 3   region_url    426880 non-null  object 
 4   price         426880 non-null  int64  
 5   year          425675 non-null  float64
 6   manufacturer  409234 non-null  object 
 7   model         421603 non-null  object 
 8   condition     252776 non-null  object 
 9   cylinders     249202 non-null  object 
 10  fuel          423867 non-null  object 
 11  odometer      422480 non-null  float64
 12  title_status  418638 non-null  object 
 13  transmission  424324 non-null  object 
 14  VIN           265838 non-null  object 
 15  drive         296313 non-null  object 
 16  size          120519 non-null  object 
 17  type          334022 non-null  object 
 18  pain

### Drop Unnecessary columns

In [43]:
columns_to_drop = ['id','url','region','region_url','size','image_url','description','county','VIN','lat', 'long','posting_date']
df1 = df.drop(columns=[col for col in columns_to_drop if col in df.columns])


### Drop duplicates

In [44]:
df1 = df1.drop_duplicates()

In [45]:
df1.info()


<class 'pandas.core.frame.DataFrame'>
Index: 299026 entries, 0 to 426879
Data columns (total 14 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   price         299026 non-null  int64  
 1   year          298157 non-null  float64
 2   manufacturer  285966 non-null  object 
 3   model         294777 non-null  object 
 4   condition     185456 non-null  object 
 5   cylinders     182867 non-null  object 
 6   fuel          297145 non-null  object 
 7   odometer      296330 non-null  float64
 8   title_status  294322 non-null  object 
 9   transmission  297423 non-null  object 
 10  drive         210757 non-null  object 
 11  type          230151 non-null  object 
 12  paint_color   211460 non-null  object 
 13  state         299026 non-null  object 
dtypes: float64(2), int64(1), object(11)
memory usage: 34.2+ MB


### Handling and Filtering out Outliars using IQR Method

In [46]:
# Calculate Q1 and Q3 for price
Q1 = df1['price'].quantile(0.25)
Q3 = df1['price'].quantile(0.75)
IQR = Q3 - Q1

# Define bounds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter the DataFrame
df1 = df1[(df1['price'] >= lower_bound) & (df1['price'] <= upper_bound)]

In [47]:

# Identifying unique values
df1.nunique()

price            14152
year               113
manufacturer        42
model            28693
condition            6
cylinders            8
fuel                 5
odometer        103381
title_status         6
transmission         3
drive                3
type                13
paint_color         12
state               51
dtype: int64

In [48]:
#checking for null values
df1.isnull().sum()


price                0
year               698
manufacturer     12245
model             3937
condition       109060
cylinders       112664
fuel              1831
odometer          2657
title_status      4578
transmission      1541
drive            86631
type             67113
paint_color      85380
state                0
dtype: int64

In [49]:
#checking for uniques values in cylinders column
df1['cylinders'].unique()


array([nan, '8 cylinders', '6 cylinders', '4 cylinders', '5 cylinders',
       'other', '3 cylinders', '10 cylinders', '12 cylinders'],
      dtype=object)

### Fill na values in Year and Odometer and covert to int datatype

In [50]:
df1['year'] = df1['year'].fillna(df['year'].median()).astype(int)
df1['odometer'] = df1['odometer'].fillna(df['odometer'].median()).astype(int)


In [51]:
# replacing Other, nan, None, nan.0 with NaN
df1['cylinders']= df1['cylinders'].replace(['other', 'nan', 'None', 'nan.0'], np.nan)

In [52]:


df1['cylinders'].unique()

array([nan, '8 cylinders', '6 cylinders', '4 cylinders', '5 cylinders',
       '3 cylinders', '10 cylinders', '12 cylinders'], dtype=object)

In [53]:
cols1= ['model','condition','drive','type','paint_color','cylinders','state','manufacturer','fuel','title_status','transmission']


In [54]:
# Replacing the null values with the mode of the column
for col in cols1:
    df1[col]=df1[col].fillna(df1[col].mode()[0])


In [55]:

# extrct numbers from the string
df1['cylinders'] = df1['cylinders'].astype(str).str.extract(r'(\d+)')
# convert to numeric
df1['cylinders'] = pd.to_numeric(df1['cylinders']).astype(int)


In [56]:
df1


Unnamed: 0,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,type,paint_color,state
0,6000,2013,ford,f-150,good,6,gas,85548,clean,automatic,4wd,sedan,white,az
1,11900,2013,ford,f-150,good,6,gas,85548,clean,automatic,4wd,sedan,white,ar
2,21000,2013,ford,f-150,good,6,gas,85548,clean,automatic,4wd,sedan,white,fl
3,1500,2013,ford,f-150,good,6,gas,85548,clean,automatic,4wd,sedan,white,ma
4,4900,2013,ford,f-150,good,6,gas,85548,clean,automatic,4wd,sedan,white,nc
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
426875,23590,2019,nissan,maxima s sedan 4d,good,6,gas,32226,clean,other,fwd,sedan,white,wy
426876,30590,2020,volvo,s60 t5 momentum sedan 4d,good,6,gas,12029,clean,other,fwd,sedan,red,wy
426877,34990,2020,cadillac,xt4 sport suv 4d,good,6,diesel,4174,clean,other,4wd,hatchback,white,wy
426878,28990,2018,lexus,es 350 sedan 4d,good,6,gas,30112,clean,other,fwd,sedan,silver,wy


In [57]:
cols3 = ['model','condition','drive','type','paint_color','state','manufacturer','fuel','title_status','transmission']

### Encoding columns using LabelEncoder

In [58]:
encoders = {}

for col in cols3:
    le = LabelEncoder()
    df1[col] = le.fit_transform(df1[col])
    encoders[col] = le 

In [59]:
df1

Unnamed: 0,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,type,paint_color,state
0,6000,2013,13,13789,2,6,2,85548,0,0,0,9,10,3
1,11900,2013,13,13789,2,6,2,85548,0,0,0,9,10,2
2,21000,2013,13,13789,2,6,2,85548,0,0,0,9,10,9
3,1500,2013,13,13789,2,6,2,85548,0,0,0,9,10,19
4,4900,2013,13,13789,2,6,2,85548,0,0,0,9,10,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
426875,23590,2019,31,19087,2,6,2,32226,0,2,1,9,10,50
426876,30590,2020,41,22873,2,6,2,12029,0,2,1,9,8,50
426877,34990,2020,6,28377,2,6,0,4174,0,2,0,4,10,50
426878,28990,2018,23,13070,2,6,2,30112,0,2,1,9,9,50


### Seperate the target column from the Dataframe

In [60]:
x= df1.drop('price',axis=1)
y= df1['price']

### Split the data into training and testing set

In [61]:


x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

### Building Model using DecisionTree m

In [62]:
model_DT = DecisionTreeRegressor(max_depth=3,random_state=0)

In [63]:
model_DT.fit(x_train,y_train)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,3
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,0
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [75]:
y_pred_dt = model_DT.predict(x_test)

In [76]:
y_pred_dt

array([ 8329.12292785, 18229.96340304, 19495.00666338, ...,
        8329.12292785, 12389.80004199,  8329.12292785], shape=(58337,))

In [77]:
r2_score(y_test,y_pred_dt)

0.3400030319719979

### BUilding model using RandomForest

In [33]:
model_r =RandomForestRegressor(n_estimators=200,random_state=42)

In [34]:
model_r.fit(x_train,y_train)

0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [35]:
y_pred_R = model_r.predict(x_test)

In [36]:
r2_score(y_test,y_pred_R)

0.7043062098266656

In [67]:
model_xgb = XGBRegressor()

In [68]:
model_xgb.fit(x_train,y_train)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [69]:
y_pred_xgb = model_xgb.predict(x_test)

In [71]:
r2_score(y_test,y_pred_xgb)

0.650967001914978