In [1]:
import numpy as np
import pandas as pd

In [4]:
# Load the cleaned used-car listings produced upstream; the path is relative
# to the notebook's working directory.
cleaned_data = pd.read_csv('dataset/used_cars_cleaned.csv')

In [5]:
# Preview the first rows to sanity-check that the file loaded as expected.
cleaned_data.head()

Unnamed: 0,region,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,...,drive,size,type,paint_color,description,state,lat,long,is_condition_imputed,is_title_status_imputed
0,auburn,33590,2014,gmc,sierra 1500 crew cab slt,good,8 cylinders,gas,57923.0,clean,...,4wd,unspecified,pickup,white,Carvana is the safer way to buy a car During t...,al,32.59,-85.48,False,False
1,auburn,22590,2010,chevrolet,silverado 1500,good,8 cylinders,gas,71229.0,clean,...,4wd,full-size,pickup,blue,Carvana is the safer way to buy a car During t...,al,32.59,-85.48,False,False
2,auburn,39590,2020,chevrolet,silverado 1500 crew,good,8 cylinders,gas,19160.0,clean,...,4wd,unspecified,pickup,red,Carvana is the safer way to buy a car During t...,al,32.59,-85.48,False,False
3,auburn,30990,2017,toyota,tundra double cab sr,good,8 cylinders,gas,41124.0,clean,...,4wd,full-size,pickup,red,Carvana is the safer way to buy a car During t...,al,32.59,-85.48,False,False
4,auburn,15000,2013,ford,f-150 xlt,excellent,6 cylinders,gas,128000.0,clean,...,rwd,full-size,truck,black,2013 F-150 XLT V6 4 Door. Good condition. Leve...,al,32.592,-85.5189,False,False


In [6]:
# Summarise column dtypes and non-null counts before any feature engineering.
cleaned_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246680 entries, 0 to 246679
Data columns (total 21 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   region                   246680 non-null  object 
 1   price                    246680 non-null  int64  
 2   year                     246680 non-null  int64  
 3   manufacturer             246680 non-null  object 
 4   model                    246680 non-null  object 
 5   condition                246680 non-null  object 
 6   cylinders                246680 non-null  object 
 7   fuel                     246680 non-null  object 
 8   odometer                 246680 non-null  float64
 9   title_status             246680 non-null  object 
 10  transmission             246680 non-null  object 
 11  drive                    246680 non-null  object 
 12  size                     246680 non-null  object 
 13  type                     246680 non-null  object 
 14  pain

# Feature Engineering

Let's first drop the features we're not going to use for modelling: the free-text `description` column and the `lat`/`long` coordinates.

In [7]:
# Columns excluded from modelling: free-text description and raw coordinates.
drop_features = ['description', 'lat', 'long']

In [8]:
# Work on a new frame so `cleaned_data` stays untouched for later reference.
modelling_data = cleaned_data.drop(labels=drop_features, axis='columns')

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [10]:
# Split column names by dtype: 64-bit numerics vs. object (string) columns.
# Boolean flag columns fall into neither group.
num_cols = modelling_data.select_dtypes(['int64', 'float64']).columns
cat_cols = modelling_data.select_dtypes('object').columns

In [11]:
# Count distinct values once per categorical column (the previous version
# called `nunique()` twice for every column), then split at 20 distinct values:
# fewer than 20 is "low" cardinality, 20 or more is "high".
cat_cardinality = modelling_data[cat_cols].nunique()
low_cardinal_cat_cols = cat_cardinality[cat_cardinality < 20].index.tolist()
high_cardinal_cat_cols = cat_cardinality[cat_cardinality >= 20].index.tolist()

In [12]:
k = 10  # Number of most frequent categories to keep per high-cardinality column

# Collapse every category outside the k most frequent into a single 'other'
# bucket. Frequencies are taken from the untouched `cleaned_data`, so re-running
# this cell gives the same result even though `modelling_data` is overwritten.
for col in high_cardinal_cat_cols:
    top_categories = cleaned_data[col].value_counts().head(k).index
    # Vectorised membership test; keeps the value where it is a top category.
    is_top_category = modelling_data[col].isin(top_categories)
    modelling_data[col] = modelling_data[col].where(is_top_category, 'other')

In [13]:
# Categorical columns with a natural order get integer ranks; every other
# categorical column is treated as nominal.
ordinal_cats = ['condition', 'title_status']
nominal_cats = [name for name in cat_cols if not (name in ordinal_cats)]

## Encoding

In [14]:
# Ordinal rank for each vehicle condition, listed best to worst: 5 down to 0.
condition_mapping = dict(zip(
    ['new', 'like new', 'excellent', 'good', 'fair', 'salvage'],
    range(5, -1, -1),
))

# Ordinal rank for each title status, listed in descending rank: 3 down to -3.
title_status_mapping = dict(zip(
    ['clean', 'rebuilt', 'salvage', 'lien', 'missing', 'parts only', 'unspecified'],
    range(3, -4, -1),
))

In [15]:
# Encode on a copy so `modelling_data` keeps the original category labels.
encoded_data = modelling_data.copy()

In [16]:
# Lookup table from ordinal column name to its label -> rank mapping.
ordinal_mappings = {
    'condition': condition_mapping,
    'title_status': title_status_mapping,
}

# Replace each ordinal column's labels with integer ranks. The labels are read
# from `modelling_data` rather than `encoded_data`, so re-running this cell does
# not map already-encoded integers (which would turn the column into NaN).
for col in ordinal_cats:
    if col in ordinal_mappings:
        encoded_data[col] = modelling_data[col].map(ordinal_mappings[col])

In [17]:
# One-hot encode the nominal columns into a dense array; categories unseen
# during fit are encoded as all zeros at transform time.
one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_nominal_data = pd.DataFrame(
    one_hot_encoder.fit_transform(encoded_data[nominal_cats]),
    columns=one_hot_encoder.get_feature_names_out(nominal_cats),
    # Reuse the source index: `pd.concat(axis=1)` aligns on index labels, so a
    # default RangeIndex here would misalign rows if `encoded_data` were ever
    # filtered or re-indexed upstream.
    index=encoded_data.index,
)
# Swap the raw nominal columns for their one-hot expansion. This drops
# `nominal_cats` from `encoded_data`, so re-create `encoded_data` before
# re-running this cell.
encoded_data = pd.concat([encoded_data.drop(columns=nominal_cats), encoded_nominal_data], axis=1)

In [18]:
# Inspect the fully encoded frame (pandas truncates the rendered rows/columns).
encoded_data

Unnamed: 0,price,year,condition,odometer,title_status,is_condition_imputed,is_title_status_imputed,region_anchorage / mat-su,region_colorado springs,region_ft myers / SW florida,...,state_fl,state_mi,state_nc,state_ny,state_oh,state_or,state_other,state_tn,state_tx,state_wa
0,33590,2014,2,57923.0,3,False,False,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,22590,2010,2,71229.0,3,False,False,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,39590,2020,2,19160.0,3,False,False,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,30990,2017,2,41124.0,3,False,False,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,15000,2013,3,128000.0,3,False,False,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246675,23590,2019,2,32226.0,3,False,False,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
246676,30590,2020,2,12029.0,3,False,False,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
246677,34990,2020,2,4174.0,3,False,False,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
246678,28990,2018,2,30112.0,3,False,False,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [20]:
# (rows, columns) after ordinal and one-hot encoding.
encoded_data.shape

(246680, 109)

## Train-Test Split