In [1]:
import numpy as np
import pandas as pd

In [2]:
# Location of the cleaned used-car listings, relative to this notebook.
CLEANED_DATA_CSV = '../data/cleaned_data_saudi_used_cars.csv'

# Read the cleaned listings into memory and preview the first rows.
df_cleaned = pd.read_csv(CLEANED_DATA_CSV)
df_cleaned.head()

Unnamed: 0,Type,Region,Make,Gear_Type,Origin,Options,Year,Engine_Size,Mileage,Negotiable,Price,Mileage_per_Year,Unnatural_High_Mileage_Flag
0,Yukon,Riyadh,GMC,Automatic,Saudi,Full,2014,8.0,80000,False,120000,10000.0,False
1,Range Rover,Riyadh,Land Rover,Automatic,Gulf Arabic,Full,2015,5.0,140000,False,260000,20000.0,False
2,Optima,Hafar Al-Batin,Kia,Automatic,Saudi,Semi Full,2015,2.4,220000,False,42000,31428.571429,False
3,CX3,Abha,Mazda,Automatic,Saudi,Semi Full,2019,2.0,25000,False,58000,8333.333333,False
4,Cayenne S,Riyadh,Porsche,Automatic,Saudi,Full,2012,4.8,189000,False,85000,18900.0,False


# Feature Creation

In [3]:
# Work on an independent (deep) copy so the loaded frame `df_cleaned`
# is left untouched by the feature-engineering steps.
df_engineered = df_cleaned.copy(deep=True)

In [4]:
# Reference year used to turn the model year into an age in years.
CURRENT_YEAR = 2022

# Car_Age = reference year minus model year.
# 'Year' is kept in the frame for now because the IsVintage flag still reads
# it; it is dropped later in the feature-selection cell.
df_engineered = df_engineered.assign(
    Car_Age=CURRENT_YEAR - df_engineered['Year']
)
df_engineered.head()

Unnamed: 0,Type,Region,Make,Gear_Type,Origin,Options,Year,Engine_Size,Mileage,Negotiable,Price,Mileage_per_Year,Unnatural_High_Mileage_Flag,Car_Age
0,Yukon,Riyadh,GMC,Automatic,Saudi,Full,2014,8.0,80000,False,120000,10000.0,False,8
1,Range Rover,Riyadh,Land Rover,Automatic,Gulf Arabic,Full,2015,5.0,140000,False,260000,20000.0,False,7
2,Optima,Hafar Al-Batin,Kia,Automatic,Saudi,Semi Full,2015,2.4,220000,False,42000,31428.571429,False,7
3,CX3,Abha,Mazda,Automatic,Saudi,Semi Full,2019,2.0,25000,False,58000,8333.333333,False,3
4,Cayenne S,Riyadh,Porsche,Automatic,Saudi,Full,2012,4.8,189000,False,85000,18900.0,False,10


In [5]:
# Cars with a model year strictly before this cutoff are flagged as vintage.
VINTAGE_CUTOFF_YEAR = 2000

# Binary flag stored as an integer: 1 = vintage, 0 = otherwise.
is_vintage_mask = df_engineered['Year'] < VINTAGE_CUTOFF_YEAR
df_engineered['IsVintage'] = is_vintage_mask.astype(int)
df_engineered.head()

Unnamed: 0,Type,Region,Make,Gear_Type,Origin,Options,Year,Engine_Size,Mileage,Negotiable,Price,Mileage_per_Year,Unnatural_High_Mileage_Flag,Car_Age,IsVintage
0,Yukon,Riyadh,GMC,Automatic,Saudi,Full,2014,8.0,80000,False,120000,10000.0,False,8,0
1,Range Rover,Riyadh,Land Rover,Automatic,Gulf Arabic,Full,2015,5.0,140000,False,260000,20000.0,False,7,0
2,Optima,Hafar Al-Batin,Kia,Automatic,Saudi,Semi Full,2015,2.4,220000,False,42000,31428.571429,False,7,0
3,CX3,Abha,Mazda,Automatic,Saudi,Semi Full,2019,2.0,25000,False,58000,8333.333333,False,3,0
4,Cayenne S,Riyadh,Porsche,Automatic,Saudi,Full,2012,4.8,189000,False,85000,18900.0,False,10,0


In [6]:
# Engines strictly larger than this size are flagged as "big".
BIG_ENGINE_THRESHOLD = 7.0

# Binary flag stored as an integer: 1 = big engine, 0 = otherwise.
is_big_engine_mask = df_engineered['Engine_Size'] > BIG_ENGINE_THRESHOLD
df_engineered['IsBigEngine'] = is_big_engine_mask.astype(int)
df_engineered.head()

Unnamed: 0,Type,Region,Make,Gear_Type,Origin,Options,Year,Engine_Size,Mileage,Negotiable,Price,Mileage_per_Year,Unnatural_High_Mileage_Flag,Car_Age,IsVintage,IsBigEngine
0,Yukon,Riyadh,GMC,Automatic,Saudi,Full,2014,8.0,80000,False,120000,10000.0,False,8,0,1
1,Range Rover,Riyadh,Land Rover,Automatic,Gulf Arabic,Full,2015,5.0,140000,False,260000,20000.0,False,7,0,0
2,Optima,Hafar Al-Batin,Kia,Automatic,Saudi,Semi Full,2015,2.4,220000,False,42000,31428.571429,False,7,0,0
3,CX3,Abha,Mazda,Automatic,Saudi,Semi Full,2019,2.0,25000,False,58000,8333.333333,False,3,0,0
4,Cayenne S,Riyadh,Porsche,Automatic,Saudi,Full,2012,4.8,189000,False,85000,18900.0,False,10,0,0


# Feature Selection

In [7]:
# Columns removed before modelling. 'Year' carries the same information as
# Car_Age (Car_Age = CURRENT_YEAR - Year), so only one of the two is kept.
# NOTE: this cell overwrites df_engineered; running it a second time raises
# KeyError because the listed columns are already gone.
cols_to_drop = [
    'Negotiable',
    'Mileage_per_Year',
    'Unnatural_High_Mileage_Flag',
    'Year',
]
df_engineered = df_engineered.drop(cols_to_drop, axis='columns')
df_engineered.head()

Unnamed: 0,Type,Region,Make,Gear_Type,Origin,Options,Engine_Size,Mileage,Price,Car_Age,IsVintage,IsBigEngine
0,Yukon,Riyadh,GMC,Automatic,Saudi,Full,8.0,80000,120000,8,0,1
1,Range Rover,Riyadh,Land Rover,Automatic,Gulf Arabic,Full,5.0,140000,260000,7,0,0
2,Optima,Hafar Al-Batin,Kia,Automatic,Saudi,Semi Full,2.4,220000,42000,7,0,0
3,CX3,Abha,Mazda,Automatic,Saudi,Semi Full,2.0,25000,58000,3,0,0
4,Cayenne S,Riyadh,Porsche,Automatic,Saudi,Full,4.8,189000,85000,10,0,0


**TODO:** Feature importance analysis (to be added later).

# Feature Transformation

In [8]:
# Summarise the selected columns: dtype, non-null count and memory usage.
df_engineered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3801 entries, 0 to 3800
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Type         3801 non-null   object 
 1   Region       3801 non-null   object 
 2   Make         3801 non-null   object 
 3   Gear_Type    3801 non-null   object 
 4   Origin       3801 non-null   object 
 5   Options      3801 non-null   object 
 6   Engine_Size  3801 non-null   float64
 7   Mileage      3801 non-null   int64  
 8   Price        3801 non-null   int64  
 9   Car_Age      3801 non-null   int64  
 10  IsVintage    3801 non-null   int32  
 11  IsBigEngine  3801 non-null   int32  
dtypes: float64(1), int32(2), int64(3), object(6)
memory usage: 326.8+ KB


In [9]:
# Name of the column to be predicted; it must never appear in a feature list.
TARGET_VAR = 'Price'

# The engineered flags are stored as integers; cast them to bool so they are
# grouped with the categorical features rather than the numeric ones.
for flag_col in ('IsVintage', 'IsBigEngine'):
    df_engineered[flag_col] = df_engineered[flag_col].astype('bool')

# Remove the target up front so neither list can contain it, whatever dtype
# the target happens to have.
feature_frame = df_engineered.drop(columns=[TARGET_VAR])

# Categorical features: strings, booleans and pandas categoricals.
cat_cols = (
    feature_frame
      .select_dtypes(include=['object', 'bool', 'category'])
      .columns
      .tolist()
)

# Numerical features: every numeric dtype, not just int64/float64, so that
# columns stored as e.g. int32 or float32 are not silently left out of both
# lists. Booleans are not matched by 'number', so the flags stay categorical.
num_cols = (
    feature_frame
      .select_dtypes(include='number')
      .columns
      .tolist()
)

print("Categorical variables:", cat_cols)
print("Numerical variables:", num_cols)
print("Target variable:", TARGET_VAR)

Categorical variables: ['Type', 'Region', 'Make', 'Gear_Type', 'Origin', 'Options', 'IsVintage', 'IsBigEngine']
Numerical variables: ['Engine_Size', 'Mileage', 'Car_Age']
Target variable: Price


In [10]:
# Report the number of distinct values in each categorical column.
print("\n>> category cardinalities:")
for col_name in cat_cols:
    n_levels = df_engineered[col_name].nunique()
    print(f"   {col_name:12s}: {n_levels} categories")


>> category cardinalities:
   Type        : 319 categories
   Region      : 27 categories
   Make        : 56 categories
   Gear_Type   : 2 categories
   Origin      : 4 categories
   Options     : 3 categories
   IsVintage   : 2 categories
   IsBigEngine : 2 categories


In [11]:
# Partition the categorical columns by number of distinct values:
# at most LOW_CARD_THRESH -> low-cardinality, otherwise high-cardinality.
LOW_CARD_THRESH = 10

low_card_cat, high_card_cat = [], []
for col_name in cat_cols:
    # Count distinct values once per column and route it to the right list.
    if df_engineered[col_name].nunique() <= LOW_CARD_THRESH:
        low_card_cat.append(col_name)
    else:
        high_card_cat.append(col_name)

In [12]:
# Sanity check: show how the feature columns ended up being grouped.
for label, cols in (
    ("\n>> low-cardinal cats:", low_card_cat),
    (">> high-cardinal cats:", high_card_cat),
    (">> numeric cols:     ", num_cols),
):
    print(label, cols)


>> low-cardinal cats: ['Gear_Type', 'Origin', 'Options', 'IsVintage', 'IsBigEngine']
>> high-cardinal cats: ['Type', 'Region', 'Make']
>> numeric cols:      ['Engine_Size', 'Mileage', 'Car_Age']
