In [1]:
import pandas as pd

In [2]:
# Read the cleaned used-car listings into a DataFrame (path is relative to the notebook).
cleaned_csv_path = '../data/raw/used_cars_cleaned.csv'
car_data = pd.read_csv(cleaned_csv_path)

In [3]:
# Preview the first five rows to sanity-check the load.
car_data.head(n=5)

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,ford,utility police interceptor base,2013,51000.0,e85 flex fuel,300.0hp 3.7l v6 cylinder engine flex fuel capa...,6-speed a/t,black,black,at least 1 accident or damage reported,yes,10300.0
1,hyundai,palisade sel,2021,34742.0,gasoline,3.8l v6 24v gdi dohc,8-speed automatic,moonlight cloud,gray,at least 1 accident or damage reported,yes,38005.0
2,lexus,rx 350 rx 350,2022,22372.0,gasoline,3.5 liter dohc,automatic,blue,black,none reported,yes,54598.0
3,infiniti,q50 hybrid sport,2015,88900.0,hybrid,354.0hp 3.5l v6 cylinder engine gas/electric h...,7-speed a/t,black,black,none reported,yes,15500.0
4,audi,q3 45 s line premium plus,2021,9835.0,gasoline,2.0l i4 16v gdi dohc turbo,8-speed automatic,glacier white metallic,black,none reported,yes,34999.0


In [4]:
# Show the column labels of the loaded frame before any features are added.
car_data.columns

Index(['brand', 'model', 'model_year', 'milage', 'fuel_type', 'engine',
       'transmission', 'ext_col', 'int_col', 'accident', 'clean_title',
       'price'],
      dtype='object')

In [5]:
# Age in years relative to a fixed reference year (hard-coded, not the current date).
REFERENCE_YEAR = 2025
car_data['car_age'] = REFERENCE_YEAR - car_data['model_year']

# Display the first rows to confirm the new column.
car_data.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price,car_age
0,ford,utility police interceptor base,2013,51000.0,e85 flex fuel,300.0hp 3.7l v6 cylinder engine flex fuel capa...,6-speed a/t,black,black,at least 1 accident or damage reported,yes,10300.0,12
1,hyundai,palisade sel,2021,34742.0,gasoline,3.8l v6 24v gdi dohc,8-speed automatic,moonlight cloud,gray,at least 1 accident or damage reported,yes,38005.0,4
2,lexus,rx 350 rx 350,2022,22372.0,gasoline,3.5 liter dohc,automatic,blue,black,none reported,yes,54598.0,3
3,infiniti,q50 hybrid sport,2015,88900.0,hybrid,354.0hp 3.5l v6 cylinder engine gas/electric h...,7-speed a/t,black,black,none reported,yes,15500.0,10
4,audi,q3 45 s line premium plus,2021,9835.0,gasoline,2.0l i4 16v gdi dohc turbo,8-speed automatic,glacier white metallic,black,none reported,yes,34999.0,4


In [6]:
# Average mileage accumulated per year of age.
# The divisor is floored at 1 year: an age of 0 would divide by zero, and a
# model year later than the reference year would give a negative age and hence
# a negative mileage-per-year. Ages of 1 or more are left unchanged.
years_on_road = car_data['car_age'].clip(lower=1)
car_data['milage_per_year'] = car_data['milage'] / years_on_road


In [7]:
# Spot-check ten random rows; the fixed seed keeps the displayed rows the same
# on every Restart & Run All.
car_data.sample(10, random_state=42)

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price,car_age,milage_per_year
2114,mercedes-benz,a-class a 220 4matic,2019,30300.0,gasoline,188.0hp 2.0l 4 cylinder engine gasoline fuel,7-speed a/t,silver,black,none reported,yes,30500.0,6,5050.0
2592,bmw,z4 sdrive28i,2014,15889.0,gasoline,240.0hp 2.0l 4 cylinder engine gasoline fuel,8-speed a/t,blue,beige,none reported,yes,34500.0,11,1444.454545
3023,acura,ilx 2.0l w/premium package,2013,198000.0,gasoline,150.0hp 2.0l 4 cylinder engine gasoline fuel,transmission w/dual shift mode,white,black,at least 1 accident or damage reported,yes,11000.0,12,16500.0
2685,ram,1500 laramie,2022,12940.0,gasoline,5.7l v8 16v mpfi ohv,8-speed automatic,bright white clearcoat,black,at least 1 accident or damage reported,yes,46999.0,3,4313.333333
1034,toyota,camry se,2011,177702.0,gasoline,179.0hp 2.5l 4 cylinder engine gasoline fuel,a/t,gray,gray,at least 1 accident or damage reported,yes,7700.0,14,12693.0
659,toyota,supra a91-mt edition,2023,1500.0,gasoline,382.0hp 3.0l straight 6 cylinder engine gasoli...,6-speed m/t,gray,orange,none reported,yes,84900.0,2,750.0
2221,bmw,435 i,2014,126500.0,gasoline,300.0hp 3.0l straight 6 cylinder engine gasoli...,a/t,white,black,none reported,yes,15000.0,11,11500.0
1887,ford,expedition limited,2023,9281.0,gasoline,440.0hp 3.5l v6 cylinder engine gasoline fuel,a/t,black,black,none reported,yes,85000.0,2,4640.5
3536,hyundai,accent gls,2014,127941.0,gasoline,138.0hp 1.6l 4 cylinder engine gasoline fuel,a/t,silver,gray,none reported,yes,7500.0,11,11631.0
1421,mitsubishi,mirage es,2017,35698.0,gasoline,78.0hp 1.2l 3 cylinder engine gasoline fuel,a/t,blue,black,none reported,yes,8750.0,8,4462.25


## Statistical tests to explore the relationships between features

### Starting with mileage and price

In [8]:
from scipy.stats import chi2_contingency

# Discretise mileage and price into quartiles (4 quantile-based bins each) so
# they can be cross-tabulated as categories.
milage_labels = ['Low', 'Medium', 'High', 'Very High']
price_labels = ['Cheap', 'Moderate', 'Expensive', 'Luxury']

car_data['milage_bin'] = pd.qcut(car_data['milage'], 4, labels=milage_labels)
car_data['price_bin'] = pd.qcut(car_data['price'], 4, labels=price_labels)

In [9]:
# Count listings in every (mileage bin, price bin) combination:
# rows are mileage bins, columns are price bins.
contingency_table = pd.crosstab(
    index=car_data['milage_bin'],
    columns=car_data['price_bin'],
)
print(contingency_table)


price_bin   Cheap  Moderate  Expensive  Luxury
milage_bin                                    
Low             7        85        279     562
Medium         49       260        344     279
High          246       372        234      80
Very High     631       237         55      10


In [10]:
# Chi-square test of independence on the current contingency table.
# `expected` (the expected frequencies) is unpacked but not used below.
chi2, p, dof, expected = chi2_contingency(contingency_table)

for label, value in (
    ("Chi2 Statistic:", chi2),
    ("Degrees of Freedom:", dof),
    ("P-Value:", p),
):
    print(label, value)

# Interpretation: compare the p-value against the 0.05 significance level.
verdict = (
    "🔍 Mileage and Price are likely dependent (p < 0.05)."
    if p < 0.05
    else "✅ No strong evidence of dependency between Mileage and Price (p ≥ 0.05)."
)
print(verdict)

Chi2 Statistic: 2208.894643797451
Degrees of Freedom: 9
P-Value: 0.0
🔍 Mileage and Price are likely dependent (p < 0.05).


### Repeating the same test for car age and price

In [11]:
# Discretise car age into quartiles (4 quantile-based bins), labelled from
# youngest to oldest.
age_labels = ['New', 'Like New', 'Old', 'Very Old']
car_data['age_bin'] = pd.qcut(car_data['car_age'], 4, labels=age_labels)

In [12]:
# Count listings in every (age bin, price bin) combination.
# NOTE: this rebinds `contingency_table`, replacing any table held under that name.
contingency_table = pd.crosstab(
    index=car_data['age_bin'],
    columns=car_data['price_bin'],
)
print(contingency_table)

price_bin  Cheap  Moderate  Expensive  Luxury
age_bin                                      
New            1       129        395     569
Like New      38       259        312     211
Old          306       389        144      99
Very Old     588       177         61      52


In [13]:
# Chi-square test of independence on whatever `contingency_table` currently holds.
# `expected` (the expected frequencies) is unpacked but not used below.
chi2, p, dof, expected = chi2_contingency(contingency_table)

summary = {
    "Chi2 Statistic:": chi2,
    "Degrees of Freedom:": dof,
    "P-Value:": p,
}
for caption, figure in summary.items():
    print(caption, figure)

# Interpretation: pick the message according to the 0.05 significance level.
messages = {
    True: "🔍 Car Age and Price are likely dependent (p < 0.05).",
    False: "✅ No strong evidence of dependency between Car Age and Price (p ≥ 0.05).",
}
print(messages[bool(p < 0.05)])

Chi2 Statistic: 2033.9867846479501
Degrees of Freedom: 9
P-Value: 0.0
🔍 Car Age and Price are likely dependent (p < 0.05).


### Extracting the numeric engine size (in liters)

In [14]:
import re

# Capture the first number (integer or decimal) that is followed, after
# optional whitespace, by the letter "l" -- e.g. "3.7l" or "3.5 liter".
# Rows with no such match get NaN.
liters_pattern = r'(\d+(?:\.\d+)?)\s*[lL]'
engine_liters = car_data['engine'].str.extract(liters_pattern, flags=re.IGNORECASE)
car_data['engine_size'] = engine_liters.astype(float)

In [15]:
# Spot-check ten random rows, including the new engine_size column; the fixed
# seed keeps the displayed rows the same on every Restart & Run All.
car_data.sample(10, random_state=42)

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price,car_age,milage_per_year,milage_bin,price_bin,age_bin,engine_size
1969,ford,f-250 xl,2014,35900.0,e85 flex fuel,385.0hp 6.2l 8 cylinder engine flex fuel capab...,transmission w/dual shift mode,blue,gray,at least 1 accident or damage reported,yes,27900.0,11,3263.636364,Medium,Moderate,Old,6.2
1739,audi,q8 55 premium plus,2021,30000.0,hybrid,335.0hp 3.0l v6 cylinder engine gasoline/mild ...,8-speed a/t,black,black,none reported,yes,62000.0,4,7500.0,Medium,Luxury,New,3.0
3419,chevrolet,silverado 1500 base,1999,88750.0,gasoline,255.0hp 4.8l 8 cylinder engine gasoline fuel,a/t,white,gray,none reported,yes,6800.0,26,3413.461538,High,Cheap,Very Old,4.8
3505,bmw,328 i xdrive,2013,52000.0,gasoline,240.0hp 2.0l 4 cylinder engine gasoline fuel,transmission w/dual shift mode,white,beige,none reported,yes,17500.0,12,4333.333333,Medium,Moderate,Old,2.0
1000,porsche,boxster rs 60 spyder,2008,58000.0,gasoline,303.0hp 3.4l flat 6 cylinder engine gasoline fuel,6-speed m/t,silver,black,none reported,yes,41500.0,17,3411.764706,High,Expensive,Very Old,3.4
588,bmw,m4 competition xdrive,2023,1120.0,gasoline,503.0hp 3.0l straight 6 cylinder engine gasoli...,8-speed a/t,gray,white,none reported,yes,92900.0,2,560.0,Low,Luxury,New,3.0
223,cadillac,dts luxury ii,2008,94155.0,gasoline,275.0hp 4.6l 8 cylinder engine gasoline fuel,4-speed a/t,white,beige,none reported,yes,10500.0,17,5538.529412,High,Cheap,Very Old,4.6
1909,ford,mustang gt,2014,38560.0,gasoline,420.0hp 5.0l 8 cylinder engine gasoline fuel,6-speed m/t,black,black,at least 1 accident or damage reported,yes,28500.0,11,3505.454545,Medium,Moderate,Old,5.0
3073,porsche,911 gt3,2022,1750.0,gasoline,502.0hp 4.0l flat 6 cylinder engine gasoline fuel,7-speed a/t,white,black,none reported,yes,275000.0,3,583.333333,Low,Luxury,New,4.0
64,cadillac,ct5 premium luxury,2023,6090.0,gasoline,237.0hp 2.0l 4 cylinder engine gasoline fuel,a/t,white,beige,none reported,yes,46900.0,2,3045.0,Low,Expensive,New,2.0


In [16]:
# Write the frame, with all columns added above (including the helper bin
# columns), to CSV without the index.
# NOTE(review): the output lands under data/raw -- confirm that is the intended location.
engineered_csv_path = '../data/raw/used_cars_engineered.csv'
car_data.to_csv(engineered_csv_path, index=False)