In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer

# Read in data

In [2]:
# Load the raw vehicle listings into `df` and preview the first five rows.
raw_csv_path = 'data/Australian Vehicle Prices.csv'
df = pd.read_csv(raw_csv_path)

df.head(5)

Unnamed: 0,Brand,Year,Model,Car/Suv,Title,UsedOrNew,Transmission,Engine,DriveType,FuelType,FuelConsumption,Kilometres,ColourExtInt,Location,CylindersinEngine,BodyType,Doors,Seats,Price
0,Ssangyong,2022.0,Rexton,Sutherland Isuzu Ute,2022 Ssangyong Rexton Ultimate (awd),DEMO,Automatic,"4 cyl, 2.2 L",AWD,Diesel,8.7 L / 100 km,5595,White / Black,"Caringbah, NSW",4 cyl,SUV,4 Doors,7 Seats,51990
1,MG,2022.0,MG3,Hatchback,2022 MG MG3 Auto Excite (with Navigation),USED,Automatic,"4 cyl, 1.5 L",Front,Premium,6.7 L / 100 km,16,Black / Black,"Brookvale, NSW",4 cyl,Hatchback,5 Doors,5 Seats,19990
2,BMW,2022.0,430I,Coupe,2022 BMW 430I M Sport,USED,Automatic,"4 cyl, 2 L",Rear,Premium,6.6 L / 100 km,8472,Grey / White,"Sylvania, NSW",4 cyl,Coupe,2 Doors,4 Seats,108988
3,Mercedes-Benz,2011.0,E500,Coupe,2011 Mercedes-Benz E500 Elegance,USED,Automatic,"8 cyl, 5.5 L",Rear,Premium,11 L / 100 km,136517,White / Brown,"Mount Druitt, NSW",8 cyl,Coupe,2 Doors,4 Seats,32990
4,Renault,2022.0,Arkana,SUV,2022 Renault Arkana Intens,USED,Automatic,"4 cyl, 1.3 L",Front,Unleaded,6 L / 100 km,1035,Grey / Black,"Castle Hill, NSW",4 cyl,SUV,4 Doors,5 Seats,34990


In [3]:
# Preview the last five rows of the raw data as well.
df.tail(5)

Unnamed: 0,Brand,Year,Model,Car/Suv,Title,UsedOrNew,Transmission,Engine,DriveType,FuelType,FuelConsumption,Kilometres,ColourExtInt,Location,CylindersinEngine,BodyType,Doors,Seats,Price
16729,Toyota,2014.0,Alphard,SYC Cars,2014 Toyota Alphard,USED,Automatic,-,Front,Unleaded,-,66000,Grey / -,"Rocklea, QLD",-,Commercial,7 Seats,,29500
16730,Mercedes-Benz,2012.0,S-Class,USED Dealer ad,2012 Mercedes-Benz S-Class CDI BlueTEC,USED,Automatic,-,Rear,Diesel,-,117432,Silver / -,"Port Melbourne, VIC",-,,,,39888
16731,Suzuki,2022.0,Vitara,Bremer Suzuki - New,2022 Suzuki Vitara 1.6L,DEMO,Automatic,"4 cyl, 1.6 L",Front,Unleaded,6.2 L / 100 km,5165,Yellow / -,"Dinmore, QLD",4 cyl,SUV,4 Doors,5 Seats,35280
16732,Mercedes-Benz,2016.0,GLC250,SUV,2016 Mercedes-Benz GLC250,USED,Automatic,"4 cyl, 2 L",AWD,Premium,7.2 L / 100 km,85525,Grey / Black,"Albion, QLD",4 cyl,SUV,4 Doors,5 Seats,41888
16733,Mercedes-Benz,2021.0,C200,Sedan,2021 Mercedes-Benz C200,USED,Automatic,"4 cyl, 2 L",Rear,Unleaded,7 L / 100 km,31852,Gold / -,"Yatala, QLD",4 cyl,Sedan,4 Doors,5 Seats,65888


In [4]:
# Print the per-column non-null counts and dtypes of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16734 entries, 0 to 16733
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Brand              16733 non-null  object 
 1   Year               16733 non-null  float64
 2   Model              16733 non-null  object 
 3   Car/Suv            16706 non-null  object 
 4   Title              16733 non-null  object 
 5   UsedOrNew          16733 non-null  object 
 6   Transmission       16733 non-null  object 
 7   Engine             16733 non-null  object 
 8   DriveType          16733 non-null  object 
 9   FuelType           16733 non-null  object 
 10  FuelConsumption    16733 non-null  object 
 11  Kilometres         16733 non-null  object 
 12  ColourExtInt       16733 non-null  object 
 13  Location           16284 non-null  object 
 14  CylindersinEngine  16733 non-null  object 
 15  BodyType           16452 non-null  object 
 16  Doors              151

# Preprocess the data

In [5]:
# Split the `Engine` text on ',' into two new text columns:
# the part before the comma -> 'Engine_cylinder_number', the part after -> 'Engine_total_volume'.
engine_parts = df['Engine'].str.split(',', expand=True)
df[['Engine_cylinder_number', 'Engine_total_volume']] = engine_parts

In [6]:
# Keep only the exterior colour: the piece of `ColourExtInt` that precedes the first '/'.
colour_parts = df['ColourExtInt'].str.split('/')
df['ExteriorColour'] = colour_parts.str.get(0)

In [7]:
# Split `FuelConsumption` on '/' into two text columns:
# the part before the slash -> `fuel_comsumption_liter`, the part after -> `fuel_comsumption_km`.
fuel_parts = df['FuelConsumption'].str.split('/', expand=True)
df[['fuel_comsumption_liter', 'fuel_comsumption_km']] = fuel_parts

In [8]:
# Clean values for the `Kilometres` column and convert it to float.
# Remove the '-', '/' and ' ' characters (matched literally) so only the number text remains.
kilometres_text = (
    df['Kilometres']
    .str.replace('-', '', regex=False)
    .str.replace('/', '', regex=False)
    .str.replace(' ', '', regex=False)
)
# Entries that are now empty become NaN. The result is assigned back to the column rather
# than calling `.replace(..., inplace=True)` on `df['Kilometres']`: an in-place call on a
# column selection does not update `df` when pandas Copy-on-Write is active, which would
# leave '' in the column and make the float conversion fail.
df['Kilometres'] = kilometres_text.replace('', np.nan).astype(float)

In [9]:
# Clean values for the `Price` column and convert it to float.
# 'POA' entries carry no numeric price, so they become NaN. Assign the result back to the
# column: calling `.replace(..., inplace=True)` on `df['Price']` does not update `df` when
# pandas Copy-on-Write is active, and 'POA' would then break the float conversion.
df['Price'] = df['Price'].replace('POA', np.nan)
# Kept from the original cell: turn any empty string anywhere in the frame into NaN.
df = df.replace('', np.nan)
df['Price'] = df['Price'].astype(float)

In [10]:
# Clean the `Transmission` column: the '-' placeholder becomes NaN.
# Assign the result back to the column: calling `.replace(..., inplace=True)` on
# `df['Transmission']` does not update `df` when pandas Copy-on-Write is active, so the
# placeholder would silently stay in the data.
df['Transmission'] = df['Transmission'].replace('-', np.nan)
# Kept from the original cell: turn any empty string anywhere in the frame into NaN.
df = df.replace('', np.nan)

In [11]:
# Normalise `Year`: keep the first four characters of its text form, then parse back to a number.
year_prefix = df['Year'].astype(str).str.slice(0, 4)
# Missing years stringify to 'nan'; the frame-wide replace turns every 'nan' string into a real NaN.
df = df.assign(Year=year_prefix).replace('nan', np.nan)
# Anything that still cannot be parsed as a number is coerced to NaN.
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')

In [12]:
# Turn the text columns `Engine_cylinder_number` and `Engine_total_volume` into floats.
# Tokens stripped from the cylinder text, applied in this order: the 'cyl' unit, the
# volume-only entries '0 L' and '2 L', and the '-' placeholder.
cylinder_text = df['Engine_cylinder_number']
for token in ('cyl', '0 L', '2 L', '-'):
    cylinder_text = cylinder_text.str.replace(token, '')
df['Engine_cylinder_number'] = cylinder_text

# Drop the 'L' unit from the volume text.
df['Engine_total_volume'] = df['Engine_total_volume'].str.replace('L', '')

# Empty strings (anywhere in the frame) become NaN so the float conversion succeeds.
df = df.replace('', np.nan)
df = df.astype({'Engine_cylinder_number': float, 'Engine_total_volume': float})

In [13]:
# Clean values for the `Doors` column and convert it to float.
# Ordered substring replacements: first strip the 'Doors' unit, then map the 'N Seats'
# labels that occur in this column to a door count, and finally drop any leftover 'Seats'.
# NOTE(review): the seat-label -> door-count mapping is taken as-is from the original cell;
# it looks like a manual estimate — confirm the intended values.
door_replacements = [
    ('Doors', ''),
    ('12 Seats', '6'),
    ('3 Seats', '2'),
    ('6 Seats', '4'),
    ('9 Seats', '6'),
    ('8 Seats', '4'),
    ('2 Seats', '2'),
    ('4 Seats', '4'),
    ('5 Seats', '3'),
    ('7 Seats', '4'),
    ('Seats', ''),
]
doors_text = df['Doors']
for old_text, new_text in door_replacements:
    doors_text = doors_text.str.replace(old_text, new_text)
df['Doors'] = doors_text.astype(float)

In [14]:
# Clean values for the `Seats` column: strip the 'Seats' unit and convert the remainder to float.
seats_text = df['Seats'].str.replace('Seats', '')
df['Seats'] = seats_text.astype(float)

In [15]:
# Clean values for the `BodyType` column: remove every '/' character from the labels.
body_type_text = df['BodyType'].str.replace('/', '')
df = df.assign(BodyType=body_type_text)

In [16]:
# Clean values for the `ExteriorColour` column.
# Remove the '-' character (matched literally), then trim surrounding whitespace:
# `ExteriorColour` is the text before the '/' in `ColourExtInt`, so a value such as
# 'White / Black' yields 'White ' with a trailing space.
exterior_colour = df['ExteriorColour'].str.replace('-', '', regex=False).str.strip()
# A colour that is empty after cleaning means "unknown"; store it as NaN so it is treated
# as a missing value instead of becoming a blank category of its own.
df['ExteriorColour'] = exterior_colour.replace('', np.nan)

In [17]:
# List the distinct values found in `fuel_comsumption_km`.
pd.unique(df['fuel_comsumption_km'])

array([' 100 km', None, nan], dtype=object)

In [18]:
# Turn the text columns `fuel_comsumption_liter` and `fuel_comsumption_km` into floats.
# Litres: drop the 'L' unit and the '-' placeholder. Distance: drop the 'km' unit.
litres_text = df['fuel_comsumption_liter'].str.replace('L', '').str.replace('-', '')
distance_text = df['fuel_comsumption_km'].str.replace('km', '')
df['fuel_comsumption_liter'] = litres_text
df['fuel_comsumption_km'] = distance_text

# Empty strings (anywhere in the frame) become NaN so the float conversion succeeds.
df = df.replace('', np.nan)
df = df.astype({'fuel_comsumption_liter': float, 'fuel_comsumption_km': float})

In [19]:
# List the distinct values found in `FuelType`.
pd.unique(df['FuelType'])

array(['Diesel', 'Premium', 'Unleaded', 'Hybrid', '-', 'Other',
       'Electric', 'LPG', 'Leaded', nan], dtype=object)

In [20]:
# Clean values for the 'FuelType' column: every '-' in a label is replaced with 'Other'.
fuel_type_text = df['FuelType'].str.replace('-', 'Other')
df = df.assign(FuelType=fuel_type_text)

# Drop columns

In [21]:
# Columns to remove from `df`; each entry notes why it is dropped.
# Every column is listed exactly once.
drop_columns = [
    'Car/Suv',            # Not useful
    'Title',              # Not useful
    'Model',              # Not useful
    'Location',           # Not useful
    'Engine',             # Separated into 'Engine_cylinder_number' and 'Engine_total_volume'
    'FuelConsumption',    # Split into `fuel_comsumption_liter` and `fuel_comsumption_km`
    'ColourExtInt',       # Reduced to `ExteriorColour`
    'CylindersinEngine',  # Duplicated by `Engine_cylinder_number`
]

In [22]:
# Remove every column named in `drop_columns`, rebinding `df` to the trimmed frame.
df = df.drop(columns=drop_columns)

In [23]:
# Preview the first five rows after cleaning and dropping columns.
df.head(5)

Unnamed: 0,Brand,Year,UsedOrNew,Transmission,DriveType,FuelType,Kilometres,BodyType,Doors,Seats,Price,Engine_cylinder_number,Engine_total_volume,ExteriorColour,fuel_comsumption_liter,fuel_comsumption_km
0,Ssangyong,2022.0,DEMO,Automatic,AWD,Diesel,5595.0,SUV,4.0,7.0,51990.0,4.0,2.2,White,8.7,100.0
1,MG,2022.0,USED,Automatic,Front,Premium,16.0,Hatchback,5.0,5.0,19990.0,4.0,1.5,Black,6.7,100.0
2,BMW,2022.0,USED,Automatic,Rear,Premium,8472.0,Coupe,2.0,4.0,108988.0,4.0,2.0,Grey,6.6,100.0
3,Mercedes-Benz,2011.0,USED,Automatic,Rear,Premium,136517.0,Coupe,2.0,4.0,32990.0,8.0,5.5,White,11.0,100.0
4,Renault,2022.0,USED,Automatic,Front,Unleaded,1035.0,SUV,4.0,5.0,34990.0,4.0,1.3,Grey,6.0,100.0


# Handle missing (NaN) values

In [24]:
# Percentage of missing (NaN) entries per column: missing count / row count * 100.
missing_counts = df.isna().sum()
missing_counts.div(df.shape[0]).mul(100)

Brand                      0.005976
Year                       0.005976
UsedOrNew                  0.005976
Transmission               1.505916
DriveType                  0.005976
FuelType                   0.005976
Kilometres                 3.537708
BodyType                   1.685192
Doors                      9.585275
Seats                     10.188837
Price                      0.316720
Engine_cylinder_number    10.648978
Engine_total_volume       10.654954
ExteriorColour             0.005976
fuel_comsumption_liter    10.147006
fuel_comsumption_km       10.147006
dtype: float64

In [25]:
# Impute missing values, handling numeric and categorical columns separately.

# A row with no values at all has nothing to impute from; imputing it would invent a
# complete record. Drop such rows first (no-op when there are none) and renumber the index.
df = df.dropna(how='all').reset_index(drop=True)

# Numeric columns: fill each gap with KNNImputer using 5 neighbours.
numerical_columns = df.select_dtypes(include=['number']).columns.tolist()
numerical_df = df[numerical_columns]
knnimputer = KNNImputer(n_neighbors=5)
# `fit_transform` returns a bare array, so restore the column names and the row index
# explicitly; the two imputed frames are joined on that index below.
numerical_cln_df = pd.DataFrame(
    knnimputer.fit_transform(numerical_df),
    columns=numerical_df.columns,
    index=numerical_df.index,
)

# Categorical (object) columns: fill each gap with the column's most frequent value.
categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
categorical_df = df[categorical_columns]
categorical_imputer = SimpleImputer(strategy='most_frequent')
categorical_cln_df = pd.DataFrame(
    categorical_imputer.fit_transform(categorical_df),
    columns=categorical_df.columns,
    index=categorical_df.index,
)

# Recombine on the shared row index: categorical columns first, numeric columns after.
df = pd.merge(categorical_cln_df, numerical_cln_df, right_index=True, left_index=True)

In [26]:
# Percentage of missing (NaN) entries per column: missing count / row count * 100.
missing_counts = df.isna().sum()
missing_counts.div(df.shape[0]).mul(100)

Brand                     0.0
UsedOrNew                 0.0
Transmission              0.0
DriveType                 0.0
FuelType                  0.0
BodyType                  0.0
ExteriorColour            0.0
Year                      0.0
Kilometres                0.0
Doors                     0.0
Seats                     0.0
Price                     0.0
Engine_cylinder_number    0.0
Engine_total_volume       0.0
fuel_comsumption_liter    0.0
fuel_comsumption_km       0.0
dtype: float64

In [27]:
# Store `Year` as an integer. Round first: `astype(int)` on its own truncates toward zero,
# so a fractional imputed year (e.g. 2016.8) would be pushed down to 2016 instead of 2017.
df['Year'] = df['Year'].round().astype(int)

In [28]:
# Show 10 random rows as a spot check. A fixed `random_state` makes the same rows appear
# on every run, so the displayed output is reproducible.
df.sample(10, random_state=42)

Unnamed: 0,Brand,UsedOrNew,Transmission,DriveType,FuelType,BodyType,ExteriorColour,Year,Kilometres,Doors,Seats,Price,Engine_cylinder_number,Engine_total_volume,fuel_comsumption_liter,fuel_comsumption_km
10187,Toyota,USED,Automatic,Front,Unleaded,Hatchback,Silver,2022,31600.0,5.0,5.0,35990.0,3.0,1.5,4.9,100.0
3951,Toyota,USED,Automatic,Front,Unleaded,SUV,Red,2019,64766.0,4.0,7.0,49990.0,6.0,3.5,9.1,100.0
7500,Holden,USED,Automatic,Front,Unleaded,SUV,Grey,2017,160351.0,4.0,5.0,18799.0,4.0,1.4,6.9,100.0
13129,Chery,USED,Manual,Front,Unleaded,SUV,Silver,2011,157000.0,4.0,5.0,7999.0,4.0,2.0,8.9,100.0
5214,Lexus,USED,Automatic,Rear,Premium,Coupe,Red,2017,87809.0,4.0,5.0,49995.0,4.0,2.32,7.68,100.0
1332,Toyota,USED,Automatic,Front,Unleaded,Hatchback,White,2017,77491.0,5.0,5.0,26590.0,4.0,1.8,6.1,100.0
16579,Ford,USED,Automatic,AWD,Diesel,SUV,Red,2016,135122.0,4.0,7.0,43990.0,5.0,3.2,8.5,100.0
12540,Mitsubishi,USED,Manual,4WD,Diesel,Ute Tray,White,2019,19647.0,4.0,5.0,43880.0,4.0,2.4,7.2,100.0
1248,Hyundai,USED,Automatic,4WD,Diesel,SUV,White,2012,139524.0,4.0,7.0,21990.0,4.0,2.2,7.5,100.0
684,Hyundai,USED,Automatic,Front,Unleaded,Wagon,Grey,2017,65128.0,4.0,5.4,23990.0,4.0,1.92,7.14,100.0


In [29]:
# Write the preprocessed frame to CSV, leaving out the row index.
preprocessed_csv_path = 'data/preprocessed_data.csv'
df.to_csv(preprocessed_csv_path, index=False)