# Used Car Price Prediction using kNN and DNN 

In [1]:
#global imports
import pandas as pd
import matplotlib.pyplot as plt

## Dataset Pre-Processing

### 100,000 UK Used Car Data set

#### Importing the data

In [2]:
# Folder that holds one CSV file per manufacturer. Keeping it in a single
# constant means the notebook can be pointed at another location in one place.
UK_CAR_DATA_DIR = 'datasets/UK_Used_Car_Set'


def read_brand_csv(brand):
    """Read ``<UK_CAR_DATA_DIR>/<brand>.csv`` and return it as a DataFrame.

    Parameters
    ----------
    brand : str
        File stem of the CSV to load (for example ``'audi'``).

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file, using ``pd.read_csv`` defaults.
    """
    return pd.read_csv(f'{UK_CAR_DATA_DIR}/{brand}.csv')


# One frame per manufacturer; the variable names are used by the cells below.
audi = read_brand_csv('audi')
bmw = read_brand_csv('bmw')
ford = read_brand_csv('ford')
hyundi = read_brand_csv('hyundi')
merc = read_brand_csv('merc')
skoda = read_brand_csv('skoda')
toyota = read_brand_csv('toyota')
vauxhall = read_brand_csv('vauxhall')
vw = read_brand_csv('vw')

#### Exploring the dataset

In [3]:
# Measure every brand frame in one pass; each count keeps its own name
(
    audi_length,
    bmw_length,
    ford_length,
    hyundi_length,
    merc_length,
    skoda_length,
    toyota_length,
    vauxhall_length,
    vw_length,
) = map(len, (audi, bmw, ford, hyundi, merc, skoda, toyota, vauxhall, vw))

# Label/count pairs, in the order they are reported
length_report = (
    ("Length of audi:", audi_length),
    ("Length of bmw:", bmw_length),
    ("Length of ford:", ford_length),
    ("Length of hyundi:", hyundi_length),
    ("Length of merc:", merc_length),
    ("Length of skoda:", skoda_length),
    ("Length of toyota:", toyota_length),
    ("Length of vauxhall:", vauxhall_length),
    ("Length of vw:", vw_length),
)
for label, size in length_report:
    print(label, size)

# Combined number of rows across all brand frames
total_length = sum(size for _, size in length_report)
print("\nTotal size of the Dataset:", total_length)

Length of audi: 10668
Length of bmw: 10781
Length of ford: 17965
Length of hyundi: 4860
Length of merc: 13119
Length of skoda: 6267
Length of toyota: 6738
Length of vauxhall: 13632
Length of vw: 15157

Total size of the Dataset: 99187


In [4]:
# Summarise the Audi frame: column names, dtypes and non-null counts
audi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10668 entries, 0 to 10667
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         10668 non-null  object 
 1   year          10668 non-null  int64  
 2   price         10668 non-null  int64  
 3   transmission  10668 non-null  object 
 4   mileage       10668 non-null  int64  
 5   fuelType      10668 non-null  object 
 6   tax           10668 non-null  int64  
 7   mpg           10668 non-null  float64
 8   engineSize    10668 non-null  float64
dtypes: float64(2), int64(4), object(3)
memory usage: 750.2+ KB


In [5]:
# Heading for the per-column listing printed further down in this cell
print('The unique values in the Audi dataset are:\n')
def get_unique_values(dataset):
    """Map every column name of ``dataset`` to the array of its distinct values.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are inspected one by one.

    Returns
    -------
    dict
        ``{column name: dataset[column].unique()}``, with keys in the same
        order as ``dataset.columns``.
    """
    return {name: dataset[name].unique() for name in dataset.columns}

# Compute the per-column distinct values once, then report them column by column
unique_values_result = get_unique_values(audi)
for column in unique_values_result:
    unique_values = unique_values_result[column]
    print(f"Unique values in the '{column}' column:")
    print(unique_values)
    print("\n")

The unique values in the Audi dataset are:

Unique values in the 'model' column:
[' A1' ' A6' ' A4' ' A3' ' Q3' ' Q5' ' A5' ' S4' ' Q2' ' A7' ' TT' ' Q7'
 ' RS6' ' RS3' ' A8' ' Q8' ' RS4' ' RS5' ' R8' ' SQ5' ' S8' ' SQ7' ' S3'
 ' S5' ' A2' ' RS7']


Unique values in the 'year' column:
[2017 2016 2019 2015 2014 2018 2013 2020 2004 2009 2012 2010 2007 2011
 2008 2003 2005 2002 2006 1998 1997]


Unique values in the 'price' column:
[12500 16500 11000 ... 21291 12380  3750]


Unique values in the 'transmission' column:
['Manual' 'Automatic' 'Semi-Auto']


Unique values in the 'mileage' column:
[15735 36203 29946 ...  4018  1978  8646]


Unique values in the 'fuelType' column:
['Petrol' 'Diesel' 'Hybrid']


Unique values in the 'tax' column:
[150  20  30 145 125 200   0 205 160 235 260 325 300 165 240 565 265 135
 570 555 140 330 305 155 580 290 195 115 295 220 230 280 315 535 190 540
 515]


Unique values in the 'mpg' column:
[ 55.4  64.2  67.3  49.6  58.9  61.4  70.6  60.1  57.6  52.3  53

Getting a clearer picture of which engine sizes occur in the Audi dataset and how often

In [6]:
# Number of Audi rows for each distinct engineSize value
engine_size_counts = audi['engineSize'].value_counts()
print(engine_size_counts)


2.0    5169
1.4    1594
3.0    1149
1.6     913
1.5     744
1.0     558
4.0     154
1.8     126
2.5      61
0.0      57
2.9      49
1.2      31
4.2      25
5.2      23
3.2       5
1.9       4
2.7       3
4.1       2
6.3       1
Name: engineSize, dtype: int64


Checking the total number of cars with an engine size of 0.0 across all datasets; such entries would indicate electric cars

In [7]:
# Every brand frame, collected in one list so the cells below can loop over them
datasets = [audi, bmw, ford, hyundi, merc, skoda, toyota, vauxhall, vw]

# The engineSize value being counted; used for both the comparison and the message
engine_size_zero = 0.0

# Count the matching rows of each frame with a boolean mask and add them up.
# Comparing against `engine_size_zero` (instead of a second hard-coded 0.0) keeps
# the count and the printed value in sync, and no helper Series is created, so
# `engine_size_counts` from the previous cell is left untouched.
count = sum(int((df['engineSize'] == engine_size_zero).sum()) for df in datasets)

# Print the total number of engine sizes equal to 0.0 across all datasets
print(f"Total number of engine sizes equal to {engine_size_zero} across all datasets: {count}")

Total number of engine sizes equal to 0.0 across all datasets: 273


Checking for null values

In [8]:
# Each frame contributes its overall number of missing cells
# (per-column null counts summed again); add those figures together
total_null_values = sum(frame.isnull().sum().sum() for frame in datasets)

print(f"Total number of null values across all datasets: {total_null_values}")

Total number of null values across all datasets: 0


Known things to remove in pre-processing:
* Engine size 0 - supposed to be for electric cars; if enough values are found, these rows could be removed and made into their own dataset

#### Pre-Processing the dataset

Extracting the cars with an engine size of 0 and moving them into a separate dataset

In [21]:
# Split the rows with engineSize == 0 out of every frame in `datasets`.
# The matching rows are gathered into one new frame, `electric`, and removed
# from the frame they came from.
#
# NOTE(review): this cell mutates the source frames in place and appends to
# `datasets`, so it is not idempotent. Running it a second time appends yet
# another frame to `datasets` (the previously appended one is emptied into it).
# Prefer Restart Kernel -> Run All over re-running this cell on its own.

# Zero-engine slices, one per source frame; concatenated once after the loop
electric_parts = []

# Iterate through the DataFrames
for df in datasets:
    # Extract rows where engineSize is 0 (boolean indexing returns a new frame,
    # so the slice is unaffected by the drop below)
    electric_df = df[df['engineSize'] == 0]
    electric_parts.append(electric_df)

    # Remove the extracted rows from the original DataFrame. This is done in
    # place so that the brand variables and the entries of `datasets`, which
    # are the same objects, both reflect the removal.
    df.drop(electric_df.index, inplace=True)

# Build the combined frame with a single concat instead of growing a DataFrame
# inside the loop, which re-copies the accumulated rows on every iteration.
# pd.concat raises on an empty list, so fall back to an empty frame.
if electric_parts:
    electric = pd.concat(electric_parts, ignore_index=True)
else:
    electric = pd.DataFrame()

# Add the 'electric' DataFrame to the list of DataFrames
datasets.append(electric)

In [27]:
# Preview the first rows of the last entry in `datasets` (the most recently appended frame)
datasets[-1].head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,tax(£)
0,Q5,2019,44790,Automatic,5886,Petrol,135.0,117.7,0.0,
1,Q3,2019,32788,Automatic,1500,Diesel,145.0,47.1,0.0,
2,Q3,2020,29944,Manual,1500,Petrol,145.0,40.9,0.0,
3,Q3,2020,33333,Automatic,1500,Diesel,145.0,47.1,0.0,
4,Q3,2020,29944,Automatic,1500,Petrol,145.0,32.5,0.0,


In [28]:
# Column dtypes and non-null counts for the last entry in `datasets`
datasets[-1].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17472 entries, 0 to 17471
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         17472 non-null  object 
 1   year          17472 non-null  int64  
 2   price         17472 non-null  int64  
 3   transmission  17472 non-null  object 
 4   mileage       17472 non-null  int64  
 5   fuelType      17472 non-null  object 
 6   tax           14464 non-null  float64
 7   mpg           17472 non-null  float64
 8   engineSize    17472 non-null  float64
 9   tax(£)        3008 non-null   float64
dtypes: float64(4), int64(3), object(3)
memory usage: 1.3+ MB
