# Cars 4 You

### Import the needed libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# data partition
from sklearn.model_selection import train_test_split

# filter methods
# spearman 
# chi-square
import scipy.stats as stats
from scipy.stats import chi2_contingency

# wrapper methods
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_selection import RFE

# embedded methods
from sklearn.linear_model import LassoCV

# ignore warnings
# NOTE(review): filterwarnings('ignore') silences every warning category for the
# rest of the session (deprecation notices included) — consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

# set random seed for reproducibility
# Seeds NumPy's global random state; RSEED is kept as a named constant so the same
# value can also be passed explicitly wherever a seed argument is accepted.
RSEED = 42
np.random.seed(RSEED)

### Import the dataset

In [2]:
# Load the test set (comma is already read_csv's default separator)
test_data = pd.read_csv('data/test.csv')

## Metadata

`carID` : An attribute that contains an identifier for each car <br>
`Brand` : The car's main brand (e.g. Ford, Toyota) <br>
`model` : The car model <br>
`year`: The year of Registration of the Car <br>
`transmission`: The car's transmission type (e.g. Manual, Automatic, Semi-Auto) <br>
`mileage` : The total reported distance travelled by the car (in miles) <br>
`tax` : The amount of road tax (in £) that, in 2020, was applicable to the car in question <br>
`fuelType`: Type of Fuel used by the car (Diesel, Petrol, Hybrid, Electric) <br>
`mpg`: Average Miles per Gallon <br>
`engineSize`: Size of Engine in liters (Cubic Decimeters) <br>
`paintQuality%`: The mechanic's assessment of the car's overall paint quality and hull integrity (filled by the mechanic during evaluation) <br>
`previousOwners`: Number of previous registered owners of the vehicle. <br>
`hasDamage`: Boolean marker filled by the seller at the time of registration stating whether the car is damaged or not <br>
`price`: The car's price when purchased by Cars 4 You (in £) <br>

### Explore the data

In [3]:
# Look at the first five rows of the data
# (the preview already shows suspicious entries: a fractional year, a negative
# mileage and a lower-case 'petrol')
test_data.head()

Unnamed: 0,carID,Brand,model,year,transmission,mileage,fuelType,tax,mpg,engineSize,paintQuality%,previousOwners,hasDamage
0,89856,Hyundai,I30,2022.878006,Automatic,30700.0,petrol,205.0,41.5,1.6,61.0,3.0,0.0
1,106581,VW,Tiguan,2017.0,Semi-Auto,-48190.655673,Petrol,150.0,38.2,2.0,60.0,2.0,0.0
2,80886,BMW,2 Series,2016.0,Automatic,36792.0,Petrol,125.0,51.4,1.5,94.0,2.0,0.0
3,100174,Opel,Grandland X,2019.0,Manual,5533.0,Petrol,145.0,44.1,1.2,77.0,1.0,0.0
4,81376,BMW,1 Series,2019.0,Semi-Auto,9058.0,Diesel,150.0,51.4,2.0,45.0,4.0,0.0


In [4]:
# Look at the last five rows of the data
test_data.tail()

Unnamed: 0,carID,Brand,model,year,transmission,mileage,fuelType,tax,mpg,engineSize,paintQuality%,previousOwners,hasDamage
32562,105775,VW,Tiguan,2017.0,Manual,27575.0,Petrol,145.0,46.3,1.4,94.0,1.0,0.0
32563,81363,BMW,X2,2020.0,Automatic,1980.0,Petrol,145.0,34.0,2.0,39.0,3.0,0.0
32564,76833,Audi,Q5,2019.0,Semi-Auto,8297.0,Diesel,145.0,38.2,2.0,88.0,4.0,0.0
32565,91768,Mercedes,A Class,2019.0,Manual,-50755.21023,Petrol,145.0,28.5,1.3,81.0,1.0,0.0
32566,99627,Toyota,Yaris,2017.0,Automatic,11071.0,Petrol,30.0,58.0,1.3,98.0,4.0,0.0


In [5]:
# Define the variable carID as the new index.
# The guard keeps this cell re-runnable: once carID has been moved to the index it
# is no longer a column, and calling set_index('carID') again would raise KeyError.
if 'carID' in test_data.columns:
    test_data = test_data.set_index('carID')

In [6]:
# Check the number of rows and columns of the data, returned as (rows, columns).
# carID is the index at this point, so it is not counted among the columns.
test_data.shape

(32567, 12)

In [7]:
# Check the name of each column of the data
# (carID does not appear because it was moved to the index)
test_data.columns

Index(['Brand', 'model', 'year', 'transmission', 'mileage', 'fuelType', 'tax',
       'mpg', 'engineSize', 'paintQuality%', 'previousOwners', 'hasDamage'],
      dtype='object')

In [8]:
# Summarise each column's dtype and non-null count to spot missing values
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32567 entries, 89856 to 99627
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Brand           31918 non-null  object 
 1   model           31917 non-null  object 
 2   year            31914 non-null  float64
 3   transmission    31944 non-null  object 
 4   mileage         31878 non-null  float64
 5   fuelType        31911 non-null  object 
 6   tax             29259 non-null  float64
 7   mpg             29279 non-null  float64
 8   engineSize      31939 non-null  float64
 9   paintQuality%   31942 non-null  float64
 10  previousOwners  31970 non-null  float64
 11  hasDamage       31970 non-null  float64
dtypes: float64(8), object(4)
memory usage: 3.2+ MB


In [9]:
# Calculate the basic descriptive statistics of the data.
# include = "all" adds the object columns (count / unique / top / freq) next to
# the numeric summary; statistics that do not apply to a column show as NaN.
test_data.describe(include = "all")

Unnamed: 0,Brand,model,year,transmission,mileage,fuelType,tax,mpg,engineSize,paintQuality%,previousOwners,hasDamage
count,31918,31917,31914.0,31944,31878.0,31911,29259.0,29279.0,31939.0,31942.0,31970.0,31970.0
unique,64,593,,38,,29,,,,,,
top,Ford,Focus,,Manual,,Petrol,,,,,,
freq,6360,2721,,16312,,16113,,,,,,
mean,,,2017.102299,,22952.658921,,120.569239,55.210728,1.665377,64.446667,2.006118,0.0
std,,,2.207969,,22132.758713,,65.56057,17.644635,0.574467,21.142188,1.47231,0.0
min,,,1991.0,,-58540.574478,,-91.12163,-43.421768,-0.103493,1.638913,-2.34565,0.0
25%,,,2016.0,,7298.25,,125.0,46.3,1.2,47.0,1.0,0.0
50%,,,2017.0,,17225.5,,145.0,54.3,1.6,65.0,2.0,0.0
75%,,,2019.0,,32500.0,,145.0,62.8,2.0,82.0,3.0,0.0


In [10]:
# Print the distinct values found in every column, one column at a time
cols = test_data.columns
for name in cols:
    distinct_values = test_data[name].unique()
    print(f"{name}: \n{distinct_values}")

Brand: 
['Hyundai' 'VW' 'BMW' 'Opel' 'Ford' 'Mercedes' 'Skoda' 'Toyot' 'Toyota'
 'Audi' nan 'For' 'Ope' 'toyota' 'vw' 'hyundai' 'MW' 'SKODA' 'ord' 'udi'
 'bmw' 'V' 'BM' 'HYUNDAI' 'OPEL' 'mercedes' 'audi' 'Mercede' 'pel' 'opel'
 'FORD' 'yundai' 'ford' 'Aud' 'oyota' 'MERCEDES' 'ercedes' 'AUDI' 'koda'
 'Hyunda' 'W' 'skoda' 'Skod' 'ercede' 'TOYOTA' 'ERCEDES' 'kod' 'ORD' 'v'
 'ud' 'M' 'FOR' 'for' 'MERCEDE' 'YUNDAI' 'PEL' 'ope' 'or' 'TOYOT' 'hyunda'
 'oyot' 'UDI' 'mw' 'pe' 'bm']
model: 
[' I30' ' Tiguan' ' 2 Series' ' Grandland X' '1 Series' ' Fiesta' ' X1'
 ' B Class' ' Focus' ' Superb' ' 5 Series' ' C Class' ' Up' ' Aygo' 'Golf'
 ' M CLAS' ' Land Cruiser' ' TT' ' Adam' ' Zafira' ' E Class' ' Golf'
 ' 3 Series' ' IX20' ' A4' ' Yaris' ' Passat' ' I10' ' Mokka X'
 ' EcoSport' ' 1 Series' ' 4 Series' ' A7' ' Corsa' ' Kuga' ' Grand C-MAX'
 ' Q2' ' M4' ' A Class' ' RAV4' ' Fabia' ' Insignia' ' A1' ' X6' ' Meriva'
 ' Caravelle' ' Octavia' ' Auris' ' X-CLASS' ' FOCUS' ' Astra' ' V Class'
 ' Polo' 

In [14]:
# Check for any duplicated observations.
# duplicated() compares column values only; carID lives in the index here, so two
# cars with identical features but different IDs would be counted as duplicates.
test_data.duplicated().sum()

np.int64(0)

In [11]:
# Check the data types of the variables
# (every numeric column is float64, including year, previousOwners and hasDamage)
test_data.dtypes

Brand              object
model              object
year              float64
transmission       object
mileage           float64
fuelType           object
tax               float64
mpg               float64
engineSize        float64
paintQuality%     float64
previousOwners    float64
hasDamage         float64
dtype: object

In [12]:
# Check for any missing values: number of NaN entries per column
test_data.isna().sum()

Brand              649
model              650
year               653
transmission       623
mileage            689
fuelType           656
tax               3308
mpg               3288
engineSize         628
paintQuality%      625
previousOwners     597
hasDamage          597
dtype: int64

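Every column has missing values: roughly 2% of the rows in most columns and about 10% in `tax` and `mpg`. Before filling them in, let's split the columns into metric (numeric) and non-metric (categorical) features, since each group needs a different imputation strategy.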

In [15]:
# Categorical (text) columns are listed by hand ...
non_metric_features = ["Brand", "model", "transmission", "fuelType"]
# ... and every remaining column is treated as metric.
# NOTE(review): hasDamage is described as a boolean marker in the metadata, yet it
# ends up in metric_features here — confirm that this is intended.
metric_features = test_data.columns.drop(non_metric_features).to_list()

In [16]:
# Calculate the mean for each metric feature (NaN values are skipped by default).
# NOTE(review): computed on the uncleaned data, so the negative values reported by
# describe() for mileage, tax, mpg, engineSize and previousOwners are included.
means = test_data[metric_features].mean()
means

year               2017.102299
mileage           22952.658921
tax                 120.569239
mpg                  55.210728
engineSize            1.665377
paintQuality%        64.446667
previousOwners        2.006118
hasDamage             0.000000
dtype: float64

In [18]:
# Calculate the median for each metric feature (NaN values are skipped by default).
# NOTE(review): like the means, these are computed on the uncleaned data.
medians = test_data[metric_features].median()
medians

year               2017.0
mileage           17225.5
tax                 145.0
mpg                  54.3
engineSize            1.6
paintQuality%        65.0
previousOwners        2.0
hasDamage             0.0
dtype: float64

In [20]:
# Do an histogram to choose which method we will use to fill the missing values with
