In [324]:
# import modules and libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [325]:
# Read the car listings from CSV and preview the first ten rows
data = pd.read_csv(filepath_or_buffer='car_data.csv')
data.head(n=10)

Unnamed: 0,name,price,mileage,colour,transmission
0,2000 BUICK CENTURY CUSTOM Sedan 4 Door,357,1.0,CHAMPAGNE,Automatic
1,2004 HONDA CIVIC LX,850,134095.0,GRAY,AUTOMATIC
2,1993 FORD MUSTANG LX,975,99086.0,WHITE,AUTOMATIC
3,1998 HONDA CR-V LX,925,194018.0,BLUE,AUTOMATIC
4,1999 CHEVROLET TAHOE K1500 Wagon 4 Door,750,264054.0,GRAY,Automatic
5,2004 HONDA CIVIC EX,1000,0.0,GRAY,AUTOMATIC
6,2009 DODGE JOURNEY SE Wagon 4 Door,1000,216676.0,GREEN,Automatic
7,2002 HONDA ACCORD EX,725,218985.0,BLACK,AUTOMATIC
8,2002 DODGE DAKOTA BAS,900,157868.0,BLUE,AUTOMATIC
9,2006 HYUNDAI ACCENT GLS,950,145902.0,BLUE,AUTOMATIC


In [326]:
# Dimensions of the raw dataset as (rows, columns)
data.shape

(2054, 5)

In [327]:
# Summary statistics for the numeric columns (price is int64, mileage is float64)
data.describe()

Unnamed: 0,price,mileage
count,2054.0,2034.0
mean,762.33739,159460.018191
std,167.18684,76203.903076
min,300.0,0.0
25%,650.0,132101.0
50%,775.0,165667.5
75%,900.0,206145.0
max,1000.0,415811.0


In [328]:
# Summary of the text columns: non-null count, number of distinct values,
# most common value and how often it occurs
data[['name', 'colour', 'transmission']].describe()

Unnamed: 0,name,colour,transmission
count,2054,2043,2036
unique,455,23,6
top,2006 CADILLAC DTS Sedan 4 Door,SILVER,AUTOMATIC
freq,24,362,978


In [329]:
# Column dtypes and non-null counts of the raw dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2054 entries, 0 to 2053
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   name          2054 non-null   object 
 1   price         2054 non-null   int64  
 2   mileage       2034 non-null   float64
 3   colour        2043 non-null   object 
 4   transmission  2036 non-null   object 
dtypes: float64(1), int64(1), object(3)
memory usage: 80.4+ KB


In [330]:
# Number of missing values in each column
data.isnull().sum()

name             0
price            0
mileage         20
colour          11
transmission    18
dtype: int64

We have to deal with the null values in the mileage, colour and transmission columns.

## Name

Let us create two new columns: 'make', which holds the car's manufacturer, and 'year', which holds the year the car was produced. Both are parsed from the 'name' column.

In [331]:
# The manufacturer is the second space-separated token of the listing name
# (the first token is the model year). The vectorised .str accessor replaces the
# row-wise lambda and yields NaN for a missing name instead of raising.
# Two-word manufacturers are cut to their first word here (e.g. 'LAND').
data['make'] = data['name'].str.split(' ').str[1]
data['make'].head()

0        BUICK
1        HONDA
2         FORD
3        HONDA
4    CHEVROLET
Name: make, dtype: object

In [332]:
# Distinct manufacturer values parsed from the listing names
data['make'].unique()

array(['BUICK', 'HONDA', 'FORD', 'CHEVROLET', 'DODGE', 'HYUNDAI',
       'CHRYSLER', 'DATSUN', 'CADILLAC', 'BMW', 'ACURA', 'AUDI',
       'Chevrolet', 'JEEP', 'KIA', 'GMC', 'MERCURY', 'LINCOLN', 'SUBARU',
       'NISSAN', 'OLDSMOBILE', 'TOYOTA', 'SATURN', 'MERCEDES-BENZ',
       'MAZDA', 'LEXUS', 'INFINITI', 'MITSUBISHI', 'SUZUKI', 'LAND',
       'PONTIAC'], dtype=object)

This column looks good, but the "LAND" value is not the name of a car manufacturer. We will check the name column to see the correct name of the manufacturer. Also, 'Chevrolet' appears in mixed case, so we have to change it to 'CHEVROLET' to match the other rows.

In [333]:
# Show the full listing names of the rows whose make was parsed as 'LAND'
data.loc[data['make'] == 'LAND', 'name']

1471    2008 LAND ROVER LR2 SE TEC
1883    2008 LAND ROVER LR2 SE TEC
Name: name, dtype: object

The correct manufacturer is "LAND ROVER", so we have to replace the value. It is stored in hyphenated form as 'LAND-ROVER'.

In [334]:
# Map the truncated make 'LAND' to the full manufacturer label
data['make'] = data['make'].replace({'LAND': 'LAND-ROVER'})

Next, we change the string Chevrolet to CHEVROLET

In [335]:
# Bring the mixed-case spelling in line with the upper-case makes
data['make'] = data['make'].replace({'Chevrolet': 'CHEVROLET'})

Next, we extract the year from the 'name' column

In [336]:
# The model year is the first space-separated token of the listing name.
# The vectorised .str accessor replaces the row-wise lambda and yields NaN for a
# missing name instead of raising. The result is still a string at this point.
data['year'] = data['name'].str.split(' ').str[0]
data['year'].head()

0    2000
1    2004
2    1993
3    1998
4    1999
Name: year, dtype: object

In [337]:
# Preview the frame with the new 'make' and 'year' columns
data.head()

Unnamed: 0,name,price,mileage,colour,transmission,make,year
0,2000 BUICK CENTURY CUSTOM Sedan 4 Door,357,1.0,CHAMPAGNE,Automatic,BUICK,2000
1,2004 HONDA CIVIC LX,850,134095.0,GRAY,AUTOMATIC,HONDA,2004
2,1993 FORD MUSTANG LX,975,99086.0,WHITE,AUTOMATIC,FORD,1993
3,1998 HONDA CR-V LX,925,194018.0,BLUE,AUTOMATIC,HONDA,1998
4,1999 CHEVROLET TAHOE K1500 Wagon 4 Door,750,264054.0,GRAY,Automatic,CHEVROLET,1999


## Colour

In [338]:
# List the distinct colour values to check for inconsistent labels
data['colour'].unique()

array(['CHAMPAGNE', 'GRAY', 'WHITE', 'BLUE', 'GREEN', 'BLACK', 'BEIGE',
       'RED', 'SILVER', 'PURPLE', 'MAROON', 'TEAL', 'BROWN', 'TAN',
       'BURGUNDY', 'GOLD', 'DARK BLUE', 'LIGHT BLUE', 'YELLOW', 'ORANGE',
       'CHARCOAL', nan, 'TWO TONE', 'CREAM'], dtype=object)

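The colour labels are consistently formatted, so no relabelling is needed. The column still contains missing values (nan), which are filled in later.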

## Transmission

In [339]:
# List the distinct transmission values to check for inconsistent labels
data['transmission'].unique()

array(['Automatic', 'AUTOMATIC', 'MANUAL', 'Missing', 'Unknown', 'Manual',
       nan], dtype=object)

This column contains placeholder values like 'Missing' and 'Unknown' that do not describe a transmission. We first have to change them to nan.

In [340]:
# 'Missing' and 'Unknown' are placeholders rather than transmission types, so
# turn them into real missing values in one pass.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
data['transmission'] = data['transmission'].replace(['Missing', 'Unknown'], np.nan)

Now, we change 'Automatic' to 'AUTOMATIC' and 'Manual' to 'MANUAL'

In [341]:
# Fold the title-case spellings into the upper-case labels
data['transmission'] = data['transmission'].replace(
    {'Automatic': 'AUTOMATIC', 'Manual': 'MANUAL'}
)

We can't work with the year column as it is, because it is stored as text. We convert it to an integer and then derive the car's age from it, which is easier to interpret.

In [342]:
# Cast the extracted year strings to integers so they can be used in arithmetic
data['year'] = (
    data['year']
    .astype(str)
    .astype(int)
)

In [343]:
# Fixed reference year used to compute each car's age.
# NOTE(review): hard-coded; presumably the year the analysis was run, so the
# ages will not change on later runs. Confirm this is intended.
Current_Year = 2021

In [344]:
# Age in years: reference year minus model year (rsub computes Current_Year - year)
data['age'] = data['year'].rsub(Current_Year)

In [345]:
# Re-check dtypes and non-null counts now that 'make', 'year' and 'age' exist
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2054 entries, 0 to 2053
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   name          2054 non-null   object 
 1   price         2054 non-null   int64  
 2   mileage       2034 non-null   float64
 3   colour        2043 non-null   object 
 4   transmission  1897 non-null   object 
 5   make          2054 non-null   object 
 6   year          2054 non-null   int32  
 7   age           2054 non-null   int32  
dtypes: float64(1), int32(2), int64(1), object(4)
memory usage: 112.5+ KB


Reorder the columns to make the dataframe easy to understand

In [346]:
# Reorder the columns: identifying fields first, price last.
# .copy() makes the reordered frame independent of the frame it was selected
# from, so later column assignments do not raise SettingWithCopyWarning.
data = data[['name', 'make', 'year', 'age', 'mileage', 'transmission', 'colour', 'price']].copy()
data.head()

Unnamed: 0,name,make,year,age,mileage,transmission,colour,price
0,2000 BUICK CENTURY CUSTOM Sedan 4 Door,BUICK,2000,21,1.0,AUTOMATIC,CHAMPAGNE,357
1,2004 HONDA CIVIC LX,HONDA,2004,17,134095.0,AUTOMATIC,GRAY,850
2,1993 FORD MUSTANG LX,FORD,1993,28,99086.0,AUTOMATIC,WHITE,975
3,1998 HONDA CR-V LX,HONDA,1998,23,194018.0,AUTOMATIC,BLUE,925
4,1999 CHEVROLET TAHOE K1500 Wagon 4 Door,CHEVROLET,1999,22,264054.0,AUTOMATIC,GRAY,750


In [347]:
# Fill missing values: median for the numerical feature, mode for the
# categorical features.
# The median is used for mileage (rather than the mean) because the column
# contains placeholder-like readings such as 0.0 and 1.0 that pull the mean down.
data['mileage'] = data['mileage'].fillna(data['mileage'].median())
# mode() returns a Series (there can be ties); [0] takes the first most common value
data['colour'] = data['colour'].fillna(data['colour'].mode()[0])
data['transmission'] = data['transmission'].fillna(data['transmission'].mode()[0])

In [348]:
# Confirm that no missing values remain in any column
data.isnull().sum()

name            0
make            0
year            0
age             0
mileage         0
transmission    0
colour          0
price           0
dtype: int64

In [349]:
# Write the cleaned frame to disk without the row index
data.to_csv(path_or_buf='cleaned_car_data.csv', index=False)