# SC1015 DSAI Mini Project 

# Part 1: Data Cleaning and Preparation

In [1]:
# Basic libraries: numerical arrays (numpy), tabular data (pandas),
# and plotting (seaborn on top of matplotlib).
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
sb.set() # apply Seaborn's default theme globally so every later plot shares one style

# Setup: Import the Dataset

In [9]:
# Read the raw dataset from 'ford.csv' (path is relative to the notebook's
# working directory) into a DataFrame.
cardata = pd.read_csv(filepath_or_buffer='ford.csv')

# Bare last expression: the notebook renders a truncated preview of the frame.
cardata

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,Fiesta,2017,12000,Automatic,15944,Petrol,150,57.7,1.0
1,Focus,2018,14000,Manual,9083,Petrol,150,57.7,1.0
2,Focus,2017,13000,Manual,12456,Petrol,150,57.7,1.0
3,Fiesta,2019,17500,Manual,10460,Petrol,145,40.3,1.5
4,Fiesta,2019,16500,Automatic,1482,Petrol,145,48.7,1.0
...,...,...,...,...,...,...,...,...,...
17961,B-MAX,2017,8999,Manual,16700,Petrol,150,47.1,1.4
17962,B-MAX,2014,7499,Manual,40700,Petrol,30,57.7,1.0
17963,Focus,2015,9999,Manual,7010,Diesel,20,67.3,1.6
17964,KA,2018,8299,Manual,5007,Petrol,145,57.7,1.2


In [11]:
# Summarise the raw frame: column names, non-null counts and dtypes.
# Used here to decide which columns to keep and whether any values are missing.
cardata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17966 entries, 0 to 17965
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         17966 non-null  object 
 1   year          17966 non-null  int64  
 2   price         17966 non-null  int64  
 3   transmission  17966 non-null  object 
 4   mileage       17966 non-null  int64  
 5   fuelType      17966 non-null  object 
 6   tax           17966 non-null  int64  
 7   mpg           17966 non-null  float64
 8   engineSize    17966 non-null  float64
dtypes: float64(2), int64(4), object(3)
memory usage: 1.2+ MB


# 1. Feature Selection

In [15]:
# Keep only the seven features listed below; any column of `cardata` that is
# not named here is left out of the subset. Row order and index are unchanged.
cardata_subset = cardata.loc[
    :,
    ['model', 'year', 'price', 'transmission', 'mileage', 'fuelType', 'engineSize'],
]

# Preview the reduced frame.
cardata_subset

Unnamed: 0,model,year,price,transmission,mileage,fuelType,engineSize
0,Fiesta,2017,12000,Automatic,15944,Petrol,1.0
1,Focus,2018,14000,Manual,9083,Petrol,1.0
2,Focus,2017,13000,Manual,12456,Petrol,1.0
3,Fiesta,2019,17500,Manual,10460,Petrol,1.5
4,Fiesta,2019,16500,Automatic,1482,Petrol,1.0
...,...,...,...,...,...,...,...
17961,B-MAX,2017,8999,Manual,16700,Petrol,1.4
17962,B-MAX,2014,7499,Manual,40700,Petrol,1.0
17963,Focus,2015,9999,Manual,7010,Diesel,1.6
17964,KA,2018,8299,Manual,5007,Petrol,1.2


# 2. Checking for NaN Values (None Found, So Nothing to Drop)

In [18]:
# Check whether any NaN values exist anywhere in the subset:
# build the boolean missing-value mask, flatten it to an array, and reduce
# it to a single True/False.
cardata_subset.isna().to_numpy().any()

False

# 3. Converting "year" to object type

In [23]:
# Treat 'year' as a categorical label rather than a number.
# astype() returns a NEW object and never modifies the frame in place, so the
# result has to be bound back to the name; calling it without assignment
# leaves 'year' as int64 on a fresh kernel. Rebinding the whole frame (rather
# than assigning into a column of a slice) also avoids SettingWithCopyWarning
# and keeps the cell idempotent on re-run.
cardata_subset = cardata_subset.astype({'year': 'object'})

# Confirm the dtype change: 'year' should now be listed as object.
cardata_subset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17966 entries, 0 to 17965
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         17966 non-null  object 
 1   year          17966 non-null  object 
 2   price         17966 non-null  int64  
 3   transmission  17966 non-null  object 
 4   mileage       17966 non-null  int64  
 5   fuelType      17966 non-null  object 
 6   engineSize    17966 non-null  float64
dtypes: float64(1), int64(2), object(4)
memory usage: 982.6+ KB


# 4. Split Dataset into Numerical and Categorical

In [29]:
# Partition the subset by dtype: numeric columns go to one frame,
# object-typed columns to the other.
car_numeric_data = cardata_subset.select_dtypes(include='number')
car_cat_data = cardata_subset.select_dtypes(include='object')

# Show which columns landed in each frame, one Index per line.
print(car_numeric_data.columns, car_cat_data.columns, sep='\n')

Index(['price', 'mileage', 'engineSize'], dtype='object')
Index(['model', 'year', 'transmission', 'fuelType'], dtype='object')


# 5. Saving the cleaned data to a new CSV file

In [32]:
# Write the cleaned subset to disk as comma-separated UTF-8 text.
# NOTE: `index` is left at its default, so the row index is written as an
# unnamed first column; read the file back with `index_col=0` to restore it.
# NOTE(review): CSV stores no dtype information, so the object dtype given to
# 'year' will presumably need to be re-applied after loading - confirm in the
# notebook that consumes this file.
cardata_subset.to_csv(
    path_or_buf="cleaned_cardata.csv",
    sep=',',
    encoding='utf-8',
)