# Read data from file

In [1]:
import numpy as np
import pandas as pd
import json
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder


# Load the scraped listings workbook. The data was scraped from Dealgara.com
# by a fellow student of the AI/ML course and shared in the discussion forum.
with open("ScrappedData.xlsx", mode="rb") as workbook:
    data = pd.read_excel(workbook)

# Preview the first few rows
data.head()

Unnamed: 0,Brand,Model,Model_year,Transmission,Engine_size(cc),Drivetrain,Fuel_type,Colour,Lot_no,Kilometer,Status,Price
0,Chevrolet,aveo,2009.0,Manual,1150.0,2WD,Petrol,golden,6.0,200,USED,Rs. 975000
1,Chevrolet,captiva,2015.0,Automatic,2000.0,4WD,Diesel,Grey,8.0,100,USED,Rs. 3000000
2,Chevrolet,aveo,2009.0,Manual,1150.0,2WD,Petrol,golden,6.0,42000,USED,Rs. 975000
3,Chevrolet,captiva,2010.0,Automatic,2000.0,4WD,Diesel,Grey,8.0,51000,USED,Rs. 3000000
4,Chevrolet,spark,2012.0,Manual,1000.0,2WD,Petrol,Sky Blue,9.0,38000,USED,Rs. 1000000


# Data preprocessing

In [2]:
# Show every row in which at least one column is missing
# (this also surfaces completely empty rows)
data.loc[data.isna().any(axis="columns")]

Unnamed: 0,Brand,Model,Model_year,Transmission,Engine_size(cc),Drivetrain,Fuel_type,Colour,Lot_no,Kilometer,Status,Price
1199,,,,,,,,,,,,
1212,,,,,,,,,,,,
1216,,,,,,,,,,,,
1219,,,,,,,,,,,,
1220,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
2189,,,,,,,,,,,,
2191,,,,,,,,,,,,
2197,,,,,,,,,,,,
2213,,,,,,,,,,,,


In [3]:
# Dropping column Status, as all the data is of used cars, and then dropping
# rows containing NaN/missing values.
#
# The result is rebound to `data` instead of mutating in place, and
# errors="ignore" tolerates an already-removed Status column, so this cell
# can be re-run without raising KeyError.
data = data.drop(columns="Status", errors="ignore").dropna()

In [4]:
# Correlation between the numerical features only. The numeric columns are
# selected explicitly because recent pandas releases (>= 2.0) no longer skip
# string columns silently in DataFrame.corr() and raise instead.
numeric_corr = data.select_dtypes(include="number").corr()

# since Model_year and Lot_no are highly correlated, dropping Model_year
# (rebinding with errors="ignore" keeps the cell safe to re-run)
data = data.drop(columns="Model_year", errors="ignore")

# Keep the matrix as the last expression of the cell so that it is rendered;
# a bare expression in the middle of a cell produces no output.
numeric_corr

In [5]:
# Look at the column dtypes to spot the categorical features
data.dtypes

# Categorical features: Brand, Model, Transmission, Drivetrain, Fuel_type, Colour.
# Drivetrain is treated as ordinal and gets integer codes from a label
# encoder; the remaining (nominal) features are one-hot encoded later on.
le = LabelEncoder()
le.fit(data["Drivetrain"])
data["Drivetrain"] = le.transform(data["Drivetrain"])

In [6]:
# Nominal columns that are one-hot encoded in the next step
categories = [
    'Brand',
    'Model',
    'Transmission',
    'Fuel_type',
    'Colour',
]

In [7]:
# One-hot encode the nominal columns; pd.get_dummies plays the same role
# here as sklearn's OneHotEncoder and returns a dense frame.
encoded_data = pd.get_dummies(
    data=data,
    columns=categories,
    sparse=False,
)

In [8]:
# A Kilometer entry consisting of a single space stands for a missing
# reading; it is replaced by 0 so the column can be converted to integers.
encoded_data.loc[
    encoded_data['Kilometer'].eq(" "),
    'Kilometer',
] = 0

In [9]:
# Kilometer is stored with dtype object; go through str first so that every
# entry is uniform text, then parse the text as integers.
encoded_data['Kilometer'] = (
    encoded_data['Kilometer']
    .astype('str')
    .astype('int')
)

In [10]:
# Separate the label: pop() removes Price from the feature frame and
# returns it as the target series.
target = encoded_data.pop("Price")

# Prices are strings such as "Rs. 975000"; keep the part after the first
# whitespace gap and parse it as an integer amount in rupees.
target = target.map(lambda price_text: int(price_text.split(maxsplit=1)[1]))

# Model training

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    encoded_data,
    target,
    test_size=0.2,
    random_state=42,
)

In [12]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares regressor used to predict the car price
MLmodel = LinearRegression()

# Fit on the training split; fit() is kept as the cell's last expression so
# the fitted estimator is displayed.
MLmodel.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

# Model testing

In [13]:
# R2 (coefficient of determination) on the held-out split;
# values closer to 1 indicate a better fit.
MLmodel.score(X=X_test, y=y_test)

0.5608481203870559