In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the used-car listings from a path relative to the notebook and preview the first rows.
df = pd.read_csv('data/cars.csv')
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [3]:
# Check column dtypes and non-null counts to spot missing values or mis-typed columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   brand          8128 non-null   object
 1   km_driven      8128 non-null   int64 
 2   fuel           8128 non-null   object
 3   owner          8128 non-null   object
 4   selling_price  8128 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 317.6+ KB


In [4]:
# Summary statistics for the numeric columns (km_driven, selling_price).
# NOTE(review): km_driven spans 1 to 2,360,457 in this summary - likely outliers; confirm
# whether they should be filtered or capped before fitting a linear model.
df.describe()

Unnamed: 0,km_driven,selling_price
count,8128.0,8128.0
mean,69819.51,638271.8
std,56550.55,806253.4
min,1.0,29999.0
25%,35000.0,254999.0
50%,60000.0,450000.0
75%,98000.0,675000.0
max,2360457.0,10000000.0


In [5]:
# Missing values per column.
df.isna().sum()

brand            0
km_driven        0
fuel             0
owner            0
selling_price    0
dtype: int64

In [6]:
# Number of distinct brands; this is the number of dummy columns a plain one-hot encoding would create.
df['brand'].nunique()

32

In [7]:
# List the distinct brand labels.
# NOTE(review): values such as 'Land' and 'Ashok' look like truncated multi-word names
# (presumably Land Rover / Ashok Leyland) - confirm how the brand column was derived.
df['brand'].unique()

array(['Maruti', 'Skoda', 'Honda', 'Hyundai', 'Toyota', 'Ford', 'Renault',
       'Mahindra', 'Tata', 'Chevrolet', 'Fiat', 'Datsun', 'Jeep',
       'Mercedes-Benz', 'Mitsubishi', 'Audi', 'Volkswagen', 'BMW',
       'Nissan', 'Lexus', 'Jaguar', 'Land', 'MG', 'Volvo', 'Daewoo',
       'Kia', 'Force', 'Ambassador', 'Ashok', 'Isuzu', 'Opel', 'Peugeot'],
      dtype=object)

In [8]:
# Row count per brand, most frequent first; shows that many brands have only a handful of rows.
df['brand'].value_counts()

brand
Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Force               6
Land                6
Isuzu               5
Kia                 4
Ambassador          4
Daewoo              3
MG                  3
Ashok               1
Opel                1
Peugeot             1
Name: count, dtype: int64

In [9]:
# Number of distinct fuel types.
df['fuel'].nunique()

4

In [10]:
# List the distinct fuel labels.
df['fuel'].unique()

array(['Diesel', 'Petrol', 'LPG', 'CNG'], dtype=object)

In [11]:
# Row count per fuel type; CNG and LPG are rare compared with Diesel and Petrol.
df['fuel'].value_counts()

fuel
Diesel    4402
Petrol    3631
CNG         57
LPG         38
Name: count, dtype: int64

In [12]:
# Number of distinct ownership categories.
df['owner'].nunique()

5

In [13]:
# List the distinct ownership labels.
df['owner'].unique()

array(['First Owner', 'Second Owner', 'Third Owner',
       'Fourth & Above Owner', 'Test Drive Car'], dtype=object)

In [14]:
# Row count per ownership category; 'Test Drive Car' has only 5 rows, so a random split
# may leave it out of the training or the test set.
df['owner'].value_counts()

owner
First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: count, dtype: int64

## One-Hot Encoding with scikit-learn

In [15]:
# Feature matrix: every column except the target.
x = df.drop('selling_price', axis=1)
x.head()

Unnamed: 0,brand,km_driven,fuel,owner
0,Maruti,145500,Diesel,First Owner
1,Skoda,120000,Diesel,Second Owner
2,Honda,140000,Petrol,Third Owner
3,Hyundai,127000,Diesel,First Owner
4,Maruti,120000,Petrol,First Owner


In [16]:
# Target vector: the selling price the model has to predict.
y = df.loc[:, 'selling_price']
y.head()

0    450000
1    370000
2    158000
3    225000
4    130000
Name: selling_price, dtype: int64

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=45,
)

In [18]:
# Preview the training features (the original df index labels are kept by the split).
x_train.head()

Unnamed: 0,brand,km_driven,fuel,owner
5181,Renault,50000,Diesel,First Owner
2024,Tata,74000,Petrol,Second Owner
6203,Skoda,40000,Diesel,First Owner
7026,Tata,28000,Petrol,Second Owner
5837,Tata,5000,Petrol,First Owner


In [19]:
# Preview the held-out test features.
x_test.head()

Unnamed: 0,brand,km_driven,fuel,owner
1460,Maruti,25000,Petrol,First Owner
3293,Renault,40000,Petrol,First Owner
738,Renault,35000,Petrol,First Owner
1415,Volkswagen,70000,Diesel,Fourth & Above Owner
1600,Tata,100000,Diesel,First Owner


In [20]:
# Training targets; index labels match those of x_train.
y_train.head()

5181    880000
2024     90000
6203    900000
7026    380000
5837    537000
Name: selling_price, dtype: int64

In [21]:
# Test targets; index labels match those of x_test.
y_test.head()

1460    560000
3293    450000
738     300000
1415    229999
1600    850000
Name: selling_price, dtype: int64

In [22]:
from sklearn.preprocessing import OneHotEncoder

In [23]:
# drop='first' removes one level per feature so the dummy columns are not redundant;
# sparse_output=False returns a dense ndarray; dtype=np.int32 stores the indicators as 32-bit ints.
# NOTE(review): handle_unknown is left at its default, so transform() raises on a category
# that was not seen during fit - relevant for rare levels such as 'Test Drive Car' (5 rows).
ohe = OneHotEncoder(drop='first', sparse_output=False, dtype=np.int32)
ohe

In [24]:
# Learn the fuel/owner categories from the training rows only, then encode those rows.
x_train_new = ohe.fit_transform(x_train.loc[:, ['fuel', 'owner']])
x_train_new

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 1, 0, 0]])

In [25]:
# Reuse the encoder that was fitted on the training split; it is not refitted on test data.
x_test_new = ohe.transform(x_test.loc[:, ['fuel', 'owner']])
x_test_new

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 1],
       [1, 0, 0, ..., 0, 0, 1],
       [1, 0, 0, ..., 0, 0, 0]])

In [26]:
# Expect 7 columns: (4 fuel levels - 1) + (5 owner levels - 1), since drop='first' removes one level per feature.
x_train_new.shape

(6502, 7)

In [27]:
# The test matrix must have the same number of columns as the training matrix.
x_test_new.shape

(1626, 7)

In [28]:
# Columns not yet encoded; brand is still a string, so .values yields an object-dtype array.
x_train[['brand','km_driven']].values

array([['Renault', 50000],
       ['Tata', 74000],
       ['Skoda', 40000],
       ...,
       ['Hyundai', 76460],
       ['Volvo', 20000],
       ['Tata', 120000]], dtype=object)

In [29]:
# Demo: place the untouched brand/km_driven columns next to the one-hot columns.
# The result is an object array because brand is still a string, so it is not model-ready yet.
np.hstack((x_train[['brand','km_driven']].values, x_train_new))

array([['Renault', 50000, 1, ..., 0, 0, 0],
       ['Tata', 74000, 0, ..., 1, 0, 0],
       ['Skoda', 40000, 1, ..., 0, 0, 0],
       ...,
       ['Hyundai', 76460, 1, ..., 0, 0, 0],
       ['Volvo', 20000, 1, ..., 0, 0, 0],
       ['Tata', 120000, 1, ..., 1, 0, 0]], dtype=object)

In [30]:
# 2 raw columns + 7 one-hot columns = 9.
np.hstack((x_train[['brand','km_driven']].values, x_train_new)).shape

(6502, 9)

## One Hot Encoding With Top Categories

In [31]:
# Row count per brand, used below to decide which brands are too rare for their own column.
# NOTE(review): counts come from the full df, i.e. they include rows that ended up in the
# test split, so the rare-brand list is not derived from the training data alone.
counts = df['brand'].value_counts()
counts

brand
Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Force               6
Land                6
Isuzu               5
Kia                 4
Ambassador          4
Daewoo              3
MG                  3
Ashok               1
Opel                1
Peugeot             1
Name: count, dtype: int64

In [32]:
# Frequency cut-off for brands: a brand whose row count in `counts` is at or below this
# value is treated as rare.
threshold = 100

In [33]:
# Names of the brands whose frequency does not exceed the threshold.
uncommon = counts.loc[counts.le(threshold)].index

In [34]:
# Brands at or below the frequency threshold.
uncommon

Index(['Nissan', 'Jaguar', 'Volvo', 'Datsun', 'Mercedes-Benz', 'Fiat', 'Audi',
       'Lexus', 'Jeep', 'Mitsubishi', 'Force', 'Land', 'Isuzu', 'Kia',
       'Ambassador', 'Daewoo', 'MG', 'Ashok', 'Opel', 'Peugeot'],
      dtype='object', name='brand')

In [35]:
# How many brands fall at or below the threshold.
len(uncommon)

20

In [36]:
# Preview of a plain one-hot encoding of brand: one indicator column per distinct brand.
# random_state pins the sampled rows so the preview is the same on every run.
pd.get_dummies(df['brand'], dtype=int).sample(5, random_state=45)

Unnamed: 0,Ambassador,Ashok,Audi,BMW,Chevrolet,Daewoo,Datsun,Fiat,Force,Ford,...,Mitsubishi,Nissan,Opel,Peugeot,Renault,Skoda,Tata,Toyota,Volkswagen,Volvo
5379,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1931,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3721,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2661,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5046,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [37]:
# Collapse every rare brand into a single 'uncommon' label, then one-hot encode the result.
brand_train = (
    x_train['brand']
    .replace(uncommon, 'uncommon')
    .pipe(pd.get_dummies, dtype=int)
)

In [38]:
# Preview the brand indicator columns for the training rows.
brand_train.head()

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
5181,0,0,0,0,0,0,0,1,0,0,0,0,0
2024,0,0,0,0,0,0,0,0,0,1,0,0,0
6203,0,0,0,0,0,0,0,0,1,0,0,0,0
7026,0,0,0,0,0,0,0,0,0,1,0,0,0
5837,0,0,0,0,0,0,0,0,0,1,0,0,0


In [39]:
# 32 brands - 20 rare ones = 12 frequent brands, plus the 'uncommon' bucket = 13 columns.
brand_train.shape

(6502, 13)

In [40]:
# Encode the test brands with the same rare-brand grouping as the training brands.
# get_dummies only creates columns for values present in the Series it receives, so the
# result is aligned to brand_train's columns: a brand missing from the test split gets an
# all-zero column, and column order always matches the matrix the model is trained on.
# Without this, the two frames agree only when every category happens to occur in both splits.
brand_test = (
    pd.get_dummies(x_test['brand'].replace(uncommon, 'uncommon'), dtype=int)
    .reindex(columns=brand_train.columns, fill_value=0)
)
brand_test.head()

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
1460,0,0,0,0,0,0,1,0,0,0,0,0,0
3293,0,0,0,0,0,0,0,1,0,0,0,0,0
738,0,0,0,0,0,0,0,1,0,0,0,0,0
1415,0,0,0,0,0,0,0,0,0,0,0,1,0
1600,0,0,0,0,0,0,0,0,0,1,0,0,0


In [41]:
# The column count must equal brand_train's so the train and test matrices line up.
brand_test.shape

(1626, 13)

In [42]:
# Assemble the training design matrix column-wise:
# mileage, then fuel/owner indicators, then brand indicators.
x_train_encoded = np.concatenate(
    (
        x_train[['km_driven']].to_numpy(),
        x_train_new,
        brand_train.to_numpy(),
    ),
    axis=1,
)

In [43]:
# 1 (km_driven) + 7 (fuel/owner one-hot) + 13 (brand dummies) = 21 columns.
x_train_encoded.shape

(6502, 21)

In [44]:
# Assemble the test design matrix with the same column order as the training matrix:
# mileage, then fuel/owner indicators, then brand indicators.
x_test_encoded = np.concatenate(
    (
        x_test[['km_driven']].to_numpy(),
        x_test_new,
        brand_test.to_numpy(),
    ),
    axis=1,
)

In [45]:
# Must show the same number of columns as x_train_encoded, otherwise predict() will fail.
x_test_encoded.shape

(1626, 21)

#### A simpler approach: include `brand` together with `fuel` and `owner` when fitting the `OneHotEncoder`, so that `x_train_new` and `x_test_new` already contain the brand columns

In [46]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares linear regression with default settings, used as a baseline.
model = LinearRegression()

In [47]:
# Fit on the encoded training matrix only; the test split is kept for evaluation.
model.fit(x_train_encoded, y_train)

In [48]:
# Predict selling prices for the held-out test rows.
y_pred = model.predict(x_test_encoded)
y_pred

array([487556.30525973, 431677.06653636, 442132.0644478 , ...,
       307532.03943507, 383357.59456209, 738477.87821539])

In [49]:
# Actual vs. predicted price side by side, indexed like y_test.
y_test.to_frame(name='y_test').assign(y_predict=y_pred)

Unnamed: 0,y_test,y_predict
1460,560000,487556.305260
3293,450000,431677.066536
738,300000,442132.064448
1415,229999,385009.228867
1600,850000,468780.145418
...,...,...
2393,500000,537190.725070
5795,600000,799554.551733
1023,300000,307532.039435
4657,600000,383357.594562


In [50]:
from sklearn.metrics import mean_squared_error, r2_score

# Compute the MSE once and derive the RMSE from it instead of recomputing the metric.
mse = mean_squared_error(y_test, y_pred)
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))  # same units as selling_price
print('R2_Score:', r2_score(y_test, y_pred))

MSE: 288414673621.554
RMSE: 537042.5249657182
R2_Score: 0.5746046781775116
