# **Comparing model prediction using One Hot Encoding (Pandas) & Label Encoding (sklearn)! Which one has better accuracy?**

In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import OneHotEncoder

from sklearn.preprocessing import LabelEncoder

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
file_path = '/content/drive/MyDrive/Data /carprices.csv'

In [4]:
df = pd.read_csv(file_path)

# **One Hot Encoding - Dummy Variable (Pandas)**

**df_dum = dataframe with encoded variables from one hot encoding using dummy in pandas**

In [21]:
dummies = pd.get_dummies(df['Car Model'])

In [22]:
df_dum = pd.concat([df,dummies],axis='columns')
df_dum

Unnamed: 0,Car Model,Mileage,Sell Price($),Age(yrs),Audi A5,BMW X5,Mercedez Benz C class
0,BMW X5,69000,18000,6,False,True,False
1,BMW X5,35000,34000,3,False,True,False
2,BMW X5,57000,26100,5,False,True,False
3,BMW X5,22500,40000,2,False,True,False
4,BMW X5,46000,31500,4,False,True,False
5,Audi A5,59000,29400,5,True,False,False
6,Audi A5,52000,32000,5,True,False,False
7,Audi A5,72000,19300,6,True,False,False
8,Audi A5,91000,12000,8,True,False,False
9,Mercedez Benz C class,67000,22000,6,False,False,True


In [23]:
df_dum = df_dum.drop(['Car Model'],axis='columns')
df_dum

Unnamed: 0,Mileage,Sell Price($),Age(yrs),Audi A5,BMW X5,Mercedez Benz C class
0,69000,18000,6,False,True,False
1,35000,34000,3,False,True,False
2,57000,26100,5,False,True,False
3,22500,40000,2,False,True,False
4,46000,31500,4,False,True,False
5,59000,29400,5,True,False,False
6,52000,32000,5,True,False,False
7,72000,19300,6,True,False,False
8,91000,12000,8,True,False,False
9,67000,22000,6,False,False,True


In [25]:
X_dum = df_dum.drop(['Sell Price($)'], axis='columns')
Y_dum = df_dum['Sell Price($)']

In [26]:
X_train_dum, X_test_dum, Y_train_dum, Y_test_dum = train_test_split(X_dum,Y_dum,test_size=0.2)

In [27]:
model_dum = LinearRegression()
model_dum.fit(X_train_dum,Y_train_dum)

In [28]:
#predicting values
#45,000 - mileage
#Age - 4
#1 - Benz


model_dum.predict([[45000,4,0,0,1]])

array([37980.00156055])

In [29]:
#predicting values
#86,000 mileage
#Age - 7
#1 - BMW


model_dum.predict([[22000,7,0,1,0]])

array([32752.77777778])

In [30]:
#shows the accuracy of the predicting
#94% accurate
#best score is 1

model_dum.score(X_test_dum,Y_test_dum)

0.02861883100465623

# **One Hot Encoding - Sklearn**

**df_on has dataframe from endcoded variables from one hot encoding sklearn**

In [14]:
on = OneHotEncoder(sparse_output=False,drop='if_binary')

In [15]:
encoded_features = on.fit_transform(df[['Car Model']])
encoded_df = pd.DataFrame(encoded_features, columns=on.get_feature_names_out(['Car Model']))

In [16]:
df_on = pd.concat([df, encoded_df], axis=1)
df_on = df_on.drop(['Car Model'], axis=1)
df_on

Unnamed: 0,Mileage,Sell Price($),Age(yrs),Car Model_Audi A5,Car Model_BMW X5,Car Model_Mercedez Benz C class
0,69000,18000,6,0.0,1.0,0.0
1,35000,34000,3,0.0,1.0,0.0
2,57000,26100,5,0.0,1.0,0.0
3,22500,40000,2,0.0,1.0,0.0
4,46000,31500,4,0.0,1.0,0.0
5,59000,29400,5,1.0,0.0,0.0
6,52000,32000,5,1.0,0.0,0.0
7,72000,19300,6,1.0,0.0,0.0
8,91000,12000,8,1.0,0.0,0.0
9,67000,22000,6,0.0,0.0,1.0


In [17]:
X_on = df_on.drop(['Sell Price($)'], axis='columns')
Y_on = df_on['Sell Price($)']
X_train_on, X_test_on, Y_train_on, Y_test_on = train_test_split(X_on,Y_on,test_size=0.3)

In [18]:
model_on = LinearRegression()
model_on.fit(X_train_on,Y_train_on)

In [19]:
model_on.predict([[45000,4,0,0,1]])

array([37295.89428976])

In [20]:
model_on.score(X_test_on,Y_test_on)

0.98162106884773

# **Label Encoding** **- SkLearn**

**df_lb stores label encoded values**

In [31]:
lb = LabelEncoder()

In [32]:
df_lb = df.copy()
df_lb['model'] = lb.fit_transform(df['Car Model'])
df_lb = df_lb.drop(['Car Model'],axis='columns')
df_lb

Unnamed: 0,Mileage,Sell Price($),Age(yrs),model
0,69000,18000,6,1
1,35000,34000,3,1
2,57000,26100,5,1
3,22500,40000,2,1
4,46000,31500,4,1
5,59000,29400,5,0
6,52000,32000,5,0
7,72000,19300,6,0
8,91000,12000,8,0
9,67000,22000,6,2


In [None]:
X_lb = df_lb[['Mileage',"Age(yrs)",'model']].values
Y_lb = df_lb['Sell Price($)']
X_train_lb, X_test_lb, Y_train_lb, Y_test_lb = train_test_split(X_lb,Y_lb,test_size=0.2)

In [None]:
model_lb = LinearRegression()
model_lb.fit(X_train_lb,Y_train_lb)

In [None]:
#predicting Benz | 45,0000 miles | 4 yr old

model_lb.predict([[45000,4,2]])

array([32343.95102408])

In [None]:
#predicting BMW | 86000 | 7 yr old

model_lb.predict([[86000,7,1]])

array([14925.12161894])

In [None]:
model_lb.score(X_test_lb,Y_test_lb)

0.9402438434592009