In [33]:
import pandas as pd 
import numpy as np 

In [34]:
# Read the car listings from 'cars.csv' (path is relative to the working directory)
# and preview the first five rows to confirm the columns loaded as expected.
df = pd.read_csv('cars.csv')
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [3]:
# How many rows fall into each ownership category (a candidate for one-hot encoding).
df['owner'].value_counts()

owner
First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: count, dtype: int64

In [4]:
# How many rows use each fuel type (a candidate for one-hot encoding).
df['fuel'].value_counts()

fuel
Diesel    4402
Petrol    3631
CNG         57
LPG         38
Name: count, dtype: int64

In [5]:
# Row count per brand; note the long tail of brands with very few rows.
df['brand'].value_counts()

brand
Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Force               6
Land                6
Isuzu               5
Kia                 4
Ambassador          4
Daewoo              3
MG                  3
Ashok               1
Opel                1
Peugeot             1
Name: count, dtype: int64

In [6]:
# Number of distinct brand labels in the data.
df['brand'].nunique()

32

In [7]:
# (rows, columns) of the loaded frame.
df.shape

(8128, 5)

## One-hot encoding using pandas

In [8]:
# Expand each listed categorical column into one 0/1 indicator column per category;
# the remaining columns are passed through unchanged.
categorical_cols = ['fuel', 'owner']
pd.get_dummies(df, columns=categorical_cols, dtype=np.int32)

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,0,1,0,0,1,0,0,0,0
1,Skoda,120000,370000,0,1,0,0,0,0,1,0,0
2,Honda,140000,158000,0,0,0,1,0,0,0,0,1
3,Hyundai,127000,225000,0,1,0,0,1,0,0,0,0
4,Maruti,120000,130000,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,0,1,1,0,0,0,0
8124,Hyundai,119000,135000,0,1,0,0,0,1,0,0,0
8125,Maruti,120000,382000,0,1,0,0,1,0,0,0,0
8126,Tata,25000,290000,0,1,0,0,1,0,0,0,0


## k-1 one-hot encoding (drop the first category of each column)

In [9]:
# Same encoding as before, but drop_first=True omits the first category of each
# column, leaving k-1 indicator columns for a column with k categories.
categorical_cols = ['fuel', 'owner']
pd.get_dummies(
    df,
    columns=categorical_cols,
    drop_first=True,
    dtype=np.int32,
)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,1,0,0,0,0,0,0
1,Skoda,120000,370000,1,0,0,0,1,0,0
2,Honda,140000,158000,0,0,1,0,0,0,1
3,Hyundai,127000,225000,1,0,0,0,0,0,0
4,Maruti,120000,130000,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,1,0,0,0,0
8124,Hyundai,119000,135000,1,0,0,1,0,0,0
8125,Maruti,120000,382000,1,0,0,0,0,0,0
8126,Tata,25000,290000,1,0,0,0,0,0,0


## OneHotEncoder using sklearn

In [10]:
from sklearn.model_selection import train_test_split

# Select the target and the features by column NAME rather than by position, so the
# split keeps working (and cannot silently pick the wrong columns) if the CSV gains,
# loses, or reorders columns.
target = df['selling_price']
features = df.drop(columns='selling_price')

# Hold out 20% of the rows for testing; random_state pins the shuffle so that
# re-running the notebook reproduces the same split.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=1,
)

In [11]:
from sklearn.preprocessing import OneHotEncoder

### Encoding `fuel` and `owner` on the train/test split

In [12]:
# Preview the training features produced by the split.
x_train.head()

Unnamed: 0,brand,km_driven,fuel,owner
1939,Honda,100000,Diesel,Second Owner
5455,Maruti,120000,Diesel,Second Owner
7079,Renault,100000,Diesel,Second Owner
2000,Honda,80000,Petrol,Second Owner
1578,Honda,56494,Petrol,First Owner


In [13]:
# Display the training target that corresponds to x_train.
y_train

1939    420000
5455    675000
7079    570000
2000    350000
1578    550000
         ...  
7935     40000
5192    580000
3980    190000
235     250000
5157    480000
Name: selling_price, Length: 6502, dtype: int64

In [14]:
from sklearn.preprocessing import OneHotEncoder

In [15]:
# drop='first' tells OneHotEncoder to omit the first category of each feature, so a feature with k categories produces k-1 columns.
# This avoids the dummy variable trap: with all k columns present, any one column can be predicted exactly from the others,
# which introduces multicollinearity in linear models such as linear or logistic regression.

In [16]:
# Encoder configured to drop the first category of every feature and to emit
# the indicator values as 32-bit integers.
ohe = OneHotEncoder(
    drop='first',
    dtype=np.int32,
)

In [17]:
# Fit the encoder on the training rows only, then convert its result to a dense array.
train_encoded = ohe.fit_transform(x_train[['fuel', 'owner']])
x_train_new = train_encoded.toarray()

In [18]:
# Apply the already-fitted encoder to the test rows (transform only, no refit),
# then convert the result to a dense array.
test_encoded = ohe.transform(x_test[['fuel', 'owner']])
x_test_new = test_encoded.toarray()

In [19]:
# The two columns that were not one-hot encoded, viewed as a NumPy array.
x_train[['brand', 'km_driven']].to_numpy()

array([['Honda', 100000],
       ['Maruti', 120000],
       ['Renault', 100000],
       ...,
       ['Hyundai', 70000],
       ['Maruti', 110000],
       ['Maruti', 65755]], dtype=object)

In [20]:
# (rows, encoded columns) of the one-hot array built from 'fuel' and 'owner'.
x_train_new.shape

(6502, 7)

In [32]:
# np.hstack ("horizontal stack") joins arrays side by side, i.e. column-wise.
# The un-encoded brand / km_driven columns go on the left and the one-hot
# columns for fuel / owner on the right.
passthrough_cols = x_train[['brand', 'km_driven']].values
combined = np.hstack((passthrough_cols, x_train_new))

In [22]:
# Plain-text dump of the stacked array.
print(combined)

[['Honda' 100000 1 ... 1 0 0]
 ['Maruti' 120000 1 ... 1 0 0]
 ['Renault' 100000 1 ... 1 0 0]
 ...
 ['Hyundai' 70000 0 ... 0 0 0]
 ['Maruti' 110000 1 ... 1 0 0]
 ['Maruti' 65755 0 ... 0 0 0]]


In [23]:
# (rows, columns) after stacking: the 2 passthrough columns plus the one-hot columns.
combined.shape

(6502, 9)

## OneHotEncoding with top categories 

In [24]:
# Brand frequencies again, as the starting point for grouping rare brands.
df['brand'].value_counts()

brand
Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Force               6
Land                6
Isuzu               5
Kia                 4
Ambassador          4
Daewoo              3
MG                  3
Ashok               1
Opel                1
Peugeot             1
Name: count, dtype: int64

In [25]:
# Keep the per-brand row counts in a variable so they can be filtered against a cutoff.
counts = df['brand'].value_counts()

In [26]:
# Number of distinct brands, i.e. how many indicator columns a plain one-hot
# encoding of 'brand' would create.
df['brand'].nunique()

32

In [27]:
# `threshold` is a frequency cutoff, NOT a limit on the number of unique brands:
# a brand that occurs `threshold` times or fewer is treated as rare.
# (The stray `df['brand'].nunique()` that used to sit here was removed; it was not the
# last expression of the cell, so its value was computed and silently discarded.)
threshold = 100


In [28]:
# Boolean mask over the per-brand counts: True where a brand occurs
# `threshold` times or fewer.
is_rare = counts <= threshold

# Keep only the labels (the index of `counts`) of those rare entries.
repl = counts[is_rare].index

In [29]:
# Inspect which brand labels fell at or below the cutoff.
repl

Index(['Nissan', 'Jaguar', 'Volvo', 'Datsun', 'Mercedes-Benz', 'Fiat', 'Audi',
       'Lexus', 'Jeep', 'Mitsubishi', 'Force', 'Land', 'Isuzu', 'Kia',
       'Ambassador', 'Daewoo', 'MG', 'Ashok', 'Opel', 'Peugeot'],
      dtype='object', name='brand')

In [30]:
# Replace every brand listed in `repl` with the single label 'uncommon',
# then one-hot encode the resulting, much smaller, set of brand labels.
brand_grouped = df['brand'].replace(repl, 'uncommon')
pd.get_dummies(brand_grouped, dtype=np.int32)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,0,0,0,0,1,0,0,0,0,0,0,0,0
8124,0,0,0,0,1,0,0,0,0,0,0,0,0
8125,0,0,0,0,0,0,1,0,0,0,0,0,0
8126,0,0,0,0,0,0,0,0,0,1,0,0,0
