In [1]:
import numpy as np
import pandas as pd

In [3]:
# Load the cars dataset; 'cars.csv' is a relative path, so it is resolved
# against the kernel's current working directory.
df = pd.read_csv('cars.csv')
# Preview the first rows to see which columns are categorical before encoding.
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


# 1. OneHotEncoding using Pandas

In [6]:
# One-hot encode the two categorical columns with pandas. Each category gets
# its own indicator column; the remaining columns are passed through as-is.
nominal_columns = ['fuel', 'owner']
pd.get_dummies(data=df, columns=nominal_columns)

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,False,True,False,False,True,False,False,False,False
1,Skoda,120000,370000,False,True,False,False,False,False,True,False,False
2,Honda,140000,158000,False,False,False,True,False,False,False,False,True
3,Hyundai,127000,225000,False,True,False,False,True,False,False,False,False
4,Maruti,120000,130000,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,False,True,True,False,False,False,False
8124,Hyundai,119000,135000,False,True,False,False,False,True,False,False,False
8125,Maruti,120000,382000,False,True,False,False,True,False,False,False,False
8126,Tata,25000,290000,False,True,False,False,True,False,False,False,False


# 2. K-1 OneHotEncoding (drop the first category of each encoded column)

In [10]:
# Same encoding as the previous section, but drop_first=True removes the first
# indicator of each encoded column, so k categories become k-1 columns
# (fuel_CNG and owner_First Owner no longer appear in the result).
pd.get_dummies(
    data=df,
    columns=['fuel', 'owner'],
    drop_first=True,
)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,True,False,False,False,False
8124,Hyundai,119000,135000,True,False,False,True,False,False,False
8125,Maruti,120000,382000,True,False,False,False,False,False,False
8126,Tata,25000,290000,True,False,False,False,False,False,False


# 3. OneHotEncoding using sklearn

In [13]:
from sklearn.model_selection import train_test_split

In [15]:
# Features are the first four columns (by position) and the target is the
# last column of the cars frame.
features = df.iloc[:, 0:4]
target = df.iloc[:, -1]

# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [17]:
from sklearn.preprocessing import OneHotEncoder

In [39]:
# drop='first' makes the encoder omit the first category of every feature, so
# a feature with k categories contributes k-1 columns (the sklearn counterpart
# of drop_first=True in pd.get_dummies).
ohe = OneHotEncoder(drop='first')

In [41]:
# Fit the encoder on the training split only, then encode it. .toarray()
# turns the encoder's sparse result into a dense NumPy array.
train_categoricals = X_train[['fuel', 'owner']]
X_train_new = ohe.fit_transform(train_categoricals).toarray()

In [43]:
# Encode the test split with the categories already learned from the training
# split (transform only, no re-fitting), then densify the result.
test_categoricals = X_test[['fuel', 'owner']]
X_test_new = ohe.transform(test_categoricals).toarray()

In [48]:
# Sanity check on the encoded training matrix: rows are training samples,
# columns are the fuel/owner indicator columns kept by the encoder.
X_train_new.shape

(6502, 7)

In [45]:
# Put the two non-encoded columns side by side with the encoded block and
# check the resulting shape (2 passthrough columns + the encoded columns).
passthrough_train = X_train[['brand', 'km_driven']].to_numpy()
np.hstack([passthrough_train, X_train_new]).shape

(6502, 9)

### Simple Example

In [57]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Minimal, self-contained demonstration of one-hot encoding a single
# categorical column with scikit-learn.

# Step 1: Create DataFrame
# A dedicated name is used instead of `df` so the cars DataFrame loaded from
# 'cars.csv' earlier in the notebook is not overwritten by this toy example.
colors_df = pd.DataFrame({'Color': ['Red', 'Blue', 'Green', 'Blue', 'Red']})
print("Original Data:")
print(colors_df)

# Step 2: Create OneHotEncoder object
# sparse_output=False makes the encoder return a dense NumPy array instead of
# a SciPy sparse matrix, so it can be handed straight to pd.DataFrame.
encoder = OneHotEncoder(sparse_output=False)

# Step 3: Fit and transform
encoded = encoder.fit_transform(colors_df[['Color']])

# Step 4: Create a new DataFrame with encoded columns
# Reuse the source index: pd.concat(axis=1) aligns rows by index label, so a
# fresh 0..n-1 index would misalign rows (and introduce NaNs) whenever the
# source frame is not indexed 0..n-1, e.g. after filtering or a train/test split.
encoded_df = pd.DataFrame(
    encoded,
    columns=encoder.get_feature_names_out(['Color']),
    index=colors_df.index,
)

# Step 5: Combine with original data
final_df = pd.concat([colors_df, encoded_df], axis=1)

print("\nOne-Hot Encoded Data:")
print(final_df)


Original Data:
   Color
0    Red
1   Blue
2  Green
3   Blue
4    Red

One-Hot Encoded Data:
   Color  Color_Blue  Color_Green  Color_Red
0    Red         0.0          0.0        1.0
1   Blue         1.0          0.0        0.0
2  Green         0.0          1.0        0.0
3   Blue         1.0          0.0        0.0
4    Red         0.0          0.0        1.0


In [59]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Demonstration of one-hot encoding several categorical columns at once and
# joining the result with the numeric columns.

# Step 1: Create DataFrame
# NOTE(review): this rebinds `df`, replacing whatever frame was previously
# bound to that name; consider a dedicated name if no later cell relies on `df`.
df = pd.DataFrame({
    'Name': ['Amit', 'Priya', 'Raj', 'Sneha'],
    'Education': ['Bachelor', 'Master', 'PhD', 'Bachelor'],
    'Gender': ['Male', 'Female', 'Male', 'Female'],
    'Experience': [3, 5, 7, 2],
    'Salary': [6.5, 9.0, 12.0, 5.5]
})

print("Original Data:")
print(df)

# Step 2: One-Hot Encode categorical columns (Education & Gender)
# The column list is defined once so the fit and the generated feature names
# cannot drift apart. sparse_output=False yields a dense NumPy array.
categorical_cols = ['Education', 'Gender']
encoder = OneHotEncoder(sparse_output=False)
encoded = encoder.fit_transform(df[categorical_cols])

# Step 3: Create a new DataFrame with encoded columns
# Reuse df's index: pd.concat(axis=1) aligns rows by index label, so a fresh
# 0..n-1 index would misalign rows (and introduce NaNs) whenever the source
# frame is not indexed 0..n-1, e.g. after filtering or a train/test split.
encoded_df = pd.DataFrame(
    encoded,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=df.index,
)

# Step 4: Combine encoded columns with numeric columns
# Only 'Experience' and 'Salary' are carried over; 'Name' and the original
# categorical columns are intentionally left out of the result.
final_df = pd.concat([df[['Experience', 'Salary']], encoded_df], axis=1)

print("\nEncoded DataFrame:")
print(final_df)


Original Data:
    Name Education  Gender  Experience  Salary
0   Amit  Bachelor    Male           3     6.5
1  Priya    Master  Female           5     9.0
2    Raj       PhD    Male           7    12.0
3  Sneha  Bachelor  Female           2     5.5

Encoded DataFrame:
   Experience  Salary  Education_Bachelor  Education_Master  Education_PhD  \
0           3     6.5                 1.0               0.0            0.0   
1           5     9.0                 0.0               1.0            0.0   
2           7    12.0                 0.0               0.0            1.0   
3           2     5.5                 1.0               0.0            0.0   

   Gender_Female  Gender_Male  
0            0.0          1.0  
1            1.0          0.0  
2            0.0          1.0  
3            1.0          0.0  
