In [1]:
import pandas as pd
import numpy as np


In [2]:
# Load the car listings; the relative path is resolved against the working directory.
df = pd.read_csv(filepath_or_buffer='cars.csv')

In [3]:
# Preview the first five rows (the default for head) to check the columns loaded.
df.head(n=5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [6]:
# Count the distinct fuel types. nunique is a method and must be called:
# without the parentheses the cell only echoes the bound-method repr.
df['fuel'].nunique()


4

In [17]:
# Frequency of each ownership category, most common first.
df['owner'].value_counts(ascending=False)

owner
First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: count, dtype: int64

## 1. One-Hot Encoding using Pandas

In [15]:
# Expand `fuel` and `owner` into one indicator column per category;
# the remaining columns are carried over unchanged.
pd.get_dummies(
    df,
    columns=['fuel', 'owner'],
)

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,False,True,False,False,True,False,False,False,False
1,Skoda,120000,370000,False,True,False,False,False,False,True,False,False
2,Honda,140000,158000,False,False,False,True,False,False,False,False,True
3,Hyundai,127000,225000,False,True,False,False,True,False,False,False,False
4,Maruti,120000,130000,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,False,True,True,False,False,False,False
8124,Hyundai,119000,135000,False,True,False,False,False,True,False,False,False
8125,Maruti,120000,382000,False,True,False,False,True,False,False,False,False
8126,Tata,25000,290000,False,True,False,False,True,False,False,False,False


## 2. K minus 1 One hot Encoding

In [16]:
# Same expansion, but drop the first category of each encoded column so that
# K categories become K-1 indicator columns.
pd.get_dummies(
    df,
    columns=['fuel', 'owner'],
    drop_first=True,
)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,True,False,False,False,False
8124,Hyundai,119000,135000,True,False,False,True,False,False,False
8125,Maruti,120000,382000,True,False,False,False,False,False,False
8126,Tata,25000,290000,True,False,False,False,False,False,False


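#### `pd.get_dummies` does not remember the categories it has seen.
### It can therefore produce different columns when it is run again on other data (e.g. the test set).
#### Therefore we use the scikit-learn class `OneHotEncoder`, which stores the categories learned during `fit`.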

In [21]:
# Re-display the first five rows as a reminder of the column layout before splitting.
df.head(n=5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [18]:
from sklearn.model_selection import train_test_split

In [22]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    df.iloc[:, 0:4],   # features: the first four columns
    df.iloc[:, -1],    # target: the last column
    test_size=0.2,
    random_state=0,
)

In [23]:
# Display the training feature frame produced by the split.
x_train

Unnamed: 0,brand,km_driven,fuel,owner
3042,Hyundai,60000,LPG,First Owner
1520,Tata,150000,Diesel,Third Owner
2611,Hyundai,110000,Diesel,Second Owner
3544,Mahindra,28000,Diesel,Second Owner
4138,Maruti,15000,Petrol,First Owner
...,...,...,...,...
4931,Tata,70000,Diesel,Third Owner
3264,Ford,100000,Diesel,Second Owner
1653,Hyundai,90000,Petrol,Second Owner
2607,Volkswagen,90000,Diesel,First Owner


In [24]:
from sklearn.preprocessing import OneHotEncoder

In [45]:
# Encoder that drops the first category of every feature (K-1 encoding).
HOT = OneHotEncoder(drop='first')


# OneHotEncoder (sklearn.preprocessing)

`OneHotEncoder` is used to convert **categorical features** into a **numerical format**
that machine learning models can understand.  
It creates binary (0/1) columns for each category.

---

## Basic Usage

```python
from sklearn.preprocessing import OneHotEncoder

HOT = OneHotEncoder(drop='first')
```

---

## All Parameters & What They Do

### 1️⃣ `categories`

```python
categories='auto'
```

* Determines the categories per feature.
* `'auto'` → categories are inferred from the training data.
* You can also manually define them:

```python
categories=[['Toyota', 'Honda'], ['Petrol', 'Diesel']]
```

**Why it matters:**
Ensures consistent encoding when training & testing.

---

### 2️⃣ `drop`

```python
drop=None
```

* Controls whether to drop one category per feature.
* Common values:

  * `None` → keep all categories (default)
  * `'first'` → drop the first category
  * `'if_binary'` → drop one category only if feature has 2 categories

**Why it matters:**
Used to avoid the **dummy variable trap** (multicollinearity).

---

### 3️⃣ `sparse_output` (scikit-learn ≥ 1.2; called `sparse` in older versions)

```python
sparse_output=True
```

* If `True` → returns a **sparse matrix**
* If `False` → returns a **dense NumPy array**

```python
OneHotEncoder(sparse_output=False)
```

**Why it matters:**
Dense arrays are easier to debug but use more memory.

---

### 4️⃣ `dtype`

```python
dtype=float
```

* Data type of the output array.
* Common options:

  * `float` (default)
  * `int`

```python
OneHotEncoder(dtype=int)
```

---

### 5️⃣ `handle_unknown`

```python
handle_unknown='error'
```

* What to do when **new unseen categories** appear in test data.

Options:

* `'error'` → raises an error (default)
* `'ignore'` → encodes unseen categories as all zeros

```python
OneHotEncoder(handle_unknown='ignore')
```

**Very important for real-world data.**

---

### 6️⃣ `min_frequency`

```python
min_frequency=None
```

* Groups rare categories together if they appear less than a threshold.
* Can be:

  * `int` → minimum count
  * `float` → minimum proportion

```python
OneHotEncoder(min_frequency=10)
```

---

### 7️⃣ `max_categories`

```python
max_categories=None
```

* Limits the number of output categories.
* Rare categories are grouped into `"infrequent"`.

```python
OneHotEncoder(max_categories=5)
```

---

### 8️⃣ `feature_name_combiner`

```python
feature_name_combiner='concat'
```

* Controls how output feature names are generated.

```python
feature_name_combiner=lambda feature, category: f"{feature}_{category}"
```

Used internally when calling:

```python
encoder.get_feature_names_out()
```

---

## Common Real-World Configuration

```python
OneHotEncoder(
    drop='first',
    handle_unknown='ignore',
    sparse_output=False
)
```

This setup:

* Avoids multicollinearity
* Prevents test-time crashes
* Produces readable NumPy arrays

---

## Key Takeaway

> OneHotEncoder doesn't just convert text → numbers.
> It controls **stability**, **memory usage**, and **model correctness**.

---

## Quick Reminder (Mental Model)

* Rows → samples
* Columns → categories
* More categories → more dimensions
* hstack / ColumnTransformer → combine with other features




In [46]:
# Learn the fuel/owner categories from the training rows and encode them;
# .toarray() turns the sparse result into a dense NumPy array.
x_train_new = (
    HOT
    .fit_transform(x_train[['fuel', 'owner']])
    .toarray()
)

In [47]:
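# OneHotEncoder produces a sparse matrix by default, hence the .toarray() call.
# With drop='first' it yields 7 encoded cols: 3 from fuel + 4 from owner
# (9 cols in total once brand and km_driven are stacked alongside).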

In [48]:
# Transform only (no fit): the test rows are encoded with the categories
# already learned from the training rows.
x_test_new = (
    HOT
    .transform(x_test[['fuel', 'owner']])
    .toarray()
)

In [49]:
# Place the untouched columns and the encoded columns side by side, then
# inspect the shape of the combined matrix as a sanity check.
np.hstack([
    x_train[['brand', 'km_driven']].values,
    x_train_new,
]).shape

(6502, 9)

In [50]:
# We horizontally stack (column-wise) the original features with the newly
# transformed/encoded features to create ONE final feature matrix.
# ML models expect a single NumPy array as input, not separate feature sets.
# - 'brand' and 'km_driven' are kept as-is
# - 'x_train_new' contains engineered/encoded features
# hstack adds new columns while keeping the number of rows (samples) the same.
# Checking .shape is just a sanity check to confirm features were added correctly.


In [51]:
# Combine old features + new engineered features into one matrix.
# hstack sticks arrays side-by-side (column-wise).
# Rows = same samples, columns = more features.
# Required because ML models only accept ONE feature matrix.


##  4. One Hot Encoding with Top Categories

In [54]:
# Number of listings per brand, most frequent brand first.
counts = df['brand'].value_counts(ascending=False)

In [58]:
# Brands that occur at most this many times are treated as rare.
threshold = 100

# Number of distinct brands. This must be the cell's last expression,
# otherwise the notebook computes the value but never displays it.
df['brand'].nunique()

In [59]:
# Labels of the brands whose listing count does not exceed the threshold.
repl = counts.loc[counts.le(threshold)].index

In [60]:
# Collapse every rare brand into the single label 'uncommon', then one-hot
# encode the resulting brand column.
(
    df['brand']
    .replace(repl, 'uncommon')
    .pipe(pd.get_dummies)
)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
0,False,False,False,False,False,False,True,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,True,False,False,False,False
2,False,False,False,True,False,False,False,False,False,False,False,False,False
3,False,False,False,False,True,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,False,False,False,False,True,False,False,False,False,False,False,False,False
8124,False,False,False,False,True,False,False,False,False,False,False,False,False
8125,False,False,False,False,False,False,True,False,False,False,False,False,False
8126,False,False,False,False,False,False,False,False,False,True,False,False,False
