In [1]:
# Ordinal Encoding 
# Has_covid ---> ['yes' , 'no'] ----> [0,1]

# 🔷 What is Ordinal Encoding?
Ordinal Encoding is a method of converting categorical (text) values into numbers, while keeping the order or ranking of the values.

Unlike Label Encoding (which just assigns numbers randomly), Ordinal Encoding is used when the categories have a natural order (like Low < Medium < High).

# 🧠 Why Use Ordinal Encoding?
Some categories in data have a clear order, but they are written as words.
Machine learning models cannot understand these text-based orders, so we convert them into numbers that reflect the order.

👉 This helps the model understand that some categories are higher/lower than others.

# ⚠️ When Not to Use Ordinal Encoding
Do not use Ordinal Encoding when:

The categories do not have a clear order (like Country, Color, Brand)

You are not sure about the rank of categories

In [1]:
import numpy as np 
import pandas as pd 

In [2]:
df = pd.read_csv("covid_toy.csv")

In [3]:
df.head(3) 

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No


In [4]:
df = df.dropna() 

In [5]:
df.head() 

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [6]:
df = df.drop(columns = ['age' , 'fever'])
df.head(3) 

Unnamed: 0,gender,cough,city,has_covid
0,Male,Mild,Kolkata,No
1,Male,Mild,Delhi,Yes
2,Male,Mild,Delhi,No


In [7]:
df['city'].value_counts()

city
Kolkata      29
Bangalore    28
Delhi        20
Mumbai       13
Name: count, dtype: int64

In [8]:
from sklearn.preprocessing import OrdinalEncoder 

In [13]:
oe = OrdinalEncoder(categories = [['Male','Female'],
                                 ['Mild','Strong'],
                                 ['Kolkata','Bangalore','Delhi','Mumbai'],
                                 ['No','Yes']]) 

In [14]:
df_sc = oe.fit_transform(df)

In [16]:
df_new = pd.DataFrame(df_sc , columns = df.columns) 
df_new


Unnamed: 0,gender,cough,city,has_covid
0,0.0,0.0,0.0,0.0
1,0.0,0.0,2.0,1.0
2,0.0,0.0,2.0,0.0
3,1.0,0.0,0.0,0.0
4,1.0,0.0,3.0,0.0
...,...,...,...,...
85,1.0,0.0,1.0,0.0
86,1.0,1.0,0.0,1.0
87,1.0,0.0,1.0,0.0
88,1.0,1.0,3.0,0.0


In [17]:
# (3). OneHotEncoder ------> 

# Column has_covid['yes' , 'no'] ---> new column (yes)

# 🔷 What is One-Hot Encoding?
One-Hot Encoding is a technique used to convert categorical (text) data into numbers — but instead of assigning a single number to each category, it creates separate binary columns (0 or 1) for each category.

# 🧠 Why Do We Use It?
Some categories don’t have any order — like "Red", "Green", "Blue".
If you use Label Encoding or Ordinal Encoding, the model might think:

Red > Blue > Green ❌ (which is not true)

So, One-Hot Encoding solves this problem by creating independent columns, and tells the model that:

Each category is equal and separate

There’s no rank or order

In [18]:
df = pd.read_csv("covid_toy.csv")

In [19]:
df.head(3) 

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No


In [20]:
df = df.dropna() 

In [21]:
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [22]:
from sklearn.preprocessing import OneHotEncoder 

In [23]:
ohe = OneHotEncoder(drop = 'first' , sparse_output = False , dtype = np.int32) 

In [24]:
df_new = ohe.fit_transform(df[['gender','cough','city','has_covid']]) 

In [26]:
 # df_new

# get_dummies

# 🔷 What is get_dummies()?
get_dummies() is a pandas function that automatically converts categorical (text) columns into numerical columns using One-Hot Encoding.

# 🧠 Why Do We Use It?
Machine learning models can't handle text, so we convert text like:

"Male", "Female"

"Red", "Blue", "Green"

"India", "USA", "UK"

into numbers in the form of binary columns (0 or 1).
That’s what get_dummies() does — automatically and easily.

In [27]:
df = pd.read_csv("covid_toy.csv")
df.head(3) 

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No


In [28]:
df = df.dropna() 

In [29]:
df.head(3) 

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No


In [30]:
pd.get_dummies(df , columns = ['gender','cough' , 'city' , 'has_covid'] , drop_first = True)  

Unnamed: 0,age,fever,gender_Male,cough_Strong,city_Delhi,city_Kolkata,city_Mumbai,has_covid_Yes
0,60,103.0,True,False,False,True,False,False
1,27,100.0,True,False,True,False,False,True
2,42,101.0,True,False,True,False,False,False
3,31,98.0,False,False,False,True,False,False
4,65,101.0,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...
95,12,104.0,False,False,False,False,False,False
96,51,101.0,False,True,False,True,False,True
97,20,101.0,False,False,False,False,False,False
98,5,98.0,False,True,False,False,True,False


In [None]:
df = pd.get_dummies(df , columns = ['gender','cough' , 'city' , 'has_covid'] , drop_first = True)  

In [None]:
df = df.astype(int) 

In [34]:
pd.get_dummies(df , columns = ['gender','cough' , 'city' , 'has_covid'])  

Unnamed: 0,age,fever,gender_Female,gender_Male,cough_Mild,cough_Strong,city_Bangalore,city_Delhi,city_Kolkata,city_Mumbai,has_covid_No,has_covid_Yes
0,60,103.0,0,1,1,0,0,0,1,0,1,0
1,27,100.0,0,1,1,0,0,1,0,0,0,1
2,42,101.0,0,1,1,0,0,1,0,0,1,0
3,31,98.0,1,0,1,0,0,0,1,0,1,0
4,65,101.0,1,0,1,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
95,12,104.0,1,0,1,0,1,0,0,0,1,0
96,51,101.0,1,0,0,1,0,0,1,0,0,1
97,20,101.0,1,0,1,0,1,0,0,0,1,0
98,5,98.0,1,0,0,1,0,0,0,1,1,0
