In [None]:
# Normalization ----> min_value = 0 , max_value = 1

# 🔷 What is Normalization?

Normalization is a data preprocessing technique used to rescale the values of numeric columns in a dataset to a specific range, usually 0 to 1.

It doesn’t change the shape of your data; it only compresses the values into a smaller range so that all features are comparable.

# 🧠 Why Normalization is Needed?

In machine learning, models work best when all features (columns) are on similar scales. If one feature has values in thousands and another has values in tens, the algorithm may focus more on larger numbers, even if they’re not more important.

# 📦 When Should You Use Normalization?

✅ Use Normalization when:

You are using machine learning models that are sensitive to feature scale and expect data to be bounded.

These include:

K-Nearest Neighbors (KNN)

Neural Networks

Logistic Regression

Gradient Descent-based models

Clustering algorithms like K-Means

❌ Avoid when:

You are using tree-based models like:

Decision Trees

Random Forest

XGBoost

Because these don’t care about the scale of t

# ✅ Key Benefits of Normalization

Makes training faster

Improves accuracy for some models

Prevents bias toward large values

Helps gradient-based methods (like neural nets) converge quickly

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("covid_toy.csv")

In [3]:
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [4]:
df.isnull().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [5]:
from sklearn.impute import SimpleImputer #for filling missing values

In [6]:
si = SimpleImputer()

In [7]:
df['fever'] = si.fit_transform(df[['fever']])

In [8]:
df.isnull().sum()

age          0
gender       0
fever        0
cough        0
city         0
has_covid    0
dtype: int64

In [9]:
df.head(3)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No


In [10]:
df['city'].value_counts()

city
Kolkata      32
Bangalore    30
Delhi        22
Mumbai       16
Name: count, dtype: int64

In [11]:
df['gender']=df['gender'].map({'Female':0,'Male':1})
df['cough']=df['cough'].map({'Mild':0,'Strong':1})
df['city']=df['city'].map({'Kolkata':0,'Banglore':1,'Delhi':2,'Mumbai':3})
df['has_covid']=df['has_covid'].map({'No':1,'Yes':0})


In [12]:
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,1,103.0,0,0.0,1
1,27,1,100.0,0,2.0,0
2,42,1,101.0,0,2.0,1
3,31,0,98.0,0,0.0,1
4,65,0,101.0,0,3.0,1


In [13]:
x=df.drop(columns = ['has_covid'])
y=df['has_covid']

In [14]:
from sklearn.model_selection import train_test_split #data dividation

In [15]:
x_train , x_test , y_train , y_test = train_test_split(x,y,
                                                      test_size = 0.2,
                                                      random_state = 42)

In [16]:
np.round(x_train.describe() , 2) 

Unnamed: 0,age,gender,fever,cough,city
count,80.0,80.0,80.0,80.0,53.0
mean,42.91,0.41,100.98,0.4,1.4
std,24.47,0.5,1.93,0.49,1.29
min,5.0,0.0,98.0,0.0,0.0
25%,20.0,0.0,100.0,0.0,0.0
50%,42.0,0.0,101.0,0.0,2.0
75%,65.0,1.0,102.0,1.0,3.0
max,84.0,1.0,104.0,1.0,3.0


In [18]:
from sklearn.preprocessing import MinMaxScaler  

In [19]:
mn = MinMaxScaler()

In [20]:
x_train_mn = mn.fit_transform(x_train)

In [21]:
x_train_new = pd.DataFrame(x_train_mn,columns = x_train.columns )

In [22]:
np.round(x_train_new.describe() , 2)

Unnamed: 0,age,gender,fever,cough,city
count,80.0,80.0,80.0,80.0,53.0
mean,0.48,0.41,0.5,0.4,0.47
std,0.31,0.5,0.32,0.49,0.43
min,0.0,0.0,0.0,0.0,0.0
25%,0.19,0.0,0.33,0.0,0.0
50%,0.47,0.0,0.5,0.0,0.67
75%,0.76,1.0,0.67,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0
