In [1]:
# Standardization ----> Mean = 0, Standard Deviation = 1 ---->Mean Centroid -----> Normal Distribution

# 📌 What is Standardization?

Standardization is a method to change your data so that:

The mean (average) = 0

The standard deviation = 1

# 📊 Why do we do this?

Because machine learning models work better when all the features (columns) are on a similar scale.

# 📦 When to Use Standardization?

✅ Use it when:

You're using models that are sensitive to scale, like:

Linear Regression

Logistic Regression

K-Nearest Neighbors (KNN)

Support Vector Machine (SVM)

PCA (Principal Component Analysis)

❌ Don't need it for:

Tree-based models (like Decision Tree, Random Forest)



In [2]:
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv("covid_toy.csv")

In [4]:
df.head() 

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [5]:
df.isnull().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

# Fill Missing Value

In [6]:
from sklearn.impute import SimpleImputer #for filling missing data by mean

In [7]:
si = SimpleImputer(strategy = 'mean')

In [8]:
df['fever'] = si.fit_transform(df[['fever']])

In [9]:
df.isnull().sum()

age          0
gender       0
fever        0
cough        0
city         0
has_covid    0
dtype: int64

In [10]:
df.head(3)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No


In [11]:
df['gender'].value_counts()

gender
Female    59
Male      41
Name: count, dtype: int64

In [12]:
df['cough'].value_counts()

cough
Mild      62
Strong    38
Name: count, dtype: int64

In [13]:
df['city'].value_counts()

city
Kolkata      32
Bangalore    30
Delhi        22
Mumbai       16
Name: count, dtype: int64

In [14]:
df['has_covid'].value_counts()

has_covid
No     55
Yes    45
Name: count, dtype: int64

In [15]:
df['gender']=df['gender'].map({"Female":0,"Male":1})
df['cough']=df['cough'].map({"Mild":0,"Strong":1})
df['city']=df['city'].map({"Kolkata":1,"Banglore":2,"Delhi":2,"Mumbai":3})
df['has_covid']=df['has_covid'].map({"No":0,"Yes":1})

In [16]:
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,1,103.0,0,1.0,0
1,27,1,100.0,0,2.0,1
2,42,1,101.0,0,2.0,0
3,31,0,98.0,0,1.0,0
4,65,0,101.0,0,3.0,0


In [17]:
x = df.drop(columns = ['has_covid'])
y=df['has_covid']

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
x_train, x_test, y_train, y_test = train_test_split(x,
                                                   y,
                                                   test_size = 0.2,
                                                   random_state = 42)

In [20]:
np.round(x_train.describe(), 2)

Unnamed: 0,age,gender,fever,cough,city
count,80.0,80.0,80.0,80.0,53.0
mean,42.91,0.41,100.98,0.4,1.83
std,24.47,0.5,1.93,0.49,0.83
min,5.0,0.0,98.0,0.0,1.0
25%,20.0,0.0,100.0,0.0,1.0
50%,42.0,0.0,101.0,0.0,2.0
75%,65.0,1.0,102.0,1.0,3.0
max,84.0,1.0,104.0,1.0,3.0


In [62]:
from sklearn.preprocessing import StandardScaler #for 0,1

In [63]:
sc = StandardScaler()

In [64]:
x_train_sc = sc.fit_transform(x_train) ###fit means learn the parameters and transform means apply on the data.

In [65]:
x_train_new = pd.DataFrame(x_train_sc, columns = x_train.columns)

In [66]:
np.round(x_train_new.describe(), 2)

Unnamed: 0,age,gender,fever,cough,city
count,80.0,80.0,80.0,80.0,53.0
mean,-0.0,0.0,-0.0,-0.0,-0.0
std,1.01,1.01,1.01,1.01,1.01
min,-1.56,-0.84,-1.56,-0.82,-1.01
25%,-0.94,-0.84,-0.51,-0.82,-1.01
50%,-0.04,-0.84,0.01,-0.82,0.21
75%,0.91,1.19,0.53,1.22,1.43
max,1.69,1.19,1.57,1.22,1.43
