In [19]:
# Note: This notebook was created by Guntaas Kapoor (guntaaskapoor.bt23cseds@pec.edu.in)

## 1. Importing the necessary libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.preprocessing import MinMaxScaler, StandardScaler

## 2. Loading the dataset

In [2]:
# Fetch the Wine dataset via scikit-learn's loader and wrap its feature matrix
# in a DataFrame whose columns carry the feature names.
data = load_wine()
df = pd.DataFrame(data=data["data"], columns=data["feature_names"])

In [3]:
# Preview the first five rows of the raw (unscaled) feature table.
df.head(n=5)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


## 3. Normalization (Min-Max Scaling)

In [10]:
# Normalization (Min-Max Scaling): rescales each feature (column) independently to the 0-1 range.
# new_value = (old_value - min) / (max - min), where min and max are the minimum and maximum
# of that feature's own column (not of the dataset as a whole).

minmax_scaler = MinMaxScaler()
df_normalized = pd.DataFrame(
    minmax_scaler.fit_transform(df),  # fit_transform returns a bare array, so labels are re-attached below
    columns=df.columns,
    index=df.index,  # keep the source row labels so the result stays aligned with df
)

In [11]:
# Preview the first five rows after Min-Max scaling.
df_normalized.head(n=5)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,0.842105,0.1917,0.572193,0.257732,0.619565,0.627586,0.57384,0.283019,0.59306,0.372014,0.455285,0.970696,0.561341
1,0.571053,0.205534,0.417112,0.030928,0.326087,0.575862,0.510549,0.245283,0.274448,0.264505,0.463415,0.78022,0.550642
2,0.560526,0.320158,0.700535,0.412371,0.336957,0.627586,0.611814,0.320755,0.757098,0.375427,0.447154,0.695971,0.646933
3,0.878947,0.23913,0.609626,0.319588,0.467391,0.989655,0.664557,0.207547,0.55836,0.556314,0.308943,0.798535,0.857347
4,0.581579,0.365613,0.807487,0.536082,0.521739,0.627586,0.495781,0.490566,0.444795,0.259386,0.455285,0.608059,0.325963


## 4. Standardization (Z-Score Scaling)

In [13]:
# Standardization (Z-score Scaling): rescales each feature (column) independently to mean 0 and standard deviation 1.
# new_value = (old_value - mean) / (standard deviation), where mean and standard deviation are computed
# per feature column. StandardScaler uses the population standard deviation (ddof=0).

standard_scaler = StandardScaler()
df_standardized = pd.DataFrame(
    standard_scaler.fit_transform(df),  # fit_transform returns a bare array, so labels are re-attached below
    columns=df.columns,
    index=df.index,  # keep the source row labels so the result stays aligned with df
)

In [9]:
# Preview the first five rows after Z-score standardization.
df_standardized.head(n=5)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,1.518613,-0.56225,0.232053,-1.169593,1.913905,0.808997,1.034819,-0.659563,1.224884,0.251717,0.362177,1.84792,1.013009
1,0.24629,-0.499413,-0.827996,-2.490847,0.018145,0.568648,0.733629,-0.820719,-0.544721,-0.293321,0.406051,1.113449,0.965242
2,0.196879,0.021231,1.109334,-0.268738,0.088358,0.808997,1.215533,-0.498407,2.135968,0.26902,0.318304,0.788587,1.395148
3,1.69155,-0.346811,0.487926,-0.809251,0.930918,2.491446,1.466525,-0.981875,1.032155,1.186068,-0.427544,1.184071,2.334574
4,0.2957,0.227694,1.840403,0.451946,1.281985,0.808997,0.663351,0.226796,0.401404,-0.319276,0.362177,0.449601,-0.037874


## 5. Log Transformation

In [17]:
# Log transformation to stabilize variance.
# log1p evaluates log(1 + x) element-wise, so a value of 0 maps to 0 rather than hitting log(0).

df_log_transformed = df.pipe(np.log1p)  # df.pipe(f) is simply f(df)

In [18]:
# Preview the first five rows after the log1p transformation.
df_log_transformed.head(n=5)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,2.723267,0.996949,1.23256,2.809403,4.85203,1.335001,1.401183,0.24686,1.190888,1.893112,0.71295,1.593309,6.971669
1,2.653242,1.022451,1.144223,2.501436,4.615121,1.294727,1.324419,0.231112,0.824175,1.682688,0.71784,1.481605,6.957497
2,2.650421,1.211941,1.300192,2.97553,4.624973,1.335001,1.444563,0.262364,1.337629,1.899118,0.708036,1.427916,7.078342
3,2.732418,1.081805,1.252763,2.879198,4.736198,1.578979,1.501853,0.215111,1.156881,2.174752,0.620576,1.492904,7.300473
4,2.656055,1.278152,1.353255,3.091042,4.779123,1.335001,1.305626,0.329304,1.036737,1.671473,0.71295,1.368639,6.60123


# When to use which?

Normalization (Min-Max Scaling) ->
    Use when features have different scales but no strong outliers.
    Best for algorithms like KNN, Neural Networks, and Distance-based models (K-Means, SVM).
    
Standardization (Z-score Scaling) ->
    Use when data has different units or a Gaussian-like distribution.
    Preferred for models like Linear Regression, Logistic Regression, PCA, and SVM.
    
Log Transformation ->
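    Use when data is skewed or has high variance (for example, values spanning several orders of magnitude). Note that log1p is only defined for values greater than -1.
    Helps with right-skewed distributions in Linear Regression and Clustering. Decision Trees split on the ordering of feature values, so a monotonic log transform of a feature leaves their splits unchanged; for trees it mainly helps when applied to a skewed target.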