# Normalization methods

This notebook is intended to provide a brief introduction to normalization methods (minmax, z-score/standard score, mean normalization).

Normalization methods are commonly used to compare different datasets: rescaling them to similar units makes the comparison easier.

## Minmax

### Definition
This is a type of scaling in which you rescale the dataset so that it lies within given limits (a minimum and a maximum).

### Formula / Procedure to find it

The default equation to perform this scaling (0,1) is as follows: $\displaystyle N_{i} = \frac{x_{i}-x_{min}}{x_{max}-x_{min}}$ <br>
Another variation adds the target limits to the equation so that the final dataset is scaled to the desired range $[N_{min}, N_{max}]$: $\displaystyle N_{i} = \frac{x_{i}-x_{min}}{x_{max}-x_{min}}\,(N_{max}-N_{min})+N_{min}$

### Visualization

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots

### Creating the data set

In [None]:
# Sample used throughout the notebook: 100 integer values between 0 and 10.
data_set = [5,  2,  6, 10,  9,  3,  1,  9,  6,  2,  1,  4,  6,  5,  0,  4,  6,
            5,  4,  6,  3,  0,  2,  8,  6,  7,  8,  0,  4,  7,  0,  1,  0,  8,
            2,  0, 10,  2,  6,  6,  0,  5,  2,  0, 10,  3,  9,  8,  4,  7,  6,
            1, 10,  7, 10,  3,  0,  6,  5,  8,  4,  3,  7,  3,  1,  5,  3,  0,
            3,  3,  1,  2,  1,  5,  0,  5,  8,  1, 10,  7,  8,  6,  9,  3,  3,
            7,  3,  4, 10,  8,  2,  0,  0,  2,  9,  0,  5,  5,  5,  6]

All normalization methods can be visualized in 2D plots (Scatter) or boxplots.

In [None]:
def minmax_single(x, d_min, d_max, n_min, n_max):
    """Rescale one value from the data range to the target range.

    Args:
        x: Value to rescale.
        d_min: Minimum of the source data.
        d_max: Maximum of the source data.
        n_min: Lower limit of the target range.
        n_max: Upper limit of the target range.

    Returns:
        ``(x - d_min) / (d_max - d_min) * (n_max - n_min) + n_min``.
        When the source range is empty (``d_max == d_min``) the value is
        mapped to ``n_min`` instead of dividing by zero.
    """
    d_range = d_max - d_min
    if d_range == 0:
        # Constant data carries no spread to rescale; multiplying by 0.0 keeps
        # the float type (and array shape, if any) of the regular result.
        return (x - d_min) * 0.0 + n_min
    return (x - d_min) / d_range * (n_max - n_min) + n_min

def minmax(data, n_min=0, n_max=1):
    """Min-max scale ``data`` into the range ``[n_min, n_max]``.

    Args:
        data: Sequence (or array) of numbers.
        n_min: Lower limit of the target range (default 0).
        n_max: Upper limit of the target range (default 1).

    Returns:
        A float NumPy array with the same shape as ``data``. Empty input gives
        an empty array; constant input (max == min) maps every value to
        ``n_min`` instead of producing NaN.
    """
    values = np.asarray(data, dtype=float)
    if values.size == 0:
        return values.copy()

    d_min = values.min()
    d_range = values.max() - d_min
    if d_range == 0:
        # No spread to rescale: avoid 0/0 and pin everything to the lower limit.
        return np.full_like(values, n_min)

    # One vectorized expression replaces the per-element Python loop.
    return (values - d_min) / d_range * (n_max - n_min) + n_min

In [None]:
# Scale the sample with the default target range (n_min=0, n_max=1).
minmax_data_set = minmax(data_set)

In [None]:
# Plot the raw sample in index order, markers joined by lines.
raw_trace = go.Scatter(y=data_set, mode='markers+lines')
fig = go.Figure(data=raw_trace)
fig.update_layout(
    title_text=f"Distribution of data set",
    width=1000,
    height=600,
)
fig.show()

In [None]:
# Side-by-side comparison: original sample (left) vs. min-max scaled sample (right).
fig = make_subplots(rows=1, cols=2)

# The horizontal guide lines span the whole sample. The last index is derived
# from the data (instead of a hard-coded 99) so the plot stays correct if
# data_set changes length.
x_span = [0, len(data_set) - 1]

panels = [
    (1, "original", data_set),
    (2, "minmax", minmax_data_set),
]
for col, label, values in panels:
    fig.add_trace(go.Scatter(y=values, name=f"{label} data_set"), row=1, col=col)
    # One flat line per statistic, drawn on the same subplot as the data.
    for stat_name, stat_value in (("min", np.min(values)), ("max", np.max(values))):
        fig.add_trace(
            go.Scatter(x=x_span, y=[stat_value, stat_value], name=f"{label} {stat_name}"),
            row=1, col=col,
        )

fig.update_layout(height=600, width=1000, title_text="Compare original vs minmax dataset")
fig.show()

In [None]:
# Horizontal boxplots of the original and min-max scaled samples, stacked
# vertically, with red marks at each sample's minimum and maximum.
plt.figure(figsize=(16, 8))

panels = [
    (1, data_set, "data set"),
    (2, minmax_data_set, "minmax data set"),
]
for position, values, label in panels:
    low, high = np.min(values), np.max(values)

    plt.subplot(2, 1, position)
    # The second positional slot of boxplot is `notch`; pass it by keyword so
    # the notched box is explicit rather than a side effect of a truthy string.
    plt.boxplot(values, notch=True, vert=False)
    plt.vlines(x=[low, high], ymin=0.9, ymax=1.1, colors='r')
    plt.ylim([0.9, 1.1])
    # Each title names the sample it actually shows.
    plt.title(f"Boxplot of {label} - Min: {low} - Max: {high}")

plt.show()

## Z-Score / Standard Score

### Definition
Z-score normalization isn't just a scaling method. It also transforms the distribution of the data points.
That is due to the inclusion of the Mean and the Standard Deviation in the equation. <br>
A key feature of the z-score is that the final dataset has a Mean equal to 0 and a Standard Deviation of 1. This is particularly useful for machine learning applications.

### Formula / Procedure to find it

To obtain the z-score normalization you must perform the next equation: $\displaystyle Z_{i} = \frac{x_{i} - \mu}{\sigma}$ <br>
Being $\mu$: Mean <br>
&emsp;&emsp;&emsp;$\sigma$: Standard Deviation

### Creating the data set

We will be using the same dataset (previously created).

In [None]:
def zscore_single(x, d_mean, d_std):
    """Return the z-score of one value.

    Args:
        x: Value to standardize.
        d_mean: Mean of the source data.
        d_std: Standard deviation of the source data.

    Returns:
        ``(x - d_mean) / d_std``. When ``d_std`` is zero (constant data) the
        result is 0.0 instead of a division by zero.
    """
    deviation = x - d_mean
    if d_std == 0:
        # No spread: every value sits on the mean. Multiplying by 0.0 keeps the
        # float type (and array shape, if any) of the regular result.
        return deviation * 0.0
    return deviation / d_std

def zscore(data):
    """Standardize ``data`` to zero mean and unit standard deviation.

    Uses the population standard deviation (NumPy's default, ``ddof=0``).

    Args:
        data: Sequence (or array) of numbers.

    Returns:
        A float NumPy array with the same shape as ``data``. Empty input gives
        an empty array; constant input (zero standard deviation) gives all
        zeros instead of NaN.
    """
    values = np.asarray(data, dtype=float)
    if values.size == 0:
        return values.copy()

    centered = values - values.mean()
    d_std = values.std()
    if d_std == 0:
        # Constant data: every point equals the mean, so every z-score is 0.
        return np.zeros_like(values)

    # One vectorized expression replaces the per-element Python loop.
    return centered / d_std

In [None]:
# Standardize the sample using its own mean and standard deviation.
zscore_data_set = zscore(data_set)

In [None]:
# Show the untransformed sample again as a reference for the z-score plots.
sample_trace = go.Scatter(y=data_set, mode='markers+lines')
fig = go.Figure(data=sample_trace)
fig.update_layout(
    title_text=f"Distribution of data set",
    width=1000,
    height=600,
)
fig.show()

In [None]:
# Side-by-side comparison: original sample (left) vs. z-score sample (right).
fig = make_subplots(rows=1, cols=2)

# The horizontal guide lines span the whole sample. The last index is derived
# from the data (instead of a hard-coded 99) so the plot stays correct if
# data_set changes length.
x_span = [0, len(data_set) - 1]

stat_funcs = {"min": np.min, "max": np.max, "mean": np.mean}
panels = [
    (1, "original", data_set, ("min", "max")),
    # The mean line is only drawn for the transformed data.
    (2, "z-score", zscore_data_set, ("min", "max", "mean")),
]
for col, label, values, stat_names in panels:
    fig.add_trace(go.Scatter(y=values, name=f"{label} data_set"), row=1, col=col)
    for stat_name in stat_names:
        stat_value = stat_funcs[stat_name](values)
        fig.add_trace(
            go.Scatter(x=x_span, y=[stat_value, stat_value], name=f"{label} {stat_name}"),
            row=1, col=col,
        )

# Round the standard deviation too, so the title does not show float noise.
z_mean = np.round(np.mean(zscore_data_set), 1)
z_std = np.round(np.std(zscore_data_set), 2)
fig.update_layout(height=600, width=1000,
                  title_text=f"Z-score (Right): Mean: {z_mean}: Std: {z_std}")
fig.show()

Note: Z-score makes the concepts of MEAN and Standard Deviation easier to observe. All the data points are centered around zero (the mean), and one unit on the vertical axis corresponds to one standard deviation.

In [None]:
# Distribution (histogram + KDE) of the original and z-score samples, with the
# mean (red) and median (green) marked on each panel.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# sns.histplot(values, kde=True, stat="density") once the installed version is confirmed.
plt.figure(figsize=(16, 8))

panels = [
    # (subplot position, data, top of the y axis, title)
    (1, data_set, 0.12, "Distribution of data set"),
    (2, zscore_data_set, 0.35, "Distribution of z-score data set"),
]
for position, values, y_top, title in panels:
    plt.subplot(2, 1, position)
    # Every artist carries its own label. plt.legend([...]) with bare strings
    # assigns them by drawing order, which put "Mean"/"Median" on histogram bars.
    sns.distplot(values, label="Distribution")
    plt.vlines(x=np.mean(values), ymin=0, ymax=y_top, colors='r', label="Mean")
    plt.vlines(x=np.median(values), ymin=0, ymax=y_top, colors='g', label="Median")
    plt.ylim([0, y_top])
    plt.legend()
    plt.title(title)

plt.show()

## Mean normalization

### Definition
Mean normalization centers the dataset on its mean and then divides by its range (maximum minus minimum). The resulting values have a mean of 0 and span a total range of 1, so they always lie between -1 and 1. It is closely related to the z-score: the z-score divides by the standard deviation (the square root of the variance, which informally measures how far a set of numbers is spread out from their average value), while mean normalization divides by the range instead.

### Formula / Procedure to find it

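To obtain the mean normalization you must apply the following equation: $\displaystyle N_{i} = \frac{x_{i} - \mu}{x_{max} - x_{min}}$, where $\mu$ is the Mean. <br>
(For comparison, the variance behind the z-score's standard deviation is $\displaystyle \sigma^2 = \frac{\sum\mid x - \bar{x}\mid^2}{n}$.)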

We will be using the same data set (previously created).<br>

In [None]:
def mean_norm_single(x, d_mean, d_min, d_max):
    """Mean-normalize one value.

    Args:
        x: Value to normalize.
        d_mean: Mean of the source data.
        d_min: Minimum of the source data.
        d_max: Maximum of the source data.

    Returns:
        ``(x - d_mean) / (d_max - d_min)``. When the source range is empty
        (``d_max == d_min``) the result is 0.0 instead of a division by zero.
    """
    deviation = x - d_mean
    d_range = d_max - d_min
    if d_range == 0:
        # Constant data: nothing to scale by. Multiplying by 0.0 keeps the
        # float type (and array shape, if any) of the regular result.
        return deviation * 0.0
    return deviation / d_range

def mean_norm(data):
    """Mean-normalize ``data``: subtract the mean, divide by the range.

    Args:
        data: Sequence (or array) of numbers.

    Returns:
        A float NumPy array with the same shape as ``data``. Empty input gives
        an empty array; constant input (max == min) gives all zeros instead of
        NaN.
    """
    values = np.asarray(data, dtype=float)
    if values.size == 0:
        return values.copy()

    d_range = values.max() - values.min()
    if d_range == 0:
        # Constant data: every point equals the mean, so the result is all 0.
        return np.zeros_like(values)

    # One vectorized expression replaces the per-element Python loop.
    return (values - values.mean()) / d_range

In [None]:
# Center the sample on its mean and divide by its range (max - min).
mean_data_set = mean_norm(data_set)

In [None]:
# Show the untransformed sample again as a reference for the mean-norm plots.
reference_trace = go.Scatter(y=data_set, mode='markers+lines')
fig = go.Figure(data=reference_trace)
fig.update_layout(
    title_text=f"Distribution of data set",
    width=1000,
    height=600,
)
fig.show()

In [None]:
# Side-by-side comparison: original sample (left) vs. mean-normalized sample (right).
fig = make_subplots(rows=1, cols=2)

# The horizontal guide lines span the whole sample. The last index is derived
# from the data (instead of a hard-coded 99) so the plot stays correct if
# data_set changes length.
x_span = [0, len(data_set) - 1]

stat_funcs = {"min": np.min, "max": np.max, "mean": np.mean}
panels = [
    (1, "original", data_set, ("min", "max")),
    # The mean line is only drawn for the transformed data.
    (2, "mean", mean_data_set, ("min", "max", "mean")),
]
for col, label, values, stat_names in panels:
    fig.add_trace(go.Scatter(y=values, name=f"{label} data_set"), row=1, col=col)
    for stat_name in stat_names:
        stat_value = stat_funcs[stat_name](values)
        fig.add_trace(
            go.Scatter(x=x_span, y=[stat_value, stat_value], name=f"{label} {stat_name}"),
            row=1, col=col,
        )

# Round the standard deviation too, so the title does not show float noise.
norm_mean = np.round(np.mean(mean_data_set), 1)
norm_std = np.round(np.std(mean_data_set), 2)
fig.update_layout(height=600, width=1000,
                  title_text=f"Mean Norm (Right): Mean: {norm_mean}: Std: {norm_std}")
fig.show()

In [None]:
# Distribution (histogram + KDE) of the original and mean-normalized samples,
# with the mean (red) and median (green) marked on each panel.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# sns.histplot(values, kde=True, stat="density") once the installed version is confirmed.
plt.figure(figsize=(16, 8))

panels = [
    # (subplot position, data, top of the y axis, title)
    (1, data_set, 0.12, "Distribution of data set"),
    (2, mean_data_set, 1.6, "Distribution of mean data set"),
]
for position, values, y_top, title in panels:
    plt.subplot(2, 1, position)
    # Every artist carries its own label. plt.legend([...]) with bare strings
    # assigns them by drawing order, which put "Mean"/"Median" on histogram bars.
    sns.distplot(values, label="Distribution")
    plt.vlines(x=np.mean(values), ymin=0, ymax=y_top, colors='r', label="Mean")
    plt.vlines(x=np.median(values), ymin=0, ymax=y_top, colors='g', label="Median")
    plt.ylim([0, y_top])
    plt.legend()
    plt.title(title)

plt.show()

In the case of mean norm, the resulting plot for distribution of probabilities ends up different. This may be due to the number of bins but looking at the actual distribution area they are actually the same.

## Exercise

You can test what you have learned about normalization methods below:

In [None]:
class test:
    """Interactive numeric quiz built from question/answer pairs.

    Questions and their expected answers are kept in two parallel lists.
    ``test()`` prompts for every question through ``input()``, grades the
    replies and shows the resulting score as a pie chart.

    The lowercase class name is kept because existing cells instantiate it
    as ``test()``.
    """

    def __init__(self):
        self.questions = []
        self.answers = []
        self.correct_answers = 0  # number of correct replies in the last run
        self.score = 0  # percentage (0-100) obtained in the last run

    def add_element(self, q, a):
        """Append question ``q`` together with its expected answer ``a``."""
        self.questions.append(q)
        self.answers.append(a)

    def remove_element(self, index):
        """Remove the question/answer pair at zero-based position ``index``."""
        self.questions.pop(index)
        self.answers.pop(index)

    def show_answer(self, index):
        """Print question number ``index`` (one-based) with its answer.

        Raises:
            IndexError: If ``index`` is not between 1 and the number of
                questions. Without this check, 0 or a negative number would
                silently wrap around to a question counted from the end.
        """
        if not 1 <= index <= len(self.questions):
            raise IndexError(
                f"question number must be between 1 and {len(self.questions)}, got {index}"
            )
        print(f"Q{index}: {self.questions[index-1]} - Ans_{index}: {self.answers[index-1]}")

    def show_answers(self):
        """Print every question with its answer, numbered from 1."""
        for number, (q, a) in enumerate(zip(self.questions, self.answers), start=1):
            print(f"Q{number}: {q} - Ans_{number}: {a}")

    def build_from_csv(self, filename):
        """Append the pairs found in a CSV file.

        Args:
            filename: Anything ``pandas.read_csv`` accepts (path, URL, buffer).
                The file must provide ``Questions`` and ``Answers`` columns.
        """
        df = pd.read_csv(filename)
        for q, a in zip(df['Questions'], df['Answers']):
            self.add_element(q, a)

    def visualize_score(self):
        """Show the current score as a correct/incorrect donut chart."""
        fig = go.Figure(data=[go.Pie(labels=["Correct", "Incorrect"],
                                     values=[self.score, 100-self.score],
                                     marker_colors=['rgb(10,100,10)', 'rgb(230,70,70)'],
                                     hole=.3)])
        fig.show()

    def test(self):
        """Run the quiz interactively and return the score.

        Each question is asked until a numeric reply is given; the reply is
        rounded to two decimals and compared with the stored answer.

        Returns:
            The score as a percentage (0-100); 0.0 when there are no
            questions. The value is also stored in ``self.score``.
        """
        self.correct_answers = 0
        for number, (q, a) in enumerate(zip(self.questions, self.answers), start=1):
            while True:
                reply = input(f"Q{number}: " + q)
                if not reply:
                    # Empty reply: ask the same question again.
                    continue
                try:
                    current_answer = np.round(float(reply), 2)
                except ValueError:
                    # A non-numeric reply must not abort the whole quiz.
                    print("Please enter a number.")
                    continue
                break

            is_correct = bool(current_answer == a)
            self.correct_answers += int(is_correct)
            print("Correct" if is_correct else "Incorrect")

        # Guard against an empty quiz so the score never divides by zero.
        total = len(self.questions)
        self.score = 100 * self.correct_answers / total if total else 0.0

        print(f"Your score: {self.score}")
        self.visualize_score()
        # Returned so that callers such as `score = exam.test()` get the score.
        return self.score

In [None]:
# Create an empty quiz and load its question/answer pairs from the hosted CSV
# (the file must provide 'Questions' and 'Answers' columns).
exam = test()
exam.build_from_csv("https://raw.githubusercontent.com/Ricardo-DG/data_analytics_training/main/normalization_test.csv")

In [None]:
# If you would like to see the answers uncomment and run the following line

# exam.show_answers()

In [None]:
# If you would like to see a specific answer uncomment and run the following line
# (make sure to replace "index" with the number of the question you want to know the answer).

# exam.show_answer(index)

In [None]:
# Run the interactive quiz; replies are read from the keyboard via input().
score = exam.test()