<a href="https://colab.research.google.com/github/PosgradoMNA/actividades-de-aprendizaje-MCoronaTec/blob/main/Normalization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Normalization

In [1]:
import pandas as pd
import numpy as np

In [2]:
inPath = 'https://raw.githubusercontent.com/tec03/Datasets/main/datasets/Cartwheeldata.csv'

In [3]:
df = pd.read_csv(inPath, index_col = 0)
df.index.name = None
df

Unnamed: 0,Age,Gender,GenderGroup,Glasses,GlassesGroup,Height,Wingspan,CWDistance,Complete,CompleteGroup,Score
1,56,F,1,Y,1,62.0,61.0,79,Y,1,7
2,26,F,1,Y,1,62.0,60.0,70,Y,1,8
3,33,F,1,Y,1,66.0,64.0,85,Y,1,0
4,39,F,1,N,0,64.0,63.0,87,Y,1,10
5,27,M,2,N,0,73.0,75.0,72,N,0,4
6,24,M,2,N,0,75.0,71.0,81,N,0,3
7,28,M,2,N,0,75.0,76.0,107,Y,1,10
8,22,F,1,N,0,65.0,62.0,98,Y,1,9
9,29,M,2,Y,1,74.0,73.0,106,N,0,5
10,33,F,1,Y,1,63.0,60.0,65,Y,1,8


In [4]:
min(df.Score.to_list())

0

In [5]:
varX = df.Score.head(10)
varX

1      7
2      8
3      0
4     10
5      4
6      3
7     10
8      9
9      5
10     8
Name: Score, dtype: int64

### 1. Normalization - rescales a dataset so that each value falls between 0 and 1.

Forumula: 

$$ x_i^{new} = \cfrac{x_i^{old} - x_{min}}{x_{max} - x_{min}}$$ 

In [6]:
def normalize(x):
    min = np.min(x)
    max = np.max(x)
    range = max - min
    return [(a - min) / range for a in x]

In [7]:
X = varX.copy()
normalizedValues = normalize(X)

print(normalizedValues)

[0.7, 0.8, 0.0, 1.0, 0.4, 0.3, 1.0, 0.9, 0.5, 0.8]


In [8]:
print(min(normalizedValues))
print(max(normalizedValues))

0.0
1.0


### 1.1 Normalizing from [0, 1] to any other range

$$x_i^{new}=x_i^{old}\cdot\left(rango_{nuevo}\right)+ LimiteInferior_{nueva} $$

In [9]:
def normalize(normalizedX, nLoBound, nUpBound):
    range = nUpBound - nLoBound

    return [a * range + nLoBound for a in normalizedX]

In [10]:
normalizedX = normalizedValues
Y = normalize(normalizedX, 10, 20)

print(Y)

[17.0, 18.0, 10.0, 20.0, 14.0, 13.0, 20.0, 19.0, 15.0, 18.0]


### 1.2 Normalización de un rango a otro

$$x_i^{new}=\left(\frac{x_{i^{}}^{old}-\min\left(A\right)}{\max\left(A\right)-\min\left(A\right)}\right)\cdot\left(rango_{nuevo}\right)+LimiteInferior_{nueva}$$

In [11]:
def normalize(X, nLoBound, nUpBound):
    min = np.min(X)
    max = np.max(X)
    range = max - min
    nRange = nUpBound - nLoBound

    return [((xi - min) / range) * nRange + nLoBound for xi in X]

In [12]:
X = varX.copy()
Y = normalize(X, 100, 200)

print(Y)

[170.0, 180.0, 100.0, 200.0, 140.0, 130.0, 200.0, 190.0, 150.0, 180.0]


Podemos usar el siguiente código para aplicar una normalización min-max a cada columna en el DataFrame:

In [13]:
ndf = df.copy()
ndf.head()

Unnamed: 0,Age,Gender,GenderGroup,Glasses,GlassesGroup,Height,Wingspan,CWDistance,Complete,CompleteGroup,Score
1,56,F,1,Y,1,62.0,61.0,79,Y,1,7
2,26,F,1,Y,1,62.0,60.0,70,Y,1,8
3,33,F,1,Y,1,66.0,64.0,85,Y,1,0
4,39,F,1,N,0,64.0,63.0,87,Y,1,10
5,27,M,2,N,0,73.0,75.0,72,N,0,4


In [14]:
ndf.columns

Index(['Age', 'Gender', 'GenderGroup', 'Glasses', 'GlassesGroup', 'Height',
       'Wingspan', 'CWDistance', 'Complete', 'CompleteGroup', 'Score'],
      dtype='object')

In [15]:
ndf.drop(['Gender', 'GenderGroup', 'Glasses', 'GlassesGroup', 'Complete'], 
         axis = 1, 
         inplace = True
        )
ndf

Unnamed: 0,Age,Height,Wingspan,CWDistance,CompleteGroup,Score
1,56,62.0,61.0,79,1,7
2,26,62.0,60.0,70,1,8
3,33,66.0,64.0,85,1,0
4,39,64.0,63.0,87,1,10
5,27,73.0,75.0,72,0,4
6,24,75.0,71.0,81,0,3
7,28,75.0,76.0,107,1,10
8,22,65.0,62.0,98,1,9
9,29,74.0,73.0,106,0,5
10,33,63.0,60.0,65,1,8


In [16]:
(
    (ndf-ndf.min())/
    (ndf.max()-ndf.min()) 
).head()

Unnamed: 0,Age,Height,Wingspan,CWDistance,CompleteGroup,Score
1,1.0,0.037037,0.189189,0.307692,1.0,0.7
2,0.117647,0.037037,0.135135,0.134615,1.0,0.8
3,0.323529,0.333333,0.351351,0.423077,1.0,0.0
4,0.5,0.185185,0.297297,0.461538,1.0,1.0
5,0.147059,0.851852,0.945946,0.173077,0.0,0.4


### 1.3 Always there are alternatives....

In [17]:
from sklearn import preprocessing
#from sklearn.preprocessing import MinMaxScaler

In [18]:
X = df.Score.to_frame()

scaler = preprocessing.MinMaxScaler().fit(X)
#scaler

In [19]:
scaler.fit_transform(X)

array([[0.7],
       [0.8],
       [0. ],
       [1. ],
       [0.4],
       [0.3],
       [1. ],
       [0.9],
       [0.5],
       [0.8],
       [0.6],
       [1. ],
       [0.6],
       [0.4],
       [0.9],
       [0.6],
       [1. ],
       [0.5],
       [0.3],
       [0.8],
       [0.2],
       [0.8],
       [0.4],
       [0.5],
       [0.3]])

### 2. Standardization - rescales a dataset to have a mean of 0 and a standard deviation of 1. 

In [20]:
X = varX.copy().to_frame()

In [30]:
scaler = preprocessing.StandardScaler().fit(X)
scaler

StandardScaler()

In [31]:
X_scaledA = scaler.transform(X)
X_scaledA = pd.DataFrame(X_scaledA)
X_scaledA

-1.2212453270876723e-16


Unnamed: 0,0
0,0.191273
1,0.510061
2,-2.040245
3,1.147638
4,-0.765092
5,-1.08388
6,1.147638
7,0.82885
8,-0.446304
9,0.510061


In [33]:
print(np.mean(X_scaledA))
print(np.std(X_scaledA))

0   -1.221245e-16
dtype: float64
0    1.0
dtype: float64


In [23]:
import scipy.stats as stats
X_scaledB = stats.zscore(X)
X_scaledB

Unnamed: 0,Score
1,0.191273
2,0.510061
3,-2.040245
4,1.147638
5,-0.765092
6,-1.08388
7,1.147638
8,0.82885
9,-0.446304
10,0.510061


In [24]:
import statistics as sts

In [25]:
sd = sts.pstdev(X.Score) #pstdev - Population standard deviaiton. 
sd

3.1368774282716245

In [26]:
xm = X.Score.mean()
xm

6.4

In [27]:
X_scaledC = (X-xm)/sd
X_scaledC

Unnamed: 0,Score
1,0.191273
2,0.510061
3,-2.040245
4,1.147638
5,-0.765092
6,-1.08388
7,1.147638
8,0.82885
9,-0.446304
10,0.510061


Scaled data has zero mean and unit variance: