## Transformaciones de datos

In [1]:
import pandas as pd
import numpy as np

In [2]:
inPath = 'https://raw.githubusercontent.com/tec03/Datasets/main/datasets/Cartwheeldata.csv'

In [4]:
# Load the dataset using the CSV's first column as the row index,
# then clear the index name in the same expression.
df = pd.read_csv(inPath, index_col=0).rename_axis(None)
df

Unnamed: 0,Age,Gender,GenderGroup,Glasses,GlassesGroup,Height,Wingspan,CWDistance,Complete,CompleteGroup,Score
1,56,F,1,Y,1,62.0,61.0,79,Y,1,7
2,26,F,1,Y,1,62.0,60.0,70,Y,1,8
3,33,F,1,Y,1,66.0,64.0,85,Y,1,0
4,39,F,1,N,0,64.0,63.0,87,Y,1,10
5,27,M,2,N,0,73.0,75.0,72,N,0,4
6,24,M,2,N,0,75.0,71.0,81,N,0,3
7,28,M,2,N,0,75.0,76.0,107,Y,1,10
8,22,F,1,N,0,65.0,62.0,98,Y,1,9
9,29,M,2,Y,1,74.0,73.0,106,N,0,5
10,33,F,1,Y,1,63.0,60.0,65,Y,1,8


In [7]:
df.Score.min()

0

In [8]:
df.Score.max()

10

In [9]:
varX = df.Score.head(10)
varX

1      7
2      8
3      0
4     10
5      4
6      3
7     10
8      9
9      5
10     8
Name: Score, dtype: int64

## 1. Normalización mín.-máx.

La normalización mín-máx es una de las formas populares de normalizar datos. Para cada conjunto de datos que se va a normalizar:

* El valor `mínimo` de ese conjunto de datos se transforma en 0, y
* El valor `máximo` se transforma en 1, y
* Todos los demás valores se transforman en un decimal `entre 0 y 1`
* En la mayoría de las situaciones, los datos se normalizan para ajustarse a un rango objetivo de [0, 1]
 $$ x_i^{new}=\frac{x_i^{old}-\min\left(A\right)}{\max\left(A\right)-\min\left(A\right)} $$

Esto puede ser útil cuando:

* Comparación de datos de dos escalas diferentes
* Conversión de datos a una nueva escala

Each value stored in `varX` will be converted to a value in `[0, 1]`.

In [43]:
def normalizeZeroOne(x):
    """Min-max normalize the values of ``x`` into the interval [0, 1].

    The smallest value maps to 0.0, the largest to 1.0, and every other
    value to ``(value - min) / (max - min)``.

    Parameters
    ----------
    x : array-like of numbers
        Values to normalize (list, NumPy array, pandas Series, ...).

    Returns
    -------
    list of float
        Normalized values as plain Python floats, in the input order.
        An empty input returns ``[]``. When all values are equal the range
        is zero, so every value maps to 0.0 instead of dividing by zero.
    """
    values = np.asarray(x, dtype=float)
    if values.size == 0:
        # np.min / np.max raise on empty input; there is nothing to scale.
        return []

    lowest = values.min()
    span = values.max() - lowest
    if span == 0:
        # Constant input: avoid 0/0 (NaN plus a RuntimeWarning).
        return np.zeros_like(values).tolist()

    # Vectorized (value - min) / (max - min); tolist() yields Python floats.
    return ((values - lowest) / span).tolist()

In [44]:
X = varX.copy()
normalizedValues = normalizeZeroOne(X)

print(normalizedValues)

[0.7, 0.8, 0.0, 1.0, 0.4, 0.3, 1.0, 0.9, 0.5, 0.8]


In [45]:
print(min(normalizedValues))
print(max(normalizedValues))

0.0
1.0


## 2. Normalizing from [0, 1] to any other range

Formula:

$$x_i^{new}=x_i^{old}\cdot\left(rango_{nuevo}\right)+ LimiteInferior_{nueva} $$

In [50]:
def normalizeNewRange(normalizedX, nLoBound, nUpBound):
    """Rescale values from [0, 1] to the range [nLoBound, nUpBound].

    Each element ``a`` becomes ``a * (nUpBound - nLoBound) + nLoBound``.
    The inputs are not checked to actually lie in [0, 1].

    Parameters
    ----------
    normalizedX : iterable of numbers
        Values to rescale.
    nLoBound : number
        Lower bound of the target range.
    nUpBound : number
        Upper bound of the target range.

    Returns
    -------
    list
        Rescaled values, in the input order.
    """
    width = nUpBound - nLoBound
    rescaled = []
    for value in normalizedX:
        rescaled.append(value * width + nLoBound)
    return rescaled

In [54]:
normalizedX = normalizedValues.copy()

Y = normalizeNewRange(normalizedX, 
              10, 20 #new range we are interested in
             )
print(Y)

[17.0, 18.0, 10.0, 20.0, 14.0, 13.0, 20.0, 19.0, 15.0, 18.0]


Here, all values of `normalizedX`, which lie in [0, 1], were rescaled to the new range [10, 20].

## 3. Normalización de un rango a otro

$$x_i^{new}=\left(\frac{x_i^{old}-\min\left(A\right)}{\max\left(A\right)-\min\left(A\right)}\right)\cdot\left(rango_{nuevo}\right)+LimiteInferior_{nueva}$$

In [55]:
def normalizeNewRange(X, nLoBound, nUpBound):
    """Min-max rescale ``X`` from its own range to [nLoBound, nUpBound].

    Each value ``xi`` becomes
    ``((xi - min(X)) / (max(X) - min(X))) * (nUpBound - nLoBound) + nLoBound``.

    NOTE: this rebinds the name ``normalizeNewRange`` that an earlier cell
    of the notebook defines for inputs already in [0, 1].

    Parameters
    ----------
    X : array-like of numbers
        Values to rescale (list, NumPy array, pandas Series, ...).
    nLoBound : number
        Lower bound of the target range.
    nUpBound : number
        Upper bound of the target range.

    Returns
    -------
    list of float
        Rescaled values as plain Python floats, in the input order.
        An empty input returns ``[]``. When all values of ``X`` are equal
        the source range is zero, so every value maps to ``nLoBound``
        instead of dividing by zero.
    """
    values = np.asarray(X, dtype=float)
    if values.size == 0:
        # np.min / np.max raise on empty input; there is nothing to scale.
        return []

    lowest = values.min()
    span = values.max() - lowest
    target_span = nUpBound - nLoBound
    if span == 0:
        # Constant input: avoid 0/0 and pin everything to the lower bound.
        return np.full_like(values, nLoBound).tolist()

    # Same operation order as the formula: normalize to [0, 1], stretch, shift.
    return (((values - lowest) / span) * target_span + nLoBound).tolist()

In [57]:
X = varX.copy()
Y = normalizeNewRange(X, 100, 200)

print(Y)

[170.0, 180.0, 100.0, 200.0, 140.0, 130.0, 200.0, 190.0, 150.0, 180.0]


## 4. Normalización de un df

Podemos usar el siguiente código para aplicar una normalización min-max a cada columna en el DataFrame:

In [69]:
ndf = df.head().copy()
ndf

Unnamed: 0,Age,Gender,GenderGroup,Glasses,GlassesGroup,Height,Wingspan,CWDistance,Complete,CompleteGroup,Score
1,56,F,1,Y,1,62.0,61.0,79,Y,1,7
2,26,F,1,Y,1,62.0,60.0,70,Y,1,8
3,33,F,1,Y,1,66.0,64.0,85,Y,1,0
4,39,F,1,N,0,64.0,63.0,87,Y,1,10
5,27,M,2,N,0,73.0,75.0,72,N,0,4


In [70]:
ndf.columns

Index(['Age', 'Gender', 'GenderGroup', 'Glasses', 'GlassesGroup', 'Height',
       'Wingspan', 'CWDistance', 'Complete', 'CompleteGroup', 'Score'],
      dtype='object')

In [71]:
# Keep only the numeric columns. ndf is rebuilt from df with a non-mutating
# drop so the cell is idempotent: the previous in-place drop raised a KeyError
# when the cell was re-run, because the columns were already gone.
ndf = df.head().drop(columns=['Gender', 'Glasses', 'Complete'])
ndf

Unnamed: 0,Age,GenderGroup,GlassesGroup,Height,Wingspan,CWDistance,CompleteGroup,Score
1,56,1,1,62.0,61.0,79,1,7
2,26,1,1,62.0,60.0,70,1,8
3,33,1,1,66.0,64.0,85,1,0
4,39,1,0,64.0,63.0,87,1,10
5,27,2,0,73.0,75.0,72,0,4


In [75]:
ndf.max()

Age              56.0
GenderGroup       2.0
GlassesGroup      1.0
Height           73.0
Wingspan         75.0
CWDistance       87.0
CompleteGroup     1.0
Score            10.0
dtype: float64

In [76]:
ndf.min()

Age              26.0
GenderGroup       1.0
GlassesGroup      0.0
Height           62.0
Wingspan         60.0
CWDistance       70.0
CompleteGroup     0.0
Score             0.0
dtype: float64

In [78]:
ndfRange = ndf.max()-ndf.min()
ndfRange

Age              30.0
GenderGroup       1.0
GlassesGroup      1.0
Height           11.0
Wingspan         15.0
CWDistance       17.0
CompleteGroup     1.0
Score            10.0
dtype: float64

In [74]:
# Column-wise min-max scaling: subtract each column's minimum, divide by its
# range (ndfRange), then round to two decimals for display.
ndf.sub(ndf.min()).div(ndfRange).round(2).head()

Unnamed: 0,Age,GenderGroup,GlassesGroup,Height,Wingspan,CWDistance,CompleteGroup,Score
1,1.0,0.0,1.0,0.0,0.07,0.53,1.0,0.7
2,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.8
3,0.23,0.0,1.0,0.36,0.27,0.88,1.0,0.0
4,0.43,0.0,0.0,0.18,0.2,1.0,1.0,1.0
5,0.03,1.0,0.0,1.0,1.0,0.12,0.0,0.4


## 5. There are always alternatives...

In [80]:
from sklearn import preprocessing
#from sklearn.preprocessing import MinMaxScaler

### 5.1 Normalización mín.-máx.

In [103]:
X = df.Score.head(10).to_frame()
scaler = preprocessing.MinMaxScaler().fit(X)
#scaler

In [104]:
# The scaler was already fitted on X in the previous cell, so only transform
# here; fit_transform would refit on the same data for no benefit.
scaler.transform(X).transpose()

array([[0.7, 0.8, 0. , 1. , 0.4, 0.3, 1. , 0.9, 0.5, 0.8]])

### 5.2. Standardization 

$$x_i^{new}=\left(\frac{x_i^{old}-mean(X)}{\sigma}\right) $$

Standardization - Rescales a dataset to have a `mean 0` and a `standard deviation 1`. 

In [132]:
X = varX.copy().to_frame()
X#.Score.mean()#.transpose()

Unnamed: 0,Score
1,7
2,8
3,0
4,10
5,4
6,3
7,10
8,9
9,5
10,8


#### 5.2.1 Using scikit

In [137]:
scaler = preprocessing.StandardScaler().fit(X)
scaler

StandardScaler()

In [138]:
scaler.mean_ # takes the mean of each column

array([6.4])

In [147]:
k = X.Score.to_list()
k

[7, 8, 0, 10, 4, 3, 10, 9, 5, 8]

In [152]:
# Imported here because the notebook's own `import statistics as sts` cell
# sits further down; on a fresh Restart & Run All this cell would otherwise
# fail with a NameError.
import statistics as sts

# Population standard deviation (divides by N).
# sts.stdev(k) would give the sample standard deviation (divides by N - 1).
sts.pstdev(k)

3.1368774282716245

In [169]:
# z-score of every value: (x - mu) / sigma, computed by the fitted scaler
# and wrapped in a DataFrame in a single step.
X_scaled = pd.DataFrame(scaler.transform(X))
X_scaled

Unnamed: 0,0
0,0.191273
1,0.510061
2,-2.040245
3,1.147638
4,-0.765092
5,-1.08388
6,1.147638
7,0.82885
8,-0.446304
9,0.510061


In [170]:
X_scaled.mean()  # default axis=0 ('index'): aggregates downward over the rows, one mean per column; axis=1 ('columns') would aggregate across each row
#np.mean(X_scaled)

0   -1.221245e-16
dtype: float64

In [171]:
X_scaled.std(axis=0)

0    1.054093
dtype: float64

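Scaled data has zero mean and unit variance when the *population* standard deviation (`ddof=0`) is used, which is the convention `StandardScaler` follows. pandas' `.std()` defaults to the *sample* standard deviation (`ddof=1`), which is why the cell above shows 1.054 (= √(10/9)) instead of 1. The mean of about -1.2e-16 is zero up to floating-point rounding.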

In [93]:
import statistics as sts

In [172]:
sts.pstdev(X_scaled[0])  # pstdev: population standard deviation

1.0

#### 5.2.2 Using z-score  

$$x_i^{new}=\left(\frac{x_i^{old}-mean(X)}{\sigma}\right) $$

In [181]:
import scipy.stats as stats
X_scaledB = stats.zscore(X)
X_scaledB

Unnamed: 0,Score
1,0.191273
2,0.510061
3,-2.040245
4,1.147638
5,-0.765092
6,-1.08388
7,1.147638
8,0.82885
9,-0.446304
10,0.510061


In [182]:
X_scaledB.Score.mean()
#np.mean(X_scaledB.Score)

-1.2212453270876723e-16

In [183]:
sts.pstdev(X_scaledB.Score)

1.0

#### 5.2.3 Computing the z-score manually

$$x_i^{new}=\left(\frac{x_i^{old}-mean(X)}{\sigma}\right) $$

In [184]:
xm = X.Score.mean()
xm

6.4

In [185]:
sd = sts.pstdev(X.Score)  # pstdev: population standard deviation
sd

3.1368774282716245

In [186]:
X_scaledC = (X-xm)/sd
X_scaledC

Unnamed: 0,Score
1,0.191273
2,0.510061
3,-2.040245
4,1.147638
5,-0.765092
6,-1.08388
7,1.147638
8,0.82885
9,-0.446304
10,0.510061


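Scaled data has zero mean and unit variance in the *population* sense (`ddof=0`). pandas' `.var()` defaults to the *sample* variance (`ddof=1`), so the next cell reports 1.111 (= 10/9) rather than 1; `sts.pstdev` further below confirms that the population standard deviation is exactly 1.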

In [188]:
X_scaledC.var()

Score    1.111111
dtype: float64

In [189]:
X_scaledC.mean()

Score   -1.221245e-16
dtype: float64

In [192]:
sts.pstdev(X_scaledC.Score)

1.0

In [199]:
print('.'*55,'End','.'*55)

....................................................... End .......................................................
