In [1]:
# 2023 OCT 23

In [2]:
import numpy as np
import pandas as pd

# concept

In [4]:
# Toy 1-D sample used below to check both hand-written scalers.
X = np.array((20, 30, 40))

**standardization**

In [3]:
# -> zero mean, unit standard deviation (z-scores).
#    This only shifts and rescales the data; it does not make it normally distributed.
def standardize(X):
    """Return the z-scores of ``X``: ``(X - mean) / std``.

    The mean and the standard deviation are taken over all elements of ``X``.
    ``np.std`` is called with its default ``ddof=0`` (population standard
    deviation).

    Parameters
    ----------
    X : np.ndarray (or another object supporting ``X - scalar`` and ``/``)
        Values to rescale.

    Returns
    -------
    Rescaled values with mean 0 and standard deviation 1. A constant input
    (standard deviation 0) yields all zeros instead of ``nan``.
    """
    mean = np.mean(X)
    std = np.std(X)
    # Constant input: every deviation from the mean is 0, so dividing by 1
    # keeps the result at 0 instead of producing nan from 0/0.
    # The ndim check limits the guard to the scalar case; a non-scalar std
    # (if np.std returns one for the given input type) is used unchanged.
    if np.ndim(std) == 0 and std == 0:
        std = 1.0
    return (X - mean) / std

In [5]:
# z-scores of the toy sample: symmetric around 0
X_standardized = standardize(X)
print(X_standardized)

[-1.22474487  0.          1.22474487]


**normalization**

In [7]:
# -> [0, 1] for the implementation below (min-max scaling)
def normalize(X):
    """Rescale ``X`` linearly so that its minimum maps to 0 and its maximum to 1.

    Computes ``(X - min) / (max - min)`` with the minimum and maximum taken
    over all elements of ``X``.

    Parameters
    ----------
    X : np.ndarray (or another object supporting ``X - scalar`` and ``/``)
        Values to rescale.

    Returns
    -------
    Rescaled values in ``[0, 1]``. A constant input (max == min) yields all
    zeros instead of ``nan``.
    """
    low = np.min(X)
    span = np.max(X) - low
    # Constant input: X - low is all zeros, so dividing by 1 keeps the result
    # at 0 instead of producing nan from 0/0.
    # The ndim check limits the guard to the scalar case; a non-scalar span
    # (if np.min/np.max return one for the given input type) is used unchanged.
    if np.ndim(span) == 0 and span == 0:
        span = 1.0
    return (X - low) / span

In [8]:
# min-max scaled toy sample: smallest value -> 0, largest -> 1
X_normalized = normalize(X)
print(X_normalized)

[0.  0.5 1. ]


# tool

In [10]:
from sklearn.datasets import load_iris

# Load the iris dataset and wrap its feature matrix in a DataFrame whose
# columns carry the feature names.
iris = load_iris()
iris_features = iris["data"]
iris_df = pd.DataFrame(iris_features, columns=iris["feature_names"])

In [18]:
# Per-feature summary statistics of the raw (unscaled) iris data.
feature_means = iris_df.mean()
feature_stds = iris_df.std()
feature_mins = iris_df.min()
feature_maxs = iris_df.max()

# location and spread
print("mean of each feature")
print(feature_means, "\n")

print("standard deviation of each feature")
print(feature_stds, "\n")

# value range
print("min of each feature")
print(feature_mins, "\n")

print("max of each feature")
print(feature_maxs)

mean of each feature
sepal length (cm)    5.843333
sepal width (cm)     3.057333
petal length (cm)    3.758000
petal width (cm)     1.199333
dtype: float64 

standard deviation of each feature
sepal length (cm)    0.828066
sepal width (cm)     0.435866
petal length (cm)    1.765298
petal width (cm)     0.762238
dtype: float64 

min of each feature
sepal length (cm)    4.3
sepal width (cm)     2.0
petal length (cm)    1.0
petal width (cm)     0.1
dtype: float64 

max of each feature
sepal length (cm)    7.9
sepal width (cm)     4.4
petal length (cm)    6.9
petal width (cm)     2.5
dtype: float64


**StandardScaler**

In [16]:
from sklearn.preprocessing import StandardScaler

# Step 1: learn each feature's mean and standard deviation.
# fit() returns the fitted scaler itself, so it can be chained onto the constructor.
scaler = StandardScaler().fit(iris_df)

# Step 2: apply the learned statistics.
iris_standardized = scaler.transform(iris_df)
# one-call alternative to fit + transform:
# iris_standardized = scaler.fit_transform(iris_df)

# the scaler output is a numpy array -> wrap it in a DataFrame to get the column names back
iris_df_standardized = pd.DataFrame(iris_standardized, columns=iris.feature_names)

In [17]:
# Sanity check: after standardization each feature should have mean ~0 and std ~1.
standardized_means = iris_df_standardized.mean()

# DataFrame.std() defaults to the sample estimator (ddof=1), whereas
# StandardScaler scales by the population standard deviation (ddof=0).
# The values printed here are therefore slightly above 1 (factor sqrt(n / (n - 1)));
# .std(ddof=0) would report exactly 1.
standardized_stds = iris_df_standardized.std()

print("mean of each feature")
print(standardized_means, "\n")

print("standard deviation of each feature")
print(standardized_stds)

mean of each feature
sepal length (cm)   -1.690315e-15
sepal width (cm)    -1.842970e-15
petal length (cm)   -1.698641e-15
petal width (cm)    -1.409243e-15
dtype: float64 

standard deviation of each feature
sepal length (cm)    1.00335
sepal width (cm)     1.00335
petal length (cm)    1.00335
petal width (cm)     1.00335
dtype: float64


**MinMaxScaler**

In [19]:
from sklearn.preprocessing import MinMaxScaler

# Step 1: learn each feature's minimum and maximum.
# fit() returns the fitted scaler itself, so it can be chained onto the constructor.
scaler = MinMaxScaler().fit(iris_df)

# Step 2: rescale every feature with the learned range.
iris_normalized = scaler.transform(iris_df)
# one-call alternative to fit + transform:
# iris_normalized = scaler.fit_transform(iris_df)

# the scaler output is a numpy array -> wrap it in a DataFrame to get the column names back
iris_df_normalized = pd.DataFrame(iris_normalized, columns=iris.feature_names)

In [21]:
# Sanity check: after min-max scaling each feature should span exactly [0, 1].
normalized_mins = iris_df_normalized.min()
normalized_maxs = iris_df_normalized.max()

print("min of each feature")
print(normalized_mins, "\n")

print("max of each feature")
print(normalized_maxs)

min of each feature
sepal length (cm)    0.0
sepal width (cm)     0.0
petal length (cm)    0.0
petal width (cm)     0.0
dtype: float64 

max of each feature
sepal length (cm)    1.0
sepal width (cm)     1.0
petal length (cm)    1.0
petal width (cm)     1.0
dtype: float64


# <!> WARNING <!>

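***<!> The scaler fitted on the training data must be reused unchanged at prediction time: call `fit` on the training data only, then `transform` both the training and the test data with that same fitted scaler. <!>***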

In [22]:
# Example: training values 0..10, test values 0..5.
# The scaler below is fed 2-D input of shape (n_samples, n_features), so each
# 1-D range is turned into a single-column 2-D array.
X_train = np.arange(11)[:, np.newaxis]
X_test = np.arange(6)[:, np.newaxis]

In [27]:
# Fresh, unfitted scaler for the train/test example; (0, 1) is the default target range.
scaler = MinMaxScaler(feature_range=(0, 1))

In [29]:
# fit() records the training range: min 0 and max 10 will be mapped to 0 and 1
scaler.fit(X_train)

# apply that mapping to the training data itself
X_train_scaled = scaler.transform(X_train)

print("X_train_scaled:", np.ravel(X_train_scaled))

X_train_scaled: [0.  0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1. ]


In [30]:
# <!> Do NOT call scaler.fit(X_test) here <!>
# Refitting would replace the training range (0..10) with the test range (0..5),
# so the test data would no longer be on the same scale as the training data.

# reuse the scaler fitted on X_train, unchanged
X_test_scaled = scaler.transform(X_test)

print("X_test_scaled:", np.ravel(X_test_scaled))

X_test_scaled: [0.  0.1 0.2 0.3 0.4 0.5]
