<a href="https://colab.research.google.com/github/PaulToronto/Math-and-Data-Science-Reference/blob/main/Scikit_learn_Scalers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Scikit-learn Scalers

In [1]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.datasets import load_iris

import pandas as pd

In [2]:
# Load the iris dataset; the cells below read its .data, .target and
# .target_names attributes.
iris_bunch = load_iris()

In [3]:
# Lookup table from integer target code (0, 1, 2, ...) to species name.
species = dict(enumerate(iris_bunch.target_names))

In [4]:
# Assemble the feature matrix with readable column names, then attach the
# species label by translating each integer target code through `species`.
iris = pd.DataFrame(
    iris_bunch.data,
    columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
).assign(species=pd.Series(iris_bunch.target).map(species))
print(iris_bunch.target_names)
iris.head(2)

['setosa' 'versicolor' 'virginica']


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa


In [5]:
def print_attributes(scaler, series):
    """Print a fitted scaler's attributes next to hand-computed values.

    For ``min_``, ``scale_``, ``data_min_``, ``data_max_`` and
    ``data_range_`` each line shows the value stored on ``scaler`` followed
    by the same quantity recomputed from ``series`` and
    ``scaler.feature_range``, so the two can be compared by eye.
    ``n_features_in_``, ``n_samples_seen_`` and ``feature_names_in_`` are
    printed as stored.

    Parameters
    ----------
    scaler
        A fitted scaler exposing the attributes listed above.
    series
        The single column the scaler was fitted on; must provide ``.min()``
        and ``.max()``.
    """
    series_min = series.min()
    series_max = series.max()
    range_min, range_max = scaler.feature_range

    print("min_:", scaler.min_,
          range_min - series_min * scaler.scale_)
    print("scale_:", scaler.scale_,
          (range_max - range_min) / (series_max - series_min))
    print("data_min:", scaler.data_min_,
          series_min)
    print("data_max:", scaler.data_max_,
          series_max)
    print("data_range_:", scaler.data_range_,
          series_max - series_min)

    print("n_features_in_:", scaler.n_features_in_)
    print("n_samples_seen_:", scaler.n_samples_seen_)
    # feature_names_in_ is only set when the scaler was fitted on input with
    # string column names; fall back to None instead of raising
    # AttributeError for scalers fitted on plain arrays.
    print("feature_names_in:", getattr(scaler, "feature_names_in_", None))

## `MinMaxScaler`

`class sklearn.preprocessing.MinMaxScaler(feature_range=(0, 1), *, copy=True, clip=False)`

### Original Data

In [6]:
# Keep only the one feature that will be scaled. The list-style selection
# keeps `data` a one-column DataFrame rather than a Series.
data = iris[['sepal_length']].copy()
data.head(2)

Unnamed: 0,sepal_length
0,5.1
1,4.9


### Construct Scaler

In [7]:
# All defaults: per the signature above, feature_range=(0, 1) and clip=False.
scaler = MinMaxScaler()

### `.fit()`

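- `fit()` only learns the per-feature statistics (`data_min_`, `data_max_`, and the derived `scale_` and `min_`); the data itself is not transformed at this stage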

In [8]:
# Fit on the one-column frame; the learned attributes are inspected in the
# next cell.
scaler.fit(data)

MinMaxScaler()

In [9]:
# Show each fitted attribute beside the value recomputed from the raw column.
print_attributes(scaler, data['sepal_length'])

min_: [-1.19444444] [-1.19444444]
scale_: [0.27777778] 0.27777777777777773
data_min: [4.3] 4.3
data_max: [7.9] 7.9
data_range_: [3.6] 3.6000000000000005
n_features_in_: 1
n_samples_seen_: 150
feature_names_in: ['sepal_length']


### `.transform()`

In [10]:
# Apply the fitted scaling to the same data the scaler was fitted on.
data_scaled = scaler.transform(data)

In [11]:
# Preview the first two scaled rows (compare with data.head(2) above).
data_scaled[0:2]

array([[0.22222222],
       [0.16666667]])

In [12]:
# Target interval of the scaled values; used by my_scaler below.
scaler.feature_range

(0, 1)

In [13]:
def my_scaler(X, *, data_min=None, data_max=None, feature_range=None):
    """Min-max scale ``X`` by hand, to compare against ``MinMaxScaler``.

    Computes ``(X - data_min) / (data_max - data_min)`` and then stretches
    and shifts the result into ``feature_range``.

    Parameters
    ----------
    X
        A scalar or array-like supporting arithmetic (e.g. a pandas Series).
    data_min, data_max
        Minimum and maximum of the fitted data. When omitted they are taken
        from the notebook's ``data['sepal_length']`` column.
    feature_range
        ``(low, high)`` target interval. When omitted it is taken from the
        notebook's fitted ``scaler``.

    Returns
    -------
    ``X`` rescaled into ``feature_range``, with the same shape as ``X``.
    """
    # Fall back to the notebook globals so existing calls such as
    # `.apply(my_scaler)` keep working unchanged.
    if data_min is None:
        data_min = data['sepal_length'].min()
    if data_max is None:
        data_max = data['sepal_length'].max()
    if feature_range is None:
        feature_range = scaler.feature_range

    range_low, range_high = feature_range
    X_std = (X - data_min) / (data_max - data_min)
    X_scaled = X_std * (range_high - range_low) + range_low
    return X_scaled

In [14]:
# Build the comparison table in one expression: the scaler's output, the
# original values, and the hand-rolled scaling. my_scaler only uses
# arithmetic, so it is applied to the whole column at once rather than
# element by element.
data_scaled = (
    pd.DataFrame(data_scaled, columns=['scaled_' + data.columns[0]])
    .assign(**{'original_' + data.columns[0]: data[data.columns[0]]})
    .assign(test_scale=lambda df: my_scaler(df['original_' + data.columns[0]]))
)
data_scaled.head()

Unnamed: 0,scaled_sepal_length,original_sepal_length,test_scale
0,0.222222,5.1,0.222222
1,0.166667,4.9,0.166667
2,0.111111,4.7,0.111111
3,0.083333,4.6,0.083333
4,0.194444,5.0,0.194444


### Using the Scaler on New Data

In [15]:
# The scaler was fitted on a DataFrame, so pass new values as a DataFrame
# with the same column name; a bare list triggers scikit-learn's
# "X does not have valid feature names" warning.
scaler.transform(pd.DataFrame([[4.8], [5.1], [6]], columns=scaler.feature_names_in_))



array([[0.13888889],
       [0.22222222],
       [0.47222222]])

In [16]:
# Notice that 1000 maps to a value far above 1: transform() uses the min/max
# learned during fit, so unseen values outside that range leave feature_range.
#   - to prevent this, set clip=True in the MinMaxScaler constructor
# The values are wrapped in a DataFrame with the fitted column name to avoid
# scikit-learn's "X does not have valid feature names" warning.
scaler.transform(pd.DataFrame([[4.8], [5.1], [6], [1000]], columns=scaler.feature_names_in_))



array([[1.38888889e-01],
       [2.22222222e-01],
       [4.72222222e-01],
       [2.76583333e+02]])