# CPSC529: 04_DataPreparation_1_Normalization

In [2]:
import pandas as pd
import scipy
import numpy

# Ask scikit-learn to render estimators as an HTML diagram (instead of their
# plain-text repr) when an estimator is the displayed result of a cell.
from sklearn import set_config
set_config(display='diagram')

In [3]:
import warnings
# Silence every warning category to keep the cell outputs uncluttered.
# NOTE(review): this also hides deprecation and numerical warnings raised by
# numpy / scikit-learn in the cells below.
warnings.filterwarnings("ignore")

In [4]:
import pandas as pd
import numpy as np

class display(object):
    """Display HTML representation of multiple objects.

    Each positional argument is a *string*: the name of (or a Python
    expression for) an object living in the namespace where this class is
    defined, e.g. ``display('df1', 'df2')``. The string is shown as a caption
    and the object it evaluates to is rendered next to the others.

    NOTE(review): in a notebook this name presumably shadows IPython's own
    ``display`` helper -- confirm that is intended.
    """
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""

    def __init__(self, *args):
        self.args = args

    def _resolve(self, expr):
        """Evaluate ``expr`` against the module-level namespace only.

        Passing ``globals()`` explicitly keeps the lookup independent of the
        local variables of whichever method/generator triggers it; otherwise
        a caption such as ``'a'`` could resolve to a loop variable instead of
        the user's object.
        """
        # SECURITY: eval() executes arbitrary code. Only pass trusted,
        # hand-written names/expressions to this class.
        return eval(expr, globals())

    def _repr_html_(self):
        """Return one floating ``<div>`` per argument, joined by newlines.

        Raises AttributeError if an evaluated object has no ``_repr_html_``.
        """
        return '\n'.join(
            self.template.format(expr, self._resolve(expr)._repr_html_())
            for expr in self.args
        )

    def __repr__(self):
        """Plain-text fallback: each caption followed by the object's repr."""
        return '\n\n'.join(
            expr + '\n' + repr(self._resolve(expr))
            for expr in self.args
        )

## 0. Data Loading

In [5]:
# Toy dataset: ten observations of a single numeric feature, stored as a
# (10, 1) column (one row per sample, one column per feature).
X = numpy.asarray(
    [192, 197, 192, 182, 206, 192, 190, 178, 196, 201],
    dtype=float,
).reshape(-1, 1)
print(X.shape)

# Show floats in fixed-point notation instead of scientific notation.
numpy.set_printoptions(suppress=True)

print("Original data")
print(X, "\n")  # prints all ten rows, followed by a blank line

(10, 1)
Original data
[[192.]
 [197.]
 [192.]
 [182.]
 [206.]
 [192.]
 [190.]
 [178.]
 [196.]
 [201.]] 



## 1. Standardization and Range Normalization 

### 1.1 Range normalization [0, 1]

In [6]:
from sklearn.preprocessing import MinMaxScaler

# option 1: learn the feature's min/max and rescale X in a single call
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX = scaler.fit_transform(X)

# summarize transformed data (first five rows only)
print ("Rescale data (between 0 and 1)")
print (scaler) 
print(rescaledX[:5], "\n")

# option 2: fit and then transform as two separate steps; the printed result
# is the same as for option 1
scaler2 = MinMaxScaler(feature_range=(0, 1))
scaler2.fit(X)
rescaledX2 = scaler2.transform(X)

# summarize transformed data (first five rows only)
print ("Rescale data (between 0 and 1)")
print (scaler2) 
print(rescaledX2[:5], "\n")

Rescale data (between 0 and 1)
MinMaxScaler()
[[0.5       ]
 [0.67857143]
 [0.5       ]
 [0.14285714]
 [1.        ]] 

Rescale data (between 0 and 1)
MinMaxScaler()
[[0.5       ]
 [0.67857143]
 [0.5       ]
 [0.14285714]
 [1.        ]] 



In [7]:
# Bare expression: the cell output shows the fully qualified path of the class
MinMaxScaler

sklearn.preprocessing._data.MinMaxScaler

### 1.2 Range normalization [-1, 1]

In [8]:
# Same scaler class as above, but with a target range of [-1, 1]
scaler3 = MinMaxScaler(feature_range=(-1, 1))
rescaledX3 = scaler3.fit_transform(X)

# summarize transformed data (first five rows only)
print ("Rescale data (between -1 and 1)")
print (scaler3)
print(rescaledX3[0:5], "\n") 

Rescale data (between -1 and 1)
MinMaxScaler(feature_range=(-1, 1))
[[ 0.        ]
 [ 0.35714286]
 [ 0.        ]
 [-0.71428571]
 [ 1.        ]] 



### 1.3 Standardization (mean 0, standard deviation 1)

In [9]:
################################################
# Standardize data (0 mean, 1 stdev)
#   z = (x - mean) / std, computed per feature (column)
################################################
from sklearn.preprocessing import StandardScaler

# fit() learns the mean and scale from X; transform() then applies them
scaler4 = StandardScaler().fit(X)
rescaledX4 = scaler4.transform(X)

# summarize transformed data (first five rows only)
print ("Standardize data (0 mean, 1 stdev)")
print (scaler4)
print(rescaledX4[0:5], "\n")

Standardize data (0 mean, 1 stdev)
StandardScaler()
[[-0.07654655]
 [ 0.5613414 ]
 [-0.07654655]
 [-1.35232246]
 [ 1.70953972]] 



### 1.4 Model Parameters and Attributes 

In [10]:
# Bare expression: display the fitted StandardScaler as the cell result
scaler4

In [11]:
# vars() returns the instance's __dict__: the constructor parameters
# (with_mean, with_std, copy) together with the attributes set by fit()
vars(scaler4)

{'with_mean': True,
 'with_std': True,
 'copy': True,
 'n_features_in_': 1,
 'n_samples_seen_': 10,
 'mean_': array([192.6]),
 'var_': array([61.44]),
 'scale_': array([7.83836718])}

In [27]:
# vars(obj) and obj.__dict__ are the same mapping; the three lines print its
# (key, value) pairs, its keys only, and the dictionary itself
print(vars(scaler4).items(),"\n")
print(vars(scaler4).keys(),"\n")
print(scaler4.__dict__,"\n")

dict_items([('with_mean', True), ('with_std', True), ('copy', True), ('n_features_in_', 1), ('n_samples_seen_', 10), ('mean_', array([192.6])), ('var_', array([61.44])), ('scale_', array([7.838]))]) 

dict_keys(['with_mean', 'with_std', 'copy', 'n_features_in_', 'n_samples_seen_', 'mean_', 'var_', 'scale_']) 

{'with_mean': True, 'with_std': True, 'copy': True, 'n_features_in_': 1, 'n_samples_seen_': 10, 'mean_': array([192.6]), 'var_': array([61.44]), 'scale_': array([7.838])} 



In [13]:
# another way to access the names of the model parameters and attributes
def get_properies_all(model):
    """Return the names stored in ``model.__dict__`` as a list, in insertion order."""
    return list(model.__dict__)
get_properies_all(scaler4)

['with_mean',
 'with_std',
 'copy',
 'n_features_in_',
 'n_samples_seen_',
 'mean_',
 'var_',
 'scale_']

In [14]:
# list only the learned attributes (by scikit-learn convention their names end with "_")
def get_properies(model):
    """Return the names in ``model.__dict__`` that end with an underscore."""
    has_trailing_underscore = lambda attr_name: attr_name.endswith("_")
    return list(filter(has_trailing_underscore, model.__dict__))
get_properies(scaler4)

['n_features_in_', 'n_samples_seen_', 'mean_', 'var_', 'scale_']

In [15]:
# Compare which attributes each kind of scaler stores after fitting
print(get_properies(scaler3)) # MinMaxScaler
print(get_properies(scaler4)) # StandardScaler

['n_features_in_', 'n_samples_seen_', 'scale_', 'min_', 'data_min_', 'data_max_', 'data_range_']
['n_features_in_', 'n_samples_seen_', 'mean_', 'var_', 'scale_']


In [16]:
# Print every attribute of scaler4 whose name ends with "_", one per line,
# with the name right-aligned in a 20-character column.
for key, val in scaler4.__dict__.items():
    if not key.endswith("_"):
        continue
    print(f'{key:>20s}:\t{val}')

      n_features_in_:	1
     n_samples_seen_:	10
               mean_:	[192.6]
                var_:	[61.44]
              scale_:	[7.83836718]


In [17]:
# double check with numpy solution
# NOTE: set_printoptions changes numpy's global print settings, so arrays
# printed by any cell executed after this one are also shown with 3 decimals.
numpy.set_printoptions(precision=3)

# Basic statistics, reduced along axis=0 (one value per feature column)
print ("Min/Max/Mean/variance/STD of feature values")
print (numpy.min(X, axis=0)) # min
print (numpy.max(X, axis=0)) # max
print (numpy.mean(X, axis=0)) # mean; matches scaler4.mean_
print (numpy.var(X, axis=0)) # variance; matches scaler4.var_
print (numpy.std(X, axis=0)) # standard deviation; matches scaler4.scale_
 

Min/Max/Mean/variance/STD of feature values
[178.]
[206.]
[192.6]
[61.44]
[7.838]


## 2. Discretization

https://www.journaldev.com/54363/data-discretization-python-sklearn

### 2.1 quantile transformation 

In [18]:
# Import the class
# 'quantile': all bins in each feature have (approximately) the same number of points.

from sklearn.preprocessing import KBinsDiscretizer

# Discretize the data into 3 bins; encode='ordinal' returns the bin index
transf = KBinsDiscretizer(n_bins = 3, encode = 'ordinal', strategy = 'quantile')

# Learn the bin edges from X and map X to bin indices in one call
X_q = transf.fit_transform(X)
print("Original: ", X.astype(int).tolist())
print("Quantile: ", X_q.astype(int).tolist())

Original:  [[192], [197], [192], [182], [206], [192], [190], [178], [196], [201]]
Quantile:  [[1], [2], [1], [0], [2], [1], [0], [0], [2], [2]]


### 2.2 uniform transformation 

In [19]:
# Import the class
# 'uniform': all bins in each feature have identical widths.
from sklearn.preprocessing import KBinsDiscretizer

# Discretize the data into 3 bins; encode='ordinal' returns the bin index.
# Note: `transf` is rebound here, replacing the discretizer from section 2.1.
transf = KBinsDiscretizer(n_bins = 3, encode = 'ordinal', strategy = 'uniform')

# Learn the bin edges from X and map X to bin indices in one call;
# X_q from section 2.1 is printed again for side-by-side comparison
X_u = transf.fit_transform(X)
print("Original: ", X.astype(int).tolist())
print("Quantile: ", X_q.astype(int).tolist())
print("Uniform:  ", X_u.astype(int).tolist())

Original:  [[192], [197], [192], [182], [206], [192], [190], [178], [196], [201]]
Quantile:  [[1], [2], [1], [0], [2], [1], [0], [0], [2], [2]]
Uniform:   [[1], [2], [1], [0], [2], [1], [1], [0], [1], [2]]


### 2.3 K-means transformation

In [20]:
# Import the class
# 'kmeans': values in each bin have the same nearest center of a 1D k-means cluster.
from sklearn.preprocessing import KBinsDiscretizer

# Discretize the data into 3 bins; encode='ordinal' returns the bin index.
# Note: `transf` is rebound here, replacing the discretizer from section 2.2.
transf = KBinsDiscretizer(n_bins = 3, encode = 'ordinal', strategy = 'kmeans')

# Learn the bin edges from X and map X to bin indices in one call;
# X_q and X_u from the previous sections are printed again for comparison
X_kmeans = transf.fit_transform(X)
print("Original: ", X.astype(int).tolist())
print("Quantile: ", X_q.astype(int).tolist())
print("Uniform:  ", X_u.astype(int).tolist())
print("Kmeans:   ",  X_kmeans.astype(int).tolist())

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fe1a36d3310>
Traceback (most recent call last):
  File "/home/dche/anaconda3/lib/python3.9/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/dche/anaconda3/lib/python3.9/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/home/dche/anaconda3/lib/python3.9/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/home/dche/anaconda3/lib/python3.9/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_modul

Original:  [[192], [197], [192], [182], [206], [192], [190], [178], [196], [201]]
Quantile:  [[1], [2], [1], [0], [2], [1], [0], [0], [2], [2]]
Uniform:   [[1], [2], [1], [0], [2], [1], [1], [0], [1], [2]]
Kmeans:    [[1], [2], [1], [0], [2], [1], [1], [0], [1], [2]]


## 3. Unit Normalization

In [26]:
#############################################
# Normalize data (length of 1)
#   Normalizer rescales each sample (row) independently so that the row
#   has unit norm (Euclidean / L2 norm with the default settings).
#############################################
from sklearn.preprocessing import Normalizer
import numpy as np

# Three single-row samples: U2 = 2 * U1 (same direction, different length);
# U3 holds the same values as U1 in a different order
U1 = np.array([[1,2,3]])
U2 = np.array([[2,4,6]])
U3 = np.array([[2,3,1]])

norm_u1 = Normalizer().fit_transform(U1)
norm_u2 = Normalizer().fit_transform(U2)
norm_u3 = Normalizer().fit_transform(U3)

# summarize transformed data: U1 and U2 map to the same unit-length vector
print ("Normalize data (length of 1)")
print(norm_u1)
print(norm_u2)
print(norm_u3)

Normalize data (length of 1)
[[0.267 0.535 0.802]]
[[0.267 0.535 0.802]]
[[0.535 0.802 0.267]]
