# Visualiser le comportement du rescaling, de la standardisation, etc.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler, Binarizer
import pandas as pds
from pandas import read_csv

In [2]:
# Fetch the CPU performance data set from the UCI repository and load it
# into a pandas DataFrame. Column names are passed explicitly through `names`.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/cpu-performance/machine.data'
names = [
    'constructor', 'Model',
    'MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX',
    'PRP', 'ERP',
]
dataset = pds.read_csv(url, names=names)

In [3]:
# Show the loaded DataFrame (the last expression of the cell is rendered by the notebook).
dataset

Unnamed: 0,constructor,Model,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP,ERP
0,adviser,32/60,125,256,6000,256,16,128,198,199
1,amdahl,470v/7,29,8000,32000,32,8,32,269,253
2,amdahl,470v/7a,29,8000,32000,32,8,32,220,253
3,amdahl,470v/7b,29,8000,32000,32,8,32,172,253
4,amdahl,470v/7c,29,8000,16000,32,8,16,132,132
...,...,...,...,...,...,...,...,...,...,...
204,sperry,80/8,124,1000,8000,0,1,8,42,37
205,sperry,90/80-model-3,98,1000,8000,32,2,8,46,50
206,sratus,32,125,2000,8000,0,2,14,52,41
207,wang,vs-100,480,512,8000,32,0,0,67,47


# --------------------- SCALING ---------------------------

In [4]:
# Min-max scaling of the two performance columns.
#   PRP: published relative performance
#   ERP: estimated relative performance
# The scaler is fitted on the two columns, then applied to those same columns;
# the transformed result is kept in `df_minmax` (column 0 = PRP, column 1 = ERP).
minmax_scale = MinMaxScaler()
minmax_scale.fit(dataset[['PRP', 'ERP']])
df_minmax = minmax_scale.transform(dataset[['PRP', 'ERP']])

In [5]:
# Show the min-max scaled values (column 0 = PRP, column 1 = ERP).
df_minmax

array([[1.67832168e-01, 1.50449714e-01],
       [2.29895105e-01, 1.94603434e-01],
       [1.87062937e-01, 1.94603434e-01],
       [1.45104895e-01, 1.94603434e-01],
       [1.10139860e-01, 9.56663941e-02],
       [2.72727273e-01, 2.24856909e-01],
       [3.15559441e-01, 2.99264105e-01],
       [4.22202797e-01, 2.99264105e-01],
       [5.50699301e-01, 6.00163532e-01],
       [9.94755245e-01, 1.00000000e+00],
       [2.79720280e-02, 6.54129191e-03],
       [2.97202797e-02, 7.35895339e-03],
       [7.51748252e-02, 4.49713818e-02],
       [1.15384615e-01, 8.34014718e-02],
       [3.49650350e-03, 0.00000000e+00],
       [2.53496503e-02, 4.00654129e-02],
       [1.13636364e-02, 6.54129191e-03],
       [1.92307692e-02, 1.14472608e-02],
       [2.18531469e-02, 5.72363042e-03],
       [9.96503497e-02, 8.91251022e-02],
       [2.09790210e-02, 1.63532298e-02],
       [2.36013986e-02, 1.96238757e-02],
       [4.80769231e-02, 2.04415372e-02],
       [6.11888112e-02, 2.45298446e-02],
       [1.486013

In [6]:
# Summary statistics of the min-max scaled data (column 0 = PRP, column 1 = ERP).
print('\n********** Scaling*********\n')

# Mean of each scaled feature.
print(
    'Moyenne apres le Min max Scaling :\nPRP={:.2f}, ERP={:.2f}'.format(
        df_minmax[:, 0].mean(),
        df_minmax[:, 1].mean(),
    )
)
print('\n')

# Range of the scaled PRP column.
print(
    'Valeur minimale et maximale pour la feature PRP apres min max scaling: \nMIN={:.2f}, MAX={:.2f}'.format(
        df_minmax[:, 0].min(),
        df_minmax[:, 0].max(),
    )
)
print('\n')

# Range of the scaled ERP column.
print(
    'Valeur minimale et maximale pour la feature ERP apres min max scaling : \nMIN={:.2f}, MAX={:.2f}'.format(
        df_minmax[:, 1].min(),
        df_minmax[:, 1].max(),
    )
)


********** Scaling*********

Moyenne apres le Min max Scaling :
PRP=0.09, ERP=0.07


Valeur minimale et maximale pour la feature PRP apres min max scaling: 
MIN=0.00, MAX=1.00


Valeur minimale et maximale pour la feature ERP apres min max scaling : 
MIN=0.00, MAX=1.00


# --------------------- NORMALISATION ------------------

In [7]:
# Normalisation of the PRP / ERP columns with scikit-learn's Normalizer.
# NOTE(review): Normalizer rescales each row (sample) to unit norm rather than
# each column, so the per-column statistics printed later describe row-wise
# normalised values; confirm this is the intended transformation.
nrm_scaler = Normalizer()
nrm_scaler.fit(dataset[['PRP', 'ERP']])
df_nrm = nrm_scaler.transform(dataset[['PRP', 'ERP']])

In [8]:
# Show the normalised values (column 0 = PRP, column 1 = ERP).
df_nrm

array([[0.70532342, 0.70888566],
       [0.72843845, 0.68511125],
       [0.65617871, 0.75460552],
       [0.56222061, 0.82698729],
       [0.70710678, 0.70710678],
       [0.73888779, 0.67382849],
       [0.69375066, 0.72021526],
       [0.78883076, 0.61461047],
       [0.6472643 , 0.76226566],
       [0.67867424, 0.73443943],
       [0.85550008, 0.51780268],
       [0.85749293, 0.51449576],
       [0.79582869, 0.60552183],
       [0.76275696, 0.64668525],
       [0.5547002 , 0.83205029],
       [0.47981234, 0.87737114],
       [0.63688145, 0.77096175],
       [0.69459451, 0.71940146],
       [0.81550714, 0.57874701],
       [0.69542143, 0.71860214],
       [0.65079137, 0.7592566 ],
       [0.64594224, 0.76338629],
       [0.83624437, 0.54835696],
       [0.86047539, 0.50949201],
       [0.63473942, 0.77272625],
       [0.95667388, 0.29116162],
       [0.76250914, 0.64697745],
       [0.77523499, 0.63167295],
       [0.93979342, 0.34174306],
       [0.70710678, 0.70710678],
       [0.

In [9]:
# Summary statistics of the normalised data (column 0 = PRP, column 1 = ERP).
print('\n********** Normalisation*********\n')

# Mean of each normalised feature.
print(
    'Moyenne apres la Normalisation :\nPRP={:.2f}, ERP={:.2f}'.format(
        df_nrm[:, 0].mean(),
        df_nrm[:, 1].mean(),
    )
)
print('\n')

# Range of the normalised PRP column.
print(
    'Valeur minimale et maximale pour la feature PRP apres Normalisation: \nMIN={:.2f}, MAX={:.2f}'.format(
        df_nrm[:, 0].min(),
        df_nrm[:, 0].max(),
    )
)
print('\n')

# Range of the normalised ERP column.
print(
    'Valeur minimale et maximale pour la feature ERP apres Normalisation : \nMIN={:.2f}, MAX={:.2f}'.format(
        df_nrm[:, 1].min(),
        df_nrm[:, 1].max(),
    )
)


********** Normalisation*********

Moyenne apres la Normalisation :
PRP=0.69, ERP=0.70


Valeur minimale et maximale pour la feature PRP apres Normalisation: 
MIN=0.30, MAX=0.96


Valeur minimale et maximale pour la feature ERP apres Normalisation : 
MIN=0.27, MAX=0.95


# --------------------- STANDARDISATION ----------------

In [10]:
# Z-score standardisation of the PRP / ERP columns.
# The scaler is fitted on the two columns, then applied to those same columns;
# the result is kept in `df_std` (column 0 = PRP, column 1 = ERP).
std_scaler = StandardScaler()
std_scaler.fit(dataset[['PRP', 'ERP']])
df_std = std_scaler.transform(dataset[['PRP', 'ERP']])

In [11]:
# Show the standardised values (column 0 = PRP, column 1 = ERP).
df_std

array([[ 5.75759277e-01,  6.45586915e-01],
       [ 1.01827712e+00,  9.95358598e-01],
       [ 7.12877481e-01,  9.95358598e-01],
       [ 4.13710491e-01,  9.95358598e-01],
       [ 1.64404666e-01,  2.11610938e-01],
       [ 1.32367675e+00,  1.23501697e+00],
       [ 1.62907639e+00,  1.82444703e+00],
       [ 2.38945915e+00,  1.82444703e+00],
       [ 3.30565806e+00,  4.20807628e+00],
       [ 6.47184203e+00,  7.37545318e+00],
       [-4.21464022e-01, -4.94409681e-01],
       [-4.08998730e-01, -4.87932427e-01],
       [-8.49011583e-02, -1.89978772e-01],
       [ 2.01800540e-01,  1.14452138e-01],
       [-5.95978099e-01, -5.46227708e-01],
       [-4.40161958e-01, -2.28842292e-01],
       [-5.39884288e-01, -4.94409681e-01],
       [-4.83790478e-01, -4.55546160e-01],
       [-4.65092541e-01, -5.00886934e-01],
       [ 8.96129191e-02,  1.59792911e-01],
       [-4.71325187e-01, -4.16682640e-01],
       [-4.52627250e-01, -3.90773626e-01],
       [-2.78113172e-01, -3.84296373e-01],
       [-1.

In [12]:
# Summary statistics of the standardised data (column 0 = PRP, column 1 = ERP).
print('\n********** Standardisation*********\n')

# Mean and standard deviation of the standardised PRP column.
print(
    'Moyenne et Ecart type apres la standardisation de la feature PRP :\nMoyenne={:.2f}, Ecart Type={:.2f}'.format(
        df_std[:, 0].mean(),
        df_std[:, 0].std(),
    )
)
print('\n')

# Mean and standard deviation of the standardised ERP column.
print(
    'Moyenne et Ecart type apres la standardisation de la feature ERP :\nMoyenne={:.2f}, Ecart Type={:.2f}'.format(
        df_std[:, 1].mean(),
        df_std[:, 1].std(),
    )
)
print('\n')

# Range of the standardised PRP column.
print(
    'Valeur minimale et maximale pour la feature PRP apres Standardisation: \nMIN={:.2f}, MAX={:.2f}'.format(
        df_std[:, 0].min(),
        df_std[:, 0].max(),
    )
)
print('\n')

# Range of the standardised ERP column.
print(
    'Valeur minimal et maximal pour la feature ERP apres Standardisation : \nMIN={:.2f}, MAX={:.2f}'.format(
        df_std[:, 1].min(),
        df_std[:, 1].max(),
    )
)


********** Standardisation*********

Moyenne et Ecart type apres la standardisation de la feature PRP :
Moyenne=-0.00, Ecart Type=1.00


Moyenne et Ecart type apres la standardisation de la feature ERP :
Moyenne=0.00, Ecart Type=1.00


Valeur minimale et maximale pour la feature PRP apres Standardisation: 
MIN=-0.62, MAX=6.51


Valeur minimal et maximal pour la feature ERP apres Standardisation : 
MIN=-0.55, MAX=7.38


# ---------------------- BINARISATION -----------------

In [18]:
# Binarisation of the PRP / ERP columns with a threshold of 100.
# Per the scikit-learn documentation, values greater than the threshold map
# to 1 and values less than or equal to it map to 0.
bin_scaler = Binarizer(threshold=100)
bin_scaler.fit(dataset[['PRP', 'ERP']])
df_bin = bin_scaler.transform(dataset[['PRP', 'ERP']])

# Disabled toy example (kept for reference):
# Test = [1,2,3,4,5,6,7,8,9,8,7,6,5,4,3,2,1]
# Train = [1,4,7,8,5,2,3,6,9,8,7,4,5,6,3,2,1]
# bin_scaler = Binarizer(threshold=5).fit([Test,Train])
# df_bin = bin_scaler.transform([Test,Train])

In [19]:
# Show the binarised values (column 0 = PRP, column 1 = ERP).
df_bin

array([[1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [0, 0],
       [0, 0],
       [0, 0],
       [1, 1],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [1, 1],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [1, 1],
       [1, 1],
       [0, 0],
       [0, 0],
       [1, 1],
       [1, 1],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [1, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [1, 0],
       [1, 1],
       [1, 1],
       [0,

In [20]:
# Summary statistics of the binarised data (column 0 = PRP, column 1 = ERP).
print('\n********** Binarisation*********\n')

# Mean of each binarised feature. The header previously read
# "Min max Scaling" (copy-paste from the scaling cell); it now names the
# transformation actually being summarised.
print(
    'Moyenne apres la Binarisation :\nPRP={:.2f}, ERP={:.2f}'.format(
        df_bin[:, 0].mean(),
        df_bin[:, 1].mean(),
    )
)
print('\n')

# Range of the binarised PRP column.
print(
    'Valeur minimale et maximale pour la feature PRP apres Binarisation: \nMIN={:.2f}, MAX={:.2f}'.format(
        df_bin[:, 0].min(),
        df_bin[:, 0].max(),
    )
)
print('\n')

# Range of the binarised ERP column.
print(
    'Valeur minimale et maximale pour la feature ERP apres Binarisation : \nMIN={:.2f}, MAX={:.2f}'.format(
        df_bin[:, 1].min(),
        df_bin[:, 1].max(),
    )
)


********** Binarisation*********

Moyenne apres le Min max Scaling :
PRP=0.27, ERP=0.25


Valeur minimale et maximale pour la feature PRP apres Binarisation: 
MIN=0.00, MAX=1.00


Valeur minimale et maximale pour la feature ERP apres Binarisation : 
MIN=0.00, MAX=1.00


# ---------------------- VISUALISATION -----------------

In [21]:
def plot():
    """Overlay the transformed PRP/ERP features on a single scatter plot.

    Reads the module-level arrays ``df_minmax``, ``df_nrm``, ``df_std`` and
    ``df_bin`` and draws, for each of them, column 0 on the x axis (PRP) and
    column 1 on the y axis (ERP) on a new 8x6 inch figure.

    The figure is created and laid out but not shown; the caller is expected
    to call ``plt.show()``. Returns ``None``.
    """
    plt.figure(figsize=(8, 6))

    # The raw (untransformed) data is not drawn. To add it, uncomment:
    # plt.scatter(dataset['PRP'], dataset['ERP'],
    #             color='purple', label='donnees sans transformations',
    #             alpha=0.5, s=100, marker='.')

    # One entry per transformation: (values, colour, legend label).
    # Drawn in this order so later layers are painted on top of earlier ones.
    layers = (
        (df_minmax, 'blue', 'min-max scaled [min=0, max=1]'),
        (df_nrm, 'green', 'Normalisation'),
        (df_std, 'orange', 'Standardisation'),
        (df_bin, 'red', 'Binarisation'),
    )
    for values, colour, label in layers:
        plt.scatter(values[:, 0], values[:, 1],
                    color=colour, label=label, alpha=0.3, s=100, marker='.')

    # The title previously named "MYCT et PRP" and "avant et apres", but the
    # axes show PRP vs ERP and only transformed data is plotted.
    plt.title('Plot des features PRP et ERP apres transformations')
    plt.xlabel('PRP')
    plt.ylabel('ERP')
    plt.legend(loc='upper right')
    plt.grid()

    plt.tight_layout()

In [22]:
# Select the interactive "notebook" matplotlib backend, then build and show the figure.
# NOTE(review): the saved output is only a Javascript object placeholder, and
# `%matplotlib notebook` is reportedly unsupported in JupyterLab / Notebook 7+;
# consider `%matplotlib inline` or `%matplotlib widget` (confirm the target environment).
%matplotlib notebook
plot()
plt.show()

<IPython.core.display.Javascript object>

Logistic Regression
Regression Analysis (polynomial, multivariate regression…)
Support Vector Machines (SVM)
K-Nearest Neighbors (KNN)
K-Means (clustering…)
Principal Component Analysis (PCA)