# Libraries

In [403]:
from sklearn import datasets as data # Sample datasets
from sklearn import svm # SVM Classifier
from sklearn.naive_bayes import GaussianNB # Naive Bayes Classifier
from sklearn.neural_network import MLPClassifier # MLP Classifier
from sklearn.neighbors import KNeighborsClassifier # K-Neighbors Classifier
from sklearn.neighbors import NearestCentroid # Nearest Centroid Classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import cross_val_score # Cross Validation
import numpy as np
import math
import pandas as pd
from sklearn.preprocessing import Normalizer

# Custom functions

In [404]:
def test_QDA(X,y):
    """Cross-validate a default Quadratic Discriminant Analysis classifier.

    Returns the array of 5 per-fold scores produced by ``cross_val_score``
    (estimator's default scorer) for features ``X`` and labels ``y``.
    """
    return cross_val_score(QuadraticDiscriminantAnalysis(), X, y, cv=5)

def test_Forest(X, y, random_state=0):
    """Cross-validate a 10-tree random forest on ``(X, y)``.

    Parameters
    ----------
    X, y : feature matrix and labels, passed straight to ``cross_val_score``.
    random_state : seed forwarded to ``RandomForestClassifier``. Defaults to 0
        so that repeated runs yield the same scores and the comparison between
        standardization methods is not blurred by run-to-run noise. Pass
        ``None`` to get the previous unseeded behaviour.

    Returns
    -------
    Array with the 5 per-fold scores.
    """
    clf = RandomForestClassifier(n_estimators=10, random_state=random_state)
    return cross_val_score(clf, X, y, cv=5)

def test_Tree(X, y, random_state=0):
    """Cross-validate a depth-5 decision tree on ``(X, y)``.

    Parameters
    ----------
    X, y : feature matrix and labels, passed straight to ``cross_val_score``.
    random_state : seed forwarded to ``DecisionTreeClassifier``. Defaults to 0
        so equally good splits are always resolved the same way and scores are
        repeatable. Pass ``None`` to get the previous unseeded behaviour.

    Returns
    -------
    Array with the 5 per-fold scores.
    """
    clf = DecisionTreeClassifier(max_depth=5, random_state=random_state)
    return cross_val_score(clf, X, y, cv=5)

def test_KNC(X,y):
    """Cross-validate a 3-nearest-neighbours classifier on ``(X, y)``.

    Returns the array of 5 per-fold scores.
    """
    # No fit on the full data here: cross_val_score clones the estimator and
    # fits the clone on each training fold, so a prior fit is only wasted time.
    clf = KNeighborsClassifier(3)
    return cross_val_score(clf, X, y, cv=5)

def test_NCC(X,y):
    """Cross-validate a default nearest-centroid classifier on ``(X, y)``.

    Returns the array of 5 per-fold scores.
    """
    # No fit on the full data here: cross_val_score clones the estimator and
    # fits the clone on each training fold, so a prior fit is only wasted time.
    clf = NearestCentroid()
    return cross_val_score(clf, X, y, cv=5)

def test_SVC(X,y):
    """Cross-validate a linear-kernel SVC (C=0.1) on ``(X, y)``.

    Returns the array of 5 per-fold scores.
    """
    # No fit on the full data here: cross_val_score clones the estimator and
    # fits the clone on each training fold, so a prior fit is only wasted time.
    clf = svm.SVC(kernel='linear', C=0.1)
    return cross_val_score(clf, X, y, cv=5)
    
def test_SVC_RBF(X,y):
    """Cross-validate an RBF-kernel SVC (C=1) on ``(X, y)``.

    Returns the array of 5 per-fold scores.
    """
    # No fit on the full data here: cross_val_score clones the estimator and
    # fits the clone on each training fold, so a prior fit is only wasted time.
    clf = svm.SVC(kernel='rbf', C=1)
    return cross_val_score(clf, X, y, cv=5)
    
def test_Bayes(X,y):
    """Cross-validate a Gaussian naive Bayes classifier on ``(X, y)``.

    Returns the array of 5 per-fold scores.
    """
    # No fit on the full data here: cross_val_score clones the estimator and
    # fits the clone on each training fold, so a prior fit is only wasted time.
    clf = GaussianNB()
    return cross_val_score(clf, X, y, cv=5)
    
def test_MLP(X, y, random_state=0):
    """Cross-validate an MLP classifier (alpha=1, max_iter=1000) on ``(X, y)``.

    Parameters
    ----------
    X, y : feature matrix and labels, passed straight to ``cross_val_score``.
    random_state : seed forwarded to ``MLPClassifier``. Defaults to 0 so the
        weight initialisation, and therefore the scores, are repeatable. Pass
        ``None`` to get the previous unseeded behaviour.

    Returns
    -------
    Array with the 5 per-fold scores.
    """
    # No fit on the full data here: cross_val_score clones the estimator and
    # fits the clone on each training fold. The extra fit was a sixth full
    # training run whose result was thrown away.
    clf = MLPClassifier(alpha=1, max_iter=1000, random_state=random_state)
    return cross_val_score(clf, X, y, cv=5)


def standardize_method_1(x, maxValue=5):
    """Encode each value of a 1-D array as ``[sign, base, exponent]``.

    For every non-zero element the integer bases ``2 .. maxValue`` are tried in
    ascending order; the first base whose exponent ``log_base(|value|)``
    satisfies ``abs(exponent) <= maxValue`` is kept, so that
    ``value == sign * base ** exponent``.

    Parameters
    ----------
    x : 1-D numpy array of numbers.
    maxValue : largest base tried and largest absolute exponent accepted.

    Returns
    -------
    Float array of shape ``(len(x), 3)`` with columns sign, base, exponent.
    Zero values, and values for which no base qualifies, give ``[0, 0, 0]``.
    """
    results = np.zeros([x.shape[0], 3])
    for i, value in enumerate(x):
        if value == 0:
            continue  # row already holds the [0, 0, 0] encoding of zero
        # Split sign and magnitude once, outside the base loop, so the sign is
        # not lost when more than one base has to be tried.
        coeff, magnitude = np.sign(value), abs(value)
        for base in range(2, maxValue + 1):
            exponent = math.log(magnitude, base)
            if abs(exponent) <= maxValue:
                # Write only row i; a slice assignment would also overwrite
                # every following row.
                results[i] = [coeff, base, exponent]
                break
    return results

def standardize_method_2(x, width=5):
    """Encode each value of a 1-D array as ``width`` decimal digits.

    Each value is multiplied by ``10 ** width`` and truncated to an integer;
    the first ``width`` characters of that integer's decimal string are kept
    and left-padded with zeros to exactly ``width`` digits.

    Parameters
    ----------
    x : 1-D numpy array of numbers.
    width : number of digit columns produced per value.

    Returns
    -------
    Float array of shape ``(len(x), width)``, one digit per column.

    Raises
    ------
    ValueError
        If a value is NaN, or if its scaled integer is negative (the leading
        ``'-'`` is not a digit).
    """
    results = np.zeros([x.shape[0], width])
    scale = 10 ** width
    for i, value in enumerate(x):
        digits = str(int(value * scale))[:width].zfill(width)
        # Write only row i; a slice assignment would rewrite all later rows on
        # every iteration.
        results[i] = [int(c) for c in digits]
    return results

def standardize_method_3(x):
    """Encode each value of a 1-D array as ``[sign * n_roots, fraction * 10]``.

    The absolute value is replaced by its cube root until it lies in
    ``[0, 1]``; ``n_roots`` counts how many cube roots were taken. The second
    column is the fractional part of the final magnitude multiplied by 10.

    NOTE(review): for magnitudes above 1 the loop can only stop once the
    repeated cube roots have reached 1.0, so the second column is presumably
    always 0 for such inputs -- confirm this is the intended encoding.

    Parameters
    ----------
    x : 1-D numpy array of finite numbers.

    Returns
    -------
    Float array of shape ``(len(x), 2)``.

    Raises
    ------
    ValueError
        If a value is NaN or infinite; the cube-root loop would never
        terminate for such inputs.
    """
    results = np.zeros([x.shape[0], 2])
    for i, value in enumerate(x):
        if not np.isfinite(value):
            raise ValueError("standardize_method_3 requires finite values, got %r" % (value,))
        sign = np.sign(value)
        magnitude = abs(value)
        counter = 0
        # magnitude is non-negative, so "inside [0, 1]" reduces to "<= 1".
        while magnitude > 1:
            magnitude = np.cbrt(magnitude)
            counter += 1
        # Write only row i; a slice assignment would rewrite all later rows on
        # every iteration.
        results[i] = [sign * counter, np.modf(magnitude)[0] * 10]
    return results

def standardize_method_4(x):
    """Encode each value of a 1-D array as ``[sign, log2(|value|)]``.

    Parameters
    ----------
    x : 1-D numpy array of numbers.

    Returns
    -------
    Float array of shape ``(len(x), 2)``; zero values are encoded as
    ``[0, 0]``.
    """
    results = np.zeros([x.shape[0], 2])
    for i, value in enumerate(x):
        if value != 0:
            # Write only row i; a slice assignment would rewrite all later
            # rows on every iteration. Rows for zero keep their initial [0, 0].
            results[i] = [np.sign(value), math.log(abs(value), 2)]
    return results

def standardize_method_5(x, sample_size=5):
    """Scale ``x`` by one plus the maximum of its first ``sample_size`` values.

    Parameters
    ----------
    x : 1-D numpy array of numbers; must not be empty.
    sample_size : how many leading elements are inspected to find the scale.
        The default of 5 reproduces the original fixed window.

    Returns
    -------
    Array of the same shape as ``x``, i.e. ``x / (max(x[:sample_size]) + 1)``.

    Note: the scale comes from the leading elements only, so later values may
    exceed 1. The divisor is zero when that maximum equals -1.
    """
    return x / (max(x[0:sample_size]) + 1)

# Mock datasets

In [405]:
X_ = []
y_ = []
# Boston house-prices dataset, turned into a binary task: target above 20 or not.
# NOTE(review): load_boston was removed in scikit-learn 1.2, so this cell needs
# an older scikit-learn release -- confirm the pinned version.
boston = data.load_boston()  # load once and reuse for features and target
X_.append(boston["data"])
y_.append(boston["target"] > 20)
# Iris dataset, turned into a binary task: class 1 versus the rest.
iris = data.load_iris()  # load once and reuse for features and target
X_.append(iris["data"])
y_.append(iris["target"] == 1)
# Classifier test functions evaluated on every dataset.
functions_list = [test_SVC, test_SVC_RBF, test_Bayes, test_KNC, test_NCC, test_Tree, test_Forest]
# One row per (classifier, dataset, standardization method) is appended to this table.
df_res = pd.DataFrame(columns=["Function","Dataframe","Standard?","Average","Score1","Score2","Score3","Score4","Score5"])

# Test without standardization

In [406]:
%%time
for i, (X, y) in enumerate(zip(X_, y_)):
    for func in functions_list:
        scores = func(X,y)
        df_res.loc[len(df_res.index)] = np.concatenate([[func.__name__, i, "No", np.mean(scores)],scores])
np.shape(X)

Wall time: 617 ms


(150, 4)

# Test with standardize_method_1

In [407]:
%%time
maxValue = 10
for i, (X, y) in enumerate(zip(X_, y_)):
    X2 = np.zeros([X.shape[0], X.shape[1]*3])
    for j in range(X.shape[1]):
        X2[:,3*j:3*j+3] = standardize_method_1(X[:,j], maxValue)
    #X2 = Normalizer().fit(X2).transform(X2)
    for func in functions_list:        
        scores = func(X2,y)
        df_res.loc[len(df_res.index)] = np.concatenate([[func.__name__, i, "1", np.mean(scores)],scores])
np.shape(X2)

Wall time: 360 ms


(150, 12)

# Test with standardize_method_2

In [408]:
%%time
width = 5
for i, (X, y) in enumerate(zip(X_, y_)):
    X3 = np.zeros([X.shape[0], X.shape[1]*width])
    for j in range(X.shape[1]):        
        X3[:,width*j:width*j+width] = standardize_method_2(X[:,j], width)
    #X3 = Normalizer().fit(X3).transform(X3)
    for func in functions_list:
        scores = func(X3,y)
        df_res.loc[len(df_res.index)] = np.concatenate([[func.__name__, i, "2", np.mean(scores)],scores])
print(np.shape(X3))

(150, 20)
Wall time: 746 ms


# Test with standardize_method_3

In [409]:
%%time
for i, (X, y) in enumerate(zip(X_, y_)):
    X4 = np.zeros([X.shape[0], X.shape[1]*2])
    for j in range(X.shape[1]):
        X4[:,2*j:2*j+2] = standardize_method_3(X[:,j])
    #X4 = Normalizer().fit(X4).transform(X4)
    for func in functions_list:        
        scores = func(X4,y)
        df_res.loc[len(df_res.index)] = np.concatenate([[func.__name__, i, "3", np.mean(scores)],scores])
np.shape(X4)

Wall time: 680 ms


(150, 8)

# Test with standardize_method_4

In [410]:
%%time
for i, (X, y) in enumerate(zip(X_, y_)):
    X5 = np.zeros([X.shape[0], X.shape[1]*2])
    for j in range(X.shape[1]):
        X5[:,2*j:2*j+2] = standardize_method_4(X[:,j])
    #X5 = Normalizer().fit(X5).transform(X5)
    for func in functions_list:        
        scores = func(X5,y)
        df_res.loc[len(df_res.index)] = np.concatenate([[func.__name__, i, "4", np.mean(scores)],scores])
np.shape(X5)

Wall time: 339 ms


(150, 8)

# Test with standardize_method_5

In [411]:
%%time
for i, (X, y) in enumerate(zip(X_, y_)):
    X6 = np.zeros([X.shape[0], X.shape[1]])
    for j in range(X.shape[1]):
        X6[:,j] = standardize_method_5(X[:,j])
    X6 = Normalizer().fit(X6).transform(X6)
    for func in functions_list:        
        scores = func(X6,y)
        df_res.loc[len(df_res.index)] = np.concatenate([[func.__name__, i, "5", np.mean(scores)],scores])
np.shape(X6)

Wall time: 293 ms


(150, 4)

# Results

In [412]:
# Display the accumulated results table: one row per classifier test function,
# dataset index and standardization label.
df_res

Unnamed: 0,Function,Dataframe,Standard?,Average,Score1,Score2,Score3,Score4,Score5
0,test_SVC,0,No,0.8141525917297612,0.8529411764705882,0.7128712871287128,0.7821782178217822,0.9801980198019802,0.7425742574257426
1,test_SVC_RBF,0,No,0.7315084449621433,0.5882352941176471,0.6237623762376238,0.8118811881188119,1.0,0.6336633663366337
2,test_Bayes,0,No,0.7907202484954378,0.6862745098039216,0.801980198019802,0.8217821782178217,1.0,0.6435643564356436
3,test_KNC,0,No,0.6860609590370802,0.5392156862745098,0.693069306930693,0.6831683168316832,0.9207920792079208,0.594059405940594
4,test_NCC,0,No,0.7255678509027373,0.5882352941176471,0.6138613861386139,0.8118811881188119,1.0,0.6138613861386139
...,...,...,...,...,...,...,...,...,...
79,test_Bayes,1,5,0.82,1.0,1.0,0.9,0.6,0.6
80,test_KNC,1,5,0.9733333333333334,1.0,1.0,0.9333333333333333,0.9666666666666667,0.9666666666666667
81,test_NCC,1,5,0.6666666666666667,1.0,1.0,0.6666666666666666,0.3333333333333333,0.3333333333333333
82,test_Tree,1,5,0.9200000000000002,0.9666666666666667,1.0,0.9333333333333333,0.9333333333333333,0.7666666666666667


# Conclusions

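With the sole exception of the MLP classifier, the standardization methods sped up training time.

The cells below compute, for each standardization method, the fraction of classifier/dataset pairs whose average cross-validation score is at least as high as the score obtained without standardization.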

In [413]:
# Split the results into one score table per standardization label. The tables
# share the same row order (classifier within dataset), so they can be
# compared row by row after the index reset.
score_columns = ["Average","Score1","Score2","Score3","Score4","Score5"]

def _scores_for(label):
    """Return the score columns of all runs tagged with ``label`` as floats."""
    selected = df_res.loc[df_res["Standard?"] == label, score_columns]
    # The rows were appended through np.concatenate on a list that mixes
    # strings and numbers, which stores the scores as text. Cast to float so
    # that later comparisons are numeric instead of lexicographic.
    return selected.astype(float).reset_index(drop=True)

df1 = _scores_for("No")  # baseline, no standardization
df2 = _scores_for("1")
df3 = _scores_for("2")
df4 = _scores_for("3")
df5 = _scores_for("4")
df6 = _scores_for("5")

In [414]:
# Share of runs where method 1 (df2) has an average score >= the baseline (df1).
df2["Average"].ge(df1["Average"]).sum() / len(df1)

0.6428571428571429

In [415]:
# Share of runs where method 2 (df3) has an average score >= the baseline (df1).
df3["Average"].ge(df1["Average"]).sum() / len(df1)

0.2857142857142857

In [416]:
# Share of runs where method 3 (df4) has an average score >= the baseline (df1).
df4["Average"].ge(df1["Average"]).sum() / len(df1)

0.14285714285714285

In [417]:
# Share of runs where method 4 (df5) has an average score >= the baseline (df1).
df5["Average"].ge(df1["Average"]).sum() / len(df1)

0.6428571428571429

In [418]:
# Share of runs where method 5 (df6) has an average score >= the baseline (df1).
df6["Average"].ge(df1["Average"]).sum() / len(df1)

0.5714285714285714