In [376]:
import pandas as pd
import numpy as np
import dalex as dx
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler

In [2]:
apartaments = dx.datasets.load_apartments()

In [25]:
apartaments.keys() # look what the columns are

Index(['m2_price', 'construction_year', 'surface', 'floor', 'no_rooms',
       'district'],
      dtype='object')

In [79]:
apartaments # our dataset

Unnamed: 0,m2_price,construction_year,surface,floor,no_rooms,district
1,5897,1953,25,3,1,Srodmiescie
2,1818,1992,143,9,5,Bielany
3,3643,1937,56,1,2,Praga
4,3517,1995,93,7,3,Ochota
5,3013,1992,144,6,5,Mokotow
...,...,...,...,...,...,...
996,6355,1921,44,2,2,Srodmiescie
997,3422,1921,48,10,2,Bemowo
998,3098,1980,85,3,3,Bemowo
999,4192,1942,36,7,1,Zoliborz


In [30]:
y = np.array(apartaments['district'])
X = apartaments.drop(['district'], axis=1)

In [214]:
'''
the set is quite small so we will use cross_validation, the test will be 20% of the whole set
'''
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)   

In [176]:
model_a = SVC() # our model, without parameters

In [177]:
model_a.fit(X_train, y_train) 
y_hat = model_a.predict(X_test)
accuracy_score(y_test, y_hat)

0.245

Without any corrections, our model has 24% accuracy which is a rather poor result. We will now try to improve it

In [178]:
# we choose parameters
params = {
    'degree': [2,3,4,5],  
    'C': [0.001, 0.01, 0.1, 1, 10, 20, 100, 1000],
    'gamma': ['auto', 'scale']
}

clf_rand = RandomizedSearchCV(model_a, params, cv=3, n_iter=5)

clf_rand.fit(X_train, y_train)

RandomizedSearchCV(cv=3, estimator=SVC(), n_iter=5,
                   param_distributions={'C': [0.001, 0.01, 0.1, 1, 10, 20, 100,
                                              1000],
                                        'degree': [2, 3, 4, 5],
                                        'gamma': ['auto', 'scale']})

In [179]:
pd.DataFrame(clf_rand.cv_results_).keys() # look at what are the columns, and then choose the ones that interest me

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_gamma', 'param_degree', 'param_C', 'params', 'split0_test_score',
       'split1_test_score', 'split2_test_score', 'mean_test_score',
       'std_test_score', 'rank_test_score'],
      dtype='object')

In [180]:
pd.DataFrame(clf_rand.cv_results_)[['params', 'mean_test_score']].sort_values('mean_test_score', ascending=False)

Unnamed: 0,params,mean_test_score
4,"{'gamma': 'scale', 'degree': 5, 'C': 1000}",0.256216
0,"{'gamma': 'scale', 'degree': 5, 'C': 100}",0.248721
3,"{'gamma': 'scale', 'degree': 3, 'C': 100}",0.248721
2,"{'gamma': 'scale', 'degree': 4, 'C': 1}",0.229994
1,"{'gamma': 'scale', 'degree': 2, 'C': 10}",0.227474


We can see that we managed to slightly improve the result for the parameters C: 1000, degree: 5, gamma: 'scale'. From what I know, the regularity factor C for the SVM model, the bigger the bigger the overfitting, which is different as it is in other models. We will now try normalization and standardization. The article shows that standardization should be worse off

In [241]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train=scaler.transform(X_train)
scaler = MinMaxScaler()
scaler.fit(X_test)
X_test=scaler.transform(X_test)

In [256]:
params = {
    'degree': [2,3,4,5,6,7,8,9],
    'C': [0.001, 0.01, 0.1, 1, 10, 20, 100, 1000],
    'gamma': ['auto', 'scale']
}

clf_rand = RandomizedSearchCV(model_a, params, cv=3, n_iter=10)

clf_rand.fit(X_train, y_train)

RandomizedSearchCV(cv=3, estimator=SVC(),
                   param_distributions={'C': [0.001, 0.01, 0.1, 1, 10, 20, 100,
                                              1000],
                                        'degree': [2, 3, 4, 5, 6, 7, 8, 9],
                                        'gamma': ['auto', 'scale']})

In [257]:
pd.DataFrame(clf_rand.cv_results_)[['params', 'mean_test_score']].sort_values('mean_test_score', ascending=False)

Unnamed: 0,params,mean_test_score
1,"{'gamma': 'scale', 'degree': 2, 'C': 10}",0.319943
7,"{'gamma': 'auto', 'degree': 9, 'C': 20}",0.308726
0,"{'gamma': 'scale', 'degree': 8, 'C': 20}",0.306224
2,"{'gamma': 'auto', 'degree': 3, 'C': 0.001}",0.111247
5,"{'gamma': 'auto', 'degree': 5, 'C': 0.1}",0.111247
6,"{'gamma': 'auto', 'degree': 7, 'C': 0.1}",0.111247
8,"{'gamma': 'auto', 'degree': 9, 'C': 0.01}",0.111247
9,"{'gamma': 'auto', 'degree': 2, 'C': 0.1}",0.111247
3,"{'gamma': 'scale', 'degree': 3, 'C': 0.001}",0.107488
4,"{'gamma': 'scale', 'degree': 6, 'C': 0.001}",0.107488


In [262]:
model_a_norm = SVC(degree = 2, C = 10, gamma = 'scale')
cross_val_score(model_a_norm, X_train, y_train, cv=3).mean()
model_a_norm.fit(X_train, y_train)
# we are now looking at the set test
y_hat = model_a_norm.predict(X_test)
accuracy_score(y_hat, y_test)

0.335

I performed the above operation several times. Often the dataframe with parameters showed C = 1000. But intuitively from what I wrote above 1000 gives a lot of overfitting really, although on the test set it is not tragic taking a smaller C it is better. At the moment, C = 10, although a simple check shows that C = 1, with the remaining parameters, it does better, giving 0.355. In one attempt, I managed to have even 0.37, but I did not save these parameters :(

In [268]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)
scaler = StandardScaler()
scaler.fit(X_train)
X_train=scaler.transform(X_train)
scaler = StandardScaler()
scaler.fit(X_test)
X_test=scaler.transform(X_test)

In [269]:
params = {
    'degree': [2,3,4,5,6,7,8,9],
    'C': [0.001, 0.01, 0.1, 1, 10, 20, 100, 1000],
    'gamma': ['auto', 'scale']
}

clf_rand = RandomizedSearchCV(model_a, params, cv=3, n_iter=10)

clf_rand.fit(X_train, y_train)

RandomizedSearchCV(cv=3, estimator=SVC(),
                   param_distributions={'C': [0.001, 0.01, 0.1, 1, 10, 20, 100,
                                              1000],
                                        'degree': [2, 3, 4, 5, 6, 7, 8, 9],
                                        'gamma': ['auto', 'scale']})

In [270]:
pd.DataFrame(clf_rand.cv_results_)[['params', 'mean_test_score']].sort_values('mean_test_score', ascending=False)

Unnamed: 0,params,mean_test_score
7,"{'gamma': 'scale', 'degree': 7, 'C': 20}",0.306206
1,"{'gamma': 'scale', 'degree': 3, 'C': 100}",0.304967
5,"{'gamma': 'auto', 'degree': 5, 'C': 20}",0.304952
0,"{'gamma': 'auto', 'degree': 4, 'C': 100}",0.30247
2,"{'gamma': 'scale', 'degree': 9, 'C': 1}",0.302451
8,"{'gamma': 'scale', 'degree': 6, 'C': 1}",0.302451
3,"{'gamma': 'scale', 'degree': 8, 'C': 0.001}",0.104982
4,"{'gamma': 'auto', 'degree': 3, 'C': 0.001}",0.104982
6,"{'gamma': 'auto', 'degree': 6, 'C': 0.01}",0.104982
9,"{'gamma': 'auto', 'degree': 9, 'C': 0.01}",0.104982


In [271]:
model_a_norm = SVC(degree = 7, C = 20, gamma = 'scale')
cross_val_score(model_a_norm, X_train, y_train, cv=3).mean()
model_a_norm.fit(X_train, y_train)
# we are now looking at the set test
y_hat = model_a_norm.predict(X_test)
accuracy_score(y_hat, y_test)

0.325

In fact, in this case, it is hard for me to say whether standardization is worse than standardization

# Drugi zbiór heart

In [346]:
heart = pd.read_csv('heart.csv')
d = {'Present': 1, 'Absent':0}
heart['famhist'] = heart['famhist'].map(d)

In [347]:
y = np.array(heart['chd'])
X = heart.drop(['chd'], axis=1)

In [348]:
'''
the collection is quite small
# so we will use cross_validation, the test will be 20% of the whole set
'''
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [349]:
model_h = SVC() # our model, without parameters

In [350]:
model_h.fit(X_train, y_train) 
# test
y_hat = model_h.predict(X_test)
accuracy_score(y_test, y_hat)

0.6451612903225806

In [351]:
# we choose parameters
params = {
    'degree': [2,3,4,5],  
    'C': [0.001, 0.01, 0.1, 1, 10, 20, 100, 1000],
    'gamma': ['auto', 'scale']
}

clf_rand = RandomizedSearchCV(model_h, params, cv=3, n_iter=10)

clf_rand.fit(X_train, y_train)

RandomizedSearchCV(cv=3, estimator=SVC(),
                   param_distributions={'C': [0.001, 0.01, 0.1, 1, 10, 20, 100,
                                              1000],
                                        'degree': [2, 3, 4, 5],
                                        'gamma': ['auto', 'scale']})

In [352]:
pd.DataFrame(clf_rand.cv_results_)[['params', 'mean_test_score']].sort_values('mean_test_score', ascending=False)

Unnamed: 0,params,mean_test_score
0,"{'gamma': 'scale', 'degree': 5, 'C': 100}",0.696477
1,"{'gamma': 'scale', 'degree': 3, 'C': 10}",0.680217
4,"{'gamma': 'scale', 'degree': 4, 'C': 1}",0.663957
6,"{'gamma': 'scale', 'degree': 3, 'C': 1}",0.663957
2,"{'gamma': 'auto', 'degree': 5, 'C': 1000}",0.658537
3,"{'gamma': 'scale', 'degree': 2, 'C': 0.001}",0.658537
5,"{'gamma': 'auto', 'degree': 5, 'C': 0.1}",0.658537
7,"{'gamma': 'auto', 'degree': 4, 'C': 0.01}",0.658537
8,"{'gamma': 'auto', 'degree': 3, 'C': 1}",0.658537
9,"{'gamma': 'auto', 'degree': 5, 'C': 0.01}",0.658537


In [353]:
model_h_p = SVC(degree = 5, C = 100, gamma = 'scale')
cross_val_score(model_h_p, X_train, y_train, cv=3).mean()
model_h_p.fit(X_train, y_train)
# we are now looking at the set test
y_hat = model_h_p.predict(X_test)
accuracy_score(y_hat, y_test)

0.6989247311827957

We can see that by tuning the model we have a better result by nearly 5%. Now we are going to normalize and standardize

In [354]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train=scaler.transform(X_train)
scaler = MinMaxScaler()
scaler.fit(X_test)
X_test=scaler.transform(X_test)

In [355]:
# we choose parameters
params = {
    'degree': [2,3,4,5],  
    'C': [0.001, 0.01, 0.1, 1, 10, 20, 100, 1000],
    'gamma': ['auto', 'scale']
}

clf_rand = RandomizedSearchCV(model_h, params, cv=3, n_iter=10)

clf_rand.fit(X_train, y_train)

RandomizedSearchCV(cv=3, estimator=SVC(),
                   param_distributions={'C': [0.001, 0.01, 0.1, 1, 10, 20, 100,
                                              1000],
                                        'degree': [2, 3, 4, 5],
                                        'gamma': ['auto', 'scale']})

In [356]:
pd.DataFrame(clf_rand.cv_results_)[['params', 'mean_test_score']].sort_values('mean_test_score', ascending=False)

Unnamed: 0,params,mean_test_score
5,"{'gamma': 'auto', 'degree': 2, 'C': 20}",0.723577
7,"{'gamma': 'scale', 'degree': 5, 'C': 1}",0.723577
9,"{'gamma': 'auto', 'degree': 5, 'C': 20}",0.723577
2,"{'gamma': 'auto', 'degree': 4, 'C': 10}",0.718157
0,"{'gamma': 'auto', 'degree': 4, 'C': 0.001}",0.658537
1,"{'gamma': 'auto', 'degree': 3, 'C': 0.001}",0.658537
4,"{'gamma': 'scale', 'degree': 3, 'C': 0.1}",0.658537
6,"{'gamma': 'scale', 'degree': 3, 'C': 0.01}",0.658537
8,"{'gamma': 'auto', 'degree': 3, 'C': 0.1}",0.658537
3,"{'gamma': 'scale', 'degree': 4, 'C': 1000}",0.628726


In [369]:
model_h_norm = SVC(degree = 5, C = 20, gamma = 'auto')
cross_val_score(model_h_norm, X_train, y_train, cv=3).mean()
model_h_norm.fit(X_train, y_train)
# we are now looking at the set test
y_hat = model_h_norm.predict(X_test)
accuracy_score(y_hat, y_test)

0.7741935483870968

We see a significant improvement. Now for standardization

In [370]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)
scaler = StandardScaler()
scaler.fit(X_train)
X_train=scaler.transform(X_train)
scaler = StandardScaler()
scaler.fit(X_test)
X_test=scaler.transform(X_test)

In [371]:
# we choose parameters
params = {
    'degree': [2,3,4,5],  
    'C': [0.001, 0.01, 0.1, 1, 10, 20, 100, 1000],
    'gamma': ['auto', 'scale']
}

clf_rand = RandomizedSearchCV(model_h, params, cv=3, n_iter=10)

clf_rand.fit(X_train, y_train)

RandomizedSearchCV(cv=3, estimator=SVC(),
                   param_distributions={'C': [0.001, 0.01, 0.1, 1, 10, 20, 100,
                                              1000],
                                        'degree': [2, 3, 4, 5],
                                        'gamma': ['auto', 'scale']})

In [372]:
pd.DataFrame(clf_rand.cv_results_)[['params', 'mean_test_score']].sort_values('mean_test_score', ascending=False)

Unnamed: 0,params,mean_test_score
9,"{'gamma': 'scale', 'degree': 3, 'C': 10}",0.682927
6,"{'gamma': 'auto', 'degree': 4, 'C': 20}",0.672087
0,"{'gamma': 'scale', 'degree': 5, 'C': 100}",0.663957
8,"{'gamma': 'scale', 'degree': 4, 'C': 100}",0.663957
2,"{'gamma': 'auto', 'degree': 3, 'C': 0.01}",0.658537
3,"{'gamma': 'auto', 'degree': 2, 'C': 0.001}",0.658537
4,"{'gamma': 'auto', 'degree': 4, 'C': 100}",0.658537
5,"{'gamma': 'auto', 'degree': 3, 'C': 100}",0.658537
7,"{'gamma': 'scale', 'degree': 3, 'C': 0.001}",0.658537
1,"{'gamma': 'scale', 'degree': 3, 'C': 1000}",0.644986


In [374]:
model_h_norm = SVC(degree = 3, C = 10, gamma = 'scale')
cross_val_score(model_h_norm, X_train, y_train, cv=3).mean()
model_h_norm.fit(X_train, y_train)
# we are now looking at the set test
y_hat = model_h_norm.predict(X_test)
accuracy_score(y_hat, y_test)

0.6881720430107527

Here, indeed, the standardizer is doing worse. The conclusion from all of this is that standardization actually helps, but you need to know which form to use