# Automatyczne porównanie skuteczności metod cross validations
## Mateusz Baran
### 155620

In [109]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm
import warnings
warnings.filterwarnings('ignore')

In [110]:
data = pd.read_csv('data/VLagun_Phys_Years3.csv')

In [111]:
data.head()

Unnamed: 0,PSU,O2,temp.,SS,DOC,TPOC,Windspeedinsitu,Depth,Years
0,3.757624,9.46,18.3,52.0,7.5,8.565,3.5,3.3,0
1,3.504707,9.89,19.1,50.0,7.86,8.52,0.0,3.6,0
2,3.757624,9.66,18.1,59.0,8.172,8.4915,1.0,3.4,0
3,3.107266,10.36,19.5,46.0,7.848,8.832,0.0,2.9,0
4,2.619498,11.56,19.0,42.0,7.536,9.24,0.0,3.0,0


In [112]:
X = data.iloc[:,0:8]
y = data.iloc[:, 8]

In [113]:
X.shape, y.shape

((120, 8), (120,))

In [114]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [115]:
from sklearn.neighbors import KNeighborsClassifier

In [116]:
features, targets = X, y

In [117]:
train_features, test_features, train_targets, test_targets = train_test_split(
    features, 
    targets,
    train_size=0.7,
    test_size=0.3,
    random_state=23,
    stratify=targets,
)

In [118]:
# use KNeighborsClassifier for classification
classifier = KNeighborsClassifier()
classifier.fit(train_features, train_targets)
prediction_targets = classifier.predict(test_features)
print('Accuracy:', end=' ')
print(np.sum(prediction_targets == test_targets) / float(len(test_targets)))


Accuracy: 0.8333333333333334


In [119]:
# CROSS VALIDATION

In [120]:
from sklearn.model_selection import cross_val_score

In [121]:
classifier = KNeighborsClassifier()
scores = cross_val_score(classifier, features, targets, cv=3)
print('Cross validation scores:' ,scores)
print('Mean score:', np.mean(scores))

Cross validation scores: [0.75  0.725 0.6  ]
Mean score: 0.6916666666666668


In [122]:
from sklearn.model_selection import KFold, StratifiedKFold, ShuffleSplit

In [123]:
# K Fold
cv = KFold(n_splits=3, shuffle=True)

In [124]:
scores = cross_val_score(classifier, features, targets, cv=cv)
print('Cross validation scores:' ,scores)
print('Mean score:', np.mean(scores))

Cross validation scores: [0.825 0.9   0.775]
Mean score: 0.8333333333333334


In [125]:
# Stratified Fold
cv = StratifiedKFold(n_splits=3, shuffle=True)
scores = cross_val_score(classifier, features, targets, cv=cv)
print('Cross validation scores:' ,scores)
print('Mean score:', np.mean(scores))

Cross validation scores: [0.85  0.75  0.875]
Mean score: 0.8250000000000001


In [126]:
# Shuffle Split
cv = ShuffleSplit(n_splits=3, test_size=0.3)
scores = cross_val_score(classifier, features, targets, cv=cv)
print('Cross validation scores:' ,scores)
print('Mean score:', np.mean(scores))

Cross validation scores: [0.88888889 0.86111111 0.86111111]
Mean score: 0.8703703703703703


### Najlepszy średni wynik Cross validation wyszedł dla Stratified Fold = 0.8250 

## Automatic CV comparison of algorithms

In [127]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [128]:
models = []
models.append(('LogisticRegression', LogisticRegression()))
models.append(('KNeighborsClassifier', KNeighborsClassifier()))
models.append(('SVC', SVC()))
models.append(('DecisionTreeClassifier', DecisionTreeClassifier()))

In [129]:
cv = KFold(n_splits=5, shuffle=True, random_state=23)
for name, model in models:
    score = cross_val_score(model, features, targets, cv=cv)
    print('Model: {0}, Score: mean={1:0.5f}, var={2:0.5f}'.format(name, score.mean(), score.var()))

Model: LogisticRegression, Score: mean=0.95833, var=0.00069
Model: KNeighborsClassifier, Score: mean=0.85000, var=0.00250
Model: SVC, Score: mean=0.75833, var=0.00375
Model: DecisionTreeClassifier, Score: mean=0.85000, var=0.00250


### Średni najlepszy wynik jest dla modelu: LogisticRegresion chociaż jest już bardzo blisko przetrenowaniu (100), dlatego weług mnie, najlepszy model to: DecisionTreeClassifier oraz KNeighborsClassifier.

In [130]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=23)
for name, model in models:
    score = cross_val_score(model, features, targets, cv=cv)
    print('Model: {0}, Score: mean={1:0.5f}, var={2:0.5f}'.format(name, score.mean(), score.var()))

Model: LogisticRegression, Score: mean=0.95000, var=0.00167
Model: KNeighborsClassifier, Score: mean=0.81667, var=0.00111
Model: SVC, Score: mean=0.76667, var=0.00458
Model: DecisionTreeClassifier, Score: mean=0.85833, var=0.00111


### Średni najlepszy wynik jest dla modelu: LogisticRegresion chociaż jest już bardzo blisko przetrenowaniu (100), dlatego według mnie najlepszy wynik uzyskał model: DecisionTreeClassifier 

In [131]:
cv = ShuffleSplit(n_splits=5, random_state=23)
for name, model in models:
    score = cross_val_score(model, features, targets, cv=cv)
    print('Model: {0}, Score: mean={1:0.5f}, var={2:0.5f}'.format(name, score.mean(), score.var()))

Model: LogisticRegression, Score: mean=0.95000, var=0.00444
Model: KNeighborsClassifier, Score: mean=0.86667, var=0.00444
Model: SVC, Score: mean=0.75000, var=0.00833
Model: DecisionTreeClassifier, Score: mean=0.90000, var=0.00389


### Średni najlepszy wynik jest dla modelu: LogisticRegresion oraz DecisionTreeClassifier chociaż jest już bardzo blisko przetrenowaniu (100), dlatego według mnie najlepszy wynik uzyskał model: KNeighborsClassifier 

# Cross validation of SVM kernels porównanie

In [132]:
clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
clf.score(X_test, y_test)

0.9444444444444444

In [133]:
clf = svm.SVC(kernel='poly', C=1).fit(X_train, y_train)
clf.score(X_test, y_test)

0.75

In [134]:
clf = svm.SVC(kernel='rbf', C=1).fit(X_train, y_train)
clf.score(X_test, y_test)

0.6944444444444444

In [135]:
clf = svm.SVC(kernel='sigmoid', C=1).fit(X_train, y_train)
clf.score(X_test, y_test)

0.3055555555555556

### Najelpszy wynik uzyskał kernel='linear' ALE jest on bardzo blisko przetrenowaniu więc według mnie najlepszy wynik uzyskał kernel='poly'

In [136]:
# computing cross-validation metrics
# cv = 5

In [137]:
# K-Fold
for kernel in ['linear', 'poly', 'rbf', 'sigmoid']:
    clf = svm.SVC(kernel=kernel, C=1, random_state=42)
    scores = cross_val_score(clf, X, y, cv=5)
    print(f'----- {kernel} ------')
    print(scores)
    print('%0.4f accuracy with a standard deviation of %0.4f' % (score.mean(), scores.std()))

----- linear ------
[0.83333333 0.91666667 0.91666667 1.         0.95833333]
0.9000 accuracy with a standard deviation of 0.0553
----- poly ------
[1.         0.79166667 0.79166667 0.66666667 0.625     ]
0.9000 accuracy with a standard deviation of 0.1307
----- rbf ------
[1.         0.79166667 0.75       0.625      0.625     ]
0.9000 accuracy with a standard deviation of 0.1379
----- sigmoid ------
[0.         0.20833333 0.29166667 0.375      0.5       ]
0.9000 accuracy with a standard deviation of 0.1679


### Najlepszy wynik osiągnęło kernel=linear

In [138]:
# K-Fold (f1_macro)

for kernel in ['linear', 'poly', 'rbf', 'sigmoid']:
    clf = svm.SVC(kernel=kernel, C=1, random_state=42)
    scores = cross_val_score(clf, X, y, cv=5, scoring='f1_macro')
    print(f'----- {kernel} ------')
    print(scores)
    print('%0.4f accuracy with a standard deviation of %0.4f' % (score.mean(), scores.std()))

----- linear ------
[0.83216783 0.91608392 0.91666667 1.         0.95826087]
0.9000 accuracy with a standard deviation of 0.0557
----- poly ------
[1.         0.77229602 0.78221416 0.59663866 0.56363636]
0.9000 accuracy with a standard deviation of 0.1562
----- rbf ------
[1.         0.77229602 0.74285714 0.60798548 0.56363636]
0.9000 accuracy with a standard deviation of 0.1531
----- sigmoid ------
[0.         0.17241379 0.29043478 0.36507937 0.33333333]
0.9000 accuracy with a standard deviation of 0.1332


### Najlepszy wynik osiągnęło kernel=linear

In [139]:
from sklearn.model_selection import ShuffleSplit
for kernel in ['linear', 'poly', 'rbf', 'sigmoid']:
    clf = svm.SVC(kernel=kernel, C=1, random_state=42)
    cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
    scores = cross_val_score(clf, X, y, cv=cv)
    print(f'----- {kernel} ------')
    print(scores)
    print('%0.4f accuracy with a standard deviation of %0.4f' % (score.mean(), scores.std()))

----- linear ------
[0.94444444 0.94444444 0.91666667 0.97222222 0.97222222]
0.9000 accuracy with a standard deviation of 0.0208
----- poly ------
[0.75       0.75       0.63888889 0.77777778 0.83333333]
0.9000 accuracy with a standard deviation of 0.0633
----- rbf ------
[0.69444444 0.77777778 0.58333333 0.77777778 0.77777778]
0.9000 accuracy with a standard deviation of 0.0766
----- sigmoid ------
[0.30555556 0.33333333 0.44444444 0.30555556 0.36111111]
0.9000 accuracy with a standard deviation of 0.0515


### Najlepszy wynik osiągnęło kernel=linear

In [140]:
from sklearn.model_selection import StratifiedKFold

for kernel in ['linear', 'poly', 'rbf', 'sigmoid']:
    clf = svm.SVC(kernel=kernel, C=1, random_state=42)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    scores = cross_val_score(clf, X, y, cv=cv)
    print(f'----- {kernel} ------')
    print(scores)
    print('%0.4f accuracy with a standard deviation of %0.4f' % (score.mean(), scores.std()))

----- linear ------
[0.91666667 0.91666667 0.95833333 0.91666667 0.95833333]
0.9000 accuracy with a standard deviation of 0.0204
----- poly ------
[0.79166667 0.70833333 0.91666667 0.79166667 0.83333333]
0.9000 accuracy with a standard deviation of 0.0677
----- rbf ------
[0.75       0.625      0.875      0.79166667 0.79166667]
0.9000 accuracy with a standard deviation of 0.0816
----- sigmoid ------
[0.25       0.375      0.125      0.33333333 0.33333333]
0.9000 accuracy with a standard deviation of 0.0890


### Najlepszy wynik osiągnęło kernel=linear