# Adam Trentowski - 162602 - Machine Learning
## lab_04/02
## Library Imports

In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold, ShuffleSplit
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
import warnings
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

# Install a global 'ignore' filter: every warning raised after this point is hidden.
# NOTE(review): this is a blanket filter, so any diagnostics the libraries would
# emit through the warnings module are hidden too — consider narrowing it.
warnings.filterwarnings('ignore')

## Data loading

In [3]:
# Load the dataset CSV (path is relative to the working directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer='VLagun_Phys_Years3.csv')

In [4]:
# Preview the first five rows (5 is the default for head()).
data.head(n=5)

Unnamed: 0,PSU,O2,temp.,SS,DOC,TPOC,Windspeedinsitu,Depth,Years
0,3.757624,9.46,18.3,52.0,7.5,8.565,3.5,3.3,0
1,3.504707,9.89,19.1,50.0,7.86,8.52,0.0,3.6,0
2,3.757624,9.66,18.1,59.0,8.172,8.4915,1.0,3.4,0
3,3.107266,10.36,19.5,46.0,7.848,8.832,0.0,2.9,0
4,2.619498,11.56,19.0,42.0,7.536,9.24,0.0,3.0,0


## Splitting features and target

In [5]:
# Positional split: every column but the last is a feature,
# the last column is the target.
X, y = data.iloc[:, :-1], data.iloc[:, -1]

In [6]:
# Sanity check: feature matrix and target must have the same number of rows.
(X.shape, y.shape)

((120, 8), (120,))

## Splitting data to train and test sets

In [7]:
# Hold out 30% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.3,
)

In [8]:
# Shapes of the training partition.
(X_train.shape, y_train.shape)

((84, 8), (84,))

In [9]:
# Shapes of the held-out test partition.
(X_test.shape, y_test.shape)

((36, 8), (36,))

In [10]:
# Baseline: linear-kernel SVM trained on the training split only.
clf = svm.SVC(C=1, kernel='linear')
clf.fit(X_train, y_train)

# Mean accuracy on the held-out test split (displayed as the cell output).
clf.score(X_test, y_test)

0.9444444444444444

## Linear
## Computing cross-validated metrics

In [12]:
# Linear-kernel SVM evaluated on the full data set with cv=5 (five folds).
clf = svm.SVC(
    C=1,
    kernel='linear',
    random_state=42,
)
scores = cross_val_score(
    clf,
    X,
    y,
    cv=5,
)
scores

array([0.83333333, 0.91666667, 0.91666667, 1.        , 0.95833333])

In [13]:
# Summarise the per-fold scores as mean and (population) standard deviation.
print("%0.4f accuracy with a standard deviation of %0.4f" % (np.mean(scores), np.std(scores)))

0.9250 accuracy with a standard deviation of 0.0553


## Scoring = f1

In [15]:
# Same estimator and folds, but each fold is scored with macro-averaged F1.
scores = cross_val_score(
    clf,
    X,
    y,
    scoring='f1_macro',
    cv=5,
)
scores

array([0.83216783, 0.91608392, 0.91666667, 1.        , 0.95826087])

In [16]:
# `scores` at this point come from scoring='f1_macro', so the summary is
# labelled as macro F1 rather than accuracy.
print("%0.4f macro F1 with a standard deviation of %0.4f" % (scores.mean(), scores.std()))

0.9246 accuracy with a standard deviation of 0.0557


## Shuffle

In [17]:
# Number of rows in the feature matrix.
# NOTE(review): no cell visible here reads n_samples — confirm it is still needed.
n_samples = len(X)

In [18]:
# Five independent random splits, each holding out 30% of the samples (seeded).
cv = ShuffleSplit(
    n_splits=5,
    random_state=0,
    test_size=0.3,
)
scores = cross_val_score(
    clf,
    X,
    y,
    cv=cv,
)
scores

array([0.94444444, 0.94444444, 0.91666667, 0.97222222, 0.97222222])

In [19]:
# Mean and standard deviation of the shuffle-split scores.
print("%0.4f accuracy with a standard deviation of %0.4f" % (np.mean(scores), np.std(scores)))

0.9500 accuracy with a standard deviation of 0.0208


## Stratified

In [21]:
# Five stratified folds, shuffled with a fixed seed before splitting.
cv = StratifiedKFold(
    n_splits=5,
    random_state=0,
    shuffle=True,
)
scores = cross_val_score(
    clf,
    X,
    y,
    cv=cv,
)
scores

array([0.91666667, 0.91666667, 0.95833333, 0.91666667, 0.95833333])

In [22]:
# Mean and standard deviation of the stratified-fold scores.
print("%0.4f accuracy with a standard deviation of %0.4f" % (np.mean(scores), np.std(scores)))

0.9333 accuracy with a standard deviation of 0.0204


## Polynomial
## Computing cross-validated metrics

In [23]:
# Polynomial-kernel SVM evaluated on the full data set with cv=5 (five folds).
clf = svm.SVC(
    C=1,
    kernel='poly',
    random_state=42,
)
scores = cross_val_score(
    clf,
    X,
    y,
    cv=5,
)
scores

array([1.        , 0.79166667, 0.79166667, 0.66666667, 0.625     ])

In [24]:
# Summarise the per-fold scores as mean and (population) standard deviation.
print("%0.4f accuracy with a standard deviation of %0.4f" % (np.mean(scores), np.std(scores)))

0.7750 accuracy with a standard deviation of 0.1307


## Scoring = f1

In [25]:
# Same estimator and folds, but each fold is scored with macro-averaged F1.
scores = cross_val_score(
    clf,
    X,
    y,
    scoring='f1_macro',
    cv=5,
)
scores

array([1.        , 0.77229602, 0.78221416, 0.59663866, 0.56363636])

In [26]:
# `scores` at this point come from scoring='f1_macro', so the summary is
# labelled as macro F1 rather than accuracy.
print("%0.4f macro F1 with a standard deviation of %0.4f" % (scores.mean(), scores.std()))

0.7430 accuracy with a standard deviation of 0.1562


## Shuffle

In [27]:
# Row count of the feature matrix (not read again within this cell).
n_samples = len(X)

# Five independent random splits, each holding out 30% of the samples (seeded).
cv = ShuffleSplit(
    n_splits=5,
    random_state=0,
    test_size=0.3,
)
scores = cross_val_score(
    clf,
    X,
    y,
    cv=cv,
)
scores

array([0.75      , 0.75      , 0.63888889, 0.77777778, 0.83333333])

In [28]:
# Mean and standard deviation of the shuffle-split scores.
print("%0.4f accuracy with a standard deviation of %0.4f" % (np.mean(scores), np.std(scores)))

0.7500 accuracy with a standard deviation of 0.0633


## Stratified

In [29]:
# Five stratified folds, shuffled with a fixed seed before splitting.
cv = StratifiedKFold(
    n_splits=5,
    random_state=0,
    shuffle=True,
)
scores = cross_val_score(
    clf,
    X,
    y,
    cv=cv,
)
scores

array([0.79166667, 0.70833333, 0.91666667, 0.79166667, 0.83333333])

In [30]:
# Mean and standard deviation of the stratified-fold scores.
print("%0.4f accuracy with a standard deviation of %0.4f" % (np.mean(scores), np.std(scores)))

0.8083 accuracy with a standard deviation of 0.0677


## Radial
## Computing cross-validated metrics

In [32]:
# RBF-kernel SVM evaluated on the full data set with cv=5 (five folds).
clf = svm.SVC(
    C=1,
    kernel='rbf',
    random_state=42,
)
scores = cross_val_score(
    clf,
    X,
    y,
    cv=5,
)
scores

array([1.        , 0.79166667, 0.75      , 0.625     , 0.625     ])

In [33]:
# Summarise the per-fold scores as mean and (population) standard deviation.
print("%0.4f accuracy with a standard deviation of %0.4f" % (np.mean(scores), np.std(scores)))

0.7583 accuracy with a standard deviation of 0.1379


## Scoring = f1

In [34]:
# Same estimator and folds, but each fold is scored with macro-averaged F1.
scores = cross_val_score(
    clf,
    X,
    y,
    scoring='f1_macro',
    cv=5,
)
scores

array([1.        , 0.77229602, 0.74285714, 0.60798548, 0.56363636])

In [35]:
# `scores` at this point come from scoring='f1_macro', so the summary is
# labelled as macro F1 rather than accuracy.
print("%0.4f macro F1 with a standard deviation of %0.4f" % (scores.mean(), scores.std()))

0.7374 accuracy with a standard deviation of 0.1531


## Shuffle

In [36]:
# Row count of the feature matrix (not read again within this cell).
n_samples = len(X)

# Five independent random splits, each holding out 30% of the samples (seeded).
cv = ShuffleSplit(
    n_splits=5,
    random_state=0,
    test_size=0.3,
)
scores = cross_val_score(
    clf,
    X,
    y,
    cv=cv,
)
scores

array([0.69444444, 0.77777778, 0.58333333, 0.77777778, 0.77777778])

In [37]:
# Mean and standard deviation of the shuffle-split scores.
print("%0.4f accuracy with a standard deviation of %0.4f" % (np.mean(scores), np.std(scores)))

0.7222 accuracy with a standard deviation of 0.0766


## Stratified

In [38]:
# Five stratified folds, shuffled with a fixed seed before splitting.
cv = StratifiedKFold(
    n_splits=5,
    random_state=0,
    shuffle=True,
)
scores = cross_val_score(
    clf,
    X,
    y,
    cv=cv,
)
scores

array([0.75      , 0.625     , 0.875     , 0.79166667, 0.79166667])

In [39]:
# Mean and standard deviation of the stratified-fold scores.
print("%0.4f accuracy with a standard deviation of %0.4f" % (np.mean(scores), np.std(scores)))

0.7667 accuracy with a standard deviation of 0.0816


## Sigmoid
## Computing cross-validated metrics

In [40]:
# Sigmoid-kernel SVM evaluated on the full data set with cv=5 (five folds).
clf = svm.SVC(
    C=1,
    kernel='sigmoid',
    random_state=42,
)
scores = cross_val_score(
    clf,
    X,
    y,
    cv=5,
)
scores

array([0.        , 0.20833333, 0.29166667, 0.375     , 0.5       ])

In [41]:
# Summarise the per-fold scores as mean and (population) standard deviation.
print("%0.4f accuracy with a standard deviation of %0.4f" % (np.mean(scores), np.std(scores)))

0.2750 accuracy with a standard deviation of 0.1679


## Scoring = f1

In [42]:
# Same estimator and folds, but each fold is scored with macro-averaged F1.
scores = cross_val_score(
    clf,
    X,
    y,
    scoring='f1_macro',
    cv=5,
)
scores

array([0.        , 0.17241379, 0.29043478, 0.36507937, 0.33333333])

In [43]:
# `scores` at this point come from scoring='f1_macro', so the summary is
# labelled as macro F1 rather than accuracy.
print("%0.4f macro F1 with a standard deviation of %0.4f" % (scores.mean(), scores.std()))

0.2323 accuracy with a standard deviation of 0.1332


## Shuffle

In [44]:
# Row count of the feature matrix (not read again within this cell).
n_samples = len(X)

# Five independent random splits, each holding out 30% of the samples (seeded).
cv = ShuffleSplit(
    n_splits=5,
    random_state=0,
    test_size=0.3,
)
scores = cross_val_score(
    clf,
    X,
    y,
    cv=cv,
)
scores

array([0.30555556, 0.33333333, 0.44444444, 0.30555556, 0.36111111])

In [45]:
# Mean and standard deviation of the shuffle-split scores.
print("%0.4f accuracy with a standard deviation of %0.4f" % (np.mean(scores), np.std(scores)))

0.3500 accuracy with a standard deviation of 0.0515


## Stratified

In [46]:
# Five stratified folds, shuffled with a fixed seed before splitting.
cv = StratifiedKFold(
    n_splits=5,
    random_state=0,
    shuffle=True,
)
scores = cross_val_score(
    clf,
    X,
    y,
    cv=cv,
)
scores

array([0.25      , 0.375     , 0.125     , 0.33333333, 0.33333333])

In [47]:
# Mean and standard deviation of the stratified-fold scores.
print("%0.4f accuracy with a standard deviation of %0.4f" % (np.mean(scores), np.std(scores)))

0.2833 accuracy with a standard deviation of 0.0890


### Results:
Linear:
cv5 - average acc = 0.925
scoring=f1 - average macro F1 = 0.9246
shuffle - average acc = 0.9500
stratified - average acc = 0.9333

Highest = shuffle



Polynomial:
cv5 - average acc = 0.7750
scoring=f1 - average macro F1 = 0.7430
shuffle - average acc = 0.7500
stratified - average acc = 0.8083

Highest = stratified



Radial:
cv5 - average acc = 0.7583
scoring=f1 - average macro F1 = 0.7374
shuffle - average acc = 0.7222
stratified - average acc = 0.7667

Highest = stratified



Sigmoid:
cv5 - average acc = 0.2750
scoring=f1 - average macro F1 = 0.2323
shuffle - average acc = 0.3500
stratified - average acc = 0.2833

Highest = shuffle