# Adam Trentowski - 162602 - Machine Learning
## lab_04/01
## Library Imports

In [41]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold, ShuffleSplit
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import warnings
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

warnings.filterwarnings('ignore')

## Data loading

In [2]:
data = pd.read_csv('VLagun_Phys_Years3.csv')

In [3]:
data.head()

Unnamed: 0,PSU,O2,temp.,SS,DOC,TPOC,Windspeedinsitu,Depth,Years
0,3.757624,9.46,18.3,52.0,7.5,8.565,3.5,3.3,0
1,3.504707,9.89,19.1,50.0,7.86,8.52,0.0,3.6,0
2,3.757624,9.66,18.1,59.0,8.172,8.4915,1.0,3.4,0
3,3.107266,10.36,19.5,46.0,7.848,8.832,0.0,2.9,0
4,2.619498,11.56,19.0,42.0,7.536,9.24,0.0,3.0,0


## Splitting features and target

In [5]:
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

In [6]:
X.shape, y.shape

((120, 8), (120,))

## Splitting data to train and test sets

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [9]:
features, targets = X, y

In [10]:
train_features, test_features, train_targets, test_targets = train_test_split(
    features, targets,
    train_size=0.7,
    test_size=0.3,
    random_state=23,
    stratify=targets
)

## KNN classifier training

In [11]:
classifier = KNeighborsClassifier()
classifier.fit(train_features, train_targets)

## KNN Predictions and accuracy

In [12]:
prediction_targets = classifier.predict(test_features)

In [15]:
print(f'Accuracy: {np.sum(prediction_targets == test_targets) / float(len(test_targets)):.2f}')

Accuracy: 0.83


## Cross validation

In [17]:
classifier = KNeighborsClassifier()

In [18]:
scores = cross_val_score(classifier, features, targets, cv=3)

In [22]:
print(f'Cross validation scores: {scores}')
print(f'Mean score: {np.mean(scores):.2f}')

Cross validation scores: [0.75  0.725 0.6  ]
Mean score: 0.69


## KFold

In [25]:
classifier = KNeighborsClassifier()
cv = KFold(n_splits=3, shuffle=True)

In [26]:
scores = cross_val_score(classifier, features, targets, cv=cv)

In [27]:
print(f'Cross validation scores: {scores}')
print(f'Mean score: {np.mean(scores):.2f}')

Cross validation scores: [0.8 0.8 0.9]
Mean score: 0.83


## Stratified

In [29]:
classifier = KNeighborsClassifier()
cv = StratifiedKFold(n_splits=3, shuffle=True)

In [30]:
scores = cross_val_score(classifier, features, targets, cv=cv)

In [31]:
print(f'Cross validation scores: {scores}')
print(f'Mean score: {np.mean(scores):.2f}')

Cross validation scores: [0.85  0.875 0.775]
Mean score: 0.83


## Shuffle

In [32]:
classifier = KNeighborsClassifier()
cv = ShuffleSplit(n_splits=3, test_size=0.3)

In [33]:
scores = cross_val_score(classifier, features, targets, cv=cv)

In [34]:
print(f'Cross validation scores: {scores}')
print(f'Mean score: {np.mean(scores):.2f}')

Cross validation scores: [0.83333333 0.77777778 0.83333333]
Mean score: 0.81


## Automatic CV comparison of algorithms

In [36]:
models = []
models.append(('LogisticRegression', LogisticRegression()))
models.append(('KNeighborsClassifier', KNeighborsClassifier()))
models.append(('SVC', SVC()))
models.append(('DecisionTreeClassifier', DecisionTreeClassifier()))

### KFold

In [38]:
cv = KFold(n_splits=5, shuffle=True, random_state=23)

for name, model in models:
    score = cross_val_score(model, features, targets, cv=cv)
    print("Model:{0}, Score: mean={1:.5f}, var={2:.5f}".format(
        name,
        score.mean(),
        score.var()
    ))

Model:LogisticRegression, Score: mean=0.95833, var=0.00069
Model:KNeighborsClassifier, Score: mean=0.85000, var=0.00250
Model:SVC, Score: mean=0.75833, var=0.00375
Model:DecisionTreeClassifier, Score: mean=0.86667, var=0.00306


### Stratified

In [39]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=23)

for name, model in models:
    score = cross_val_score(model, features, targets, cv=cv)
    print("Model:{0}, Score: mean={1:.5f}, var={2:.5f}".format(
        name,
        score.mean(),
        score.var()
    ))

Model:LogisticRegression, Score: mean=0.95000, var=0.00167
Model:KNeighborsClassifier, Score: mean=0.81667, var=0.00111
Model:SVC, Score: mean=0.76667, var=0.00458
Model:DecisionTreeClassifier, Score: mean=0.86667, var=0.00236


### Shuffle

In [40]:
cv = ShuffleSplit(n_splits=5, random_state=23)

for name, model in models:
    score = cross_val_score(model, features, targets, cv=cv)
    print("Model:{0}, Score: mean={1:.5f}, var={2:.5f}".format(
        name,
        score.mean(),
        score.var()
    ))

Model:LogisticRegression, Score: mean=0.95000, var=0.00444
Model:KNeighborsClassifier, Score: mean=0.86667, var=0.00444
Model:SVC, Score: mean=0.75000, var=0.00833
Model:DecisionTreeClassifier, Score: mean=0.88333, var=0.00722


1. Highest CV Acc. Score (average) achieved - KFold/Stratified
2. Highest CV Acc. Score (average) for models achieved:
Logistic Regression:
-KFold: 0.95833
-Stratified: 0.95000
-Shuffle: 0.95000

Highest score: KFold

K-Nearest Neighbors:
-KFold: 0.85000
-Stratified: 0.81667
-Shuffle: 0.86667

Highest score: Shuffle

Support Vector Machine:
-KFold: 0.75833
-Stratified: 0.76667
-Shuffle: 0.75000

Highest score: Stratified

Decision Tree:
-KFold: 0.86667
Stratified: 0.86667
-Shuffle: 0.88333

Highest score: Shuffle