In [73]:
import bdsm
import re, seaborn as sns
import numpy as np
import pandas as pd
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import sklearn.metrics as metrics

In [74]:
df_penguin = bdsm.penguins().clean().to_numeric()
df_penguin

Unnamed: 0,CulmenLength,CulmenDepth,FlipperLength,BodyMass,Species_cat,Island_cat,Sex_cat
0,39.100000,18.700000,181.000000,3750.000000,0,2,1
1,39.500000,17.400000,186.000000,3800.000000,0,2,0
2,40.300000,18.000000,195.000000,3250.000000,0,2,0
3,38.791391,18.346358,189.953642,3700.662252,0,2,2
4,36.700000,19.300000,193.000000,3450.000000,0,2,0
...,...,...,...,...,...,...,...
340,43.500000,18.100000,202.000000,3400.000000,1,1,0
341,49.600000,18.200000,193.000000,3775.000000,1,1,1
342,50.800000,19.000000,210.000000,4100.000000,1,1,1
343,50.800000,19.000000,210.000000,4100.000000,1,1,1


In [75]:
list(df_penguin.columns)

['CulmenLength',
 'CulmenDepth',
 'FlipperLength',
 'BodyMass',
 'Species_cat',
 'Island_cat',
 'Sex_cat']

In [76]:
bdsm.penguins().clean()['Species'].value_counts()

Adelie Penguin (Pygoscelis adeliae)          152
Gentoo penguin (Pygoscelis papua)            124
Chinstrap penguin (Pygoscelis antarctica)     69
Name: Species, dtype: int64

In [77]:
x = df_penguin.drop(columns="Species_cat")
y = df_penguin['Species_cat']
x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.3, random_state=12345)

In [78]:
knn_clf = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')
log_clf = LogisticRegression(max_iter=10_000)
rf_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
svm_clf = SVC()

In [79]:
models = VotingClassifier(
    estimators=[
        ('knn', knn_clf),
        ('log', log_clf),
        ('rf', rf_clf),
        ('svm', svm_clf)
    ],
    voting='hard',

)
models.fit(x_train, y_train)

for clf in (knn_clf, log_clf, rf_clf, svm_clf, models):
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_valid)
    print(clf.__class__.__name__, accuracy_score(y_valid, y_pred))

KNeighborsClassifier 0.7307692307692307
LogisticRegression 0.9711538461538461
RandomForestClassifier 0.9519230769230769
SVC 0.7211538461538461
VotingClassifier 0.7788461538461539


In [80]:
species_categories = dict(enumerate(bdsm.penguins()['Species'].astype("category").cat.categories))
predicitons = models.predict(x_valid)

print(f"classification_report: \n{metrics.classification_report(y_valid, predicitons)}")
print(f"accuracy_score: {metrics.accuracy_score(y_valid, predicitons)}")
print(f"cohen_kappa_score: {metrics.cohen_kappa_score(y_valid, predicitons)}")

classification_report: 
              precision    recall  f1-score   support

           0       0.65      1.00      0.79        43
           1       1.00      0.18      0.31        22
           2       1.00      0.87      0.93        39

    accuracy                           0.78       104
   macro avg       0.88      0.68      0.68       104
weighted avg       0.86      0.78      0.74       104

accuracy_score: 0.7788461538461539
cohen_kappa_score: 0.635588056063376


In [81]:
# Detailed prediction data print
# Just extras
df_predicted = pd.Series(predicitons).map(species_categories).to_frame(name='y_hat_cat')
df_predicted["y_hat"] = pd.Series(predicitons).reset_index(drop=True)
df_predicted["y"] = pd.Series(y_valid).reset_index(drop=True)
df_predicted["correct"] = df_predicted["y_hat"] == df_predicted["y"]
print(f"{df_predicted} \n")


                               y_hat_cat  y_hat  y  correct
0    Adelie Penguin (Pygoscelis adeliae)      0  0     True
1      Gentoo penguin (Pygoscelis papua)      2  2     True
2    Adelie Penguin (Pygoscelis adeliae)      0  0     True
3      Gentoo penguin (Pygoscelis papua)      2  2     True
4    Adelie Penguin (Pygoscelis adeliae)      0  0     True
..                                   ...    ... ..      ...
99   Adelie Penguin (Pygoscelis adeliae)      0  0     True
100  Adelie Penguin (Pygoscelis adeliae)      0  1    False
101  Adelie Penguin (Pygoscelis adeliae)      0  1    False
102  Adelie Penguin (Pygoscelis adeliae)      0  0     True
103  Adelie Penguin (Pygoscelis adeliae)      0  0     True

[104 rows x 4 columns] 

