In [10]:
import bdsm
import re, seaborn as sns
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

In [11]:
df_penguin = bdsm.penguins().clean().to_numeric()
df_penguin

Unnamed: 0,CulmenLength,CulmenDepth,FlipperLength,BodyMass,Species_cat,Island_cat,Sex_cat
0,39.100000,18.700000,181.000000,3750.000000,0,2,2
1,39.500000,17.400000,186.000000,3800.000000,0,2,1
2,40.300000,18.000000,195.000000,3250.000000,0,2,1
3,38.791391,18.346358,189.953642,3700.662252,0,2,3
4,36.700000,19.300000,193.000000,3450.000000,0,2,1
...,...,...,...,...,...,...,...
340,43.500000,18.100000,202.000000,3400.000000,1,1,1
341,49.600000,18.200000,193.000000,3775.000000,1,1,2
342,50.800000,19.000000,210.000000,4100.000000,1,1,2
343,50.800000,19.000000,210.000000,4100.000000,1,1,2


In [12]:
list(df_penguin.columns)

['CulmenLength',
 'CulmenDepth',
 'FlipperLength',
 'BodyMass',
 'Species_cat',
 'Island_cat',
 'Sex_cat']

In [21]:
bdsm.penguins().clean()['Sex'].value_counts()

Male       169
Female     165
Unknown     10
.            1
Name: Sex, dtype: int64

In [13]:
x = df_penguin[['CulmenLength', 'CulmenDepth', 'FlipperLength', 'BodyMass']]
y = df_penguin['Sex_cat']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=147)

In [14]:
model = LogisticRegression(max_iter=10_000)
model.fit(x_train, y_train);

In [15]:
species_categories = dict(enumerate(bdsm.penguins()['Species'].astype("category").cat.categories))
predicitons = model.predict(x_test)

print(f"classification_report: \n{metrics.classification_report(y_test, predicitons)}")
print(f"accuracy_score: {metrics.accuracy_score(y_test, predicitons)}")
print(f"cohen_kappa_score: {metrics.cohen_kappa_score(y_test, predicitons)}")

classification_report: 
              precision    recall  f1-score   support

           1       0.85      0.85      0.85        55
           2       0.83      0.92      0.88        53
           3       0.00      0.00      0.00         6

    accuracy                           0.84       114
   macro avg       0.56      0.59      0.58       114
weighted avg       0.80      0.84      0.82       114

accuracy_score: 0.8421052631578947
cohen_kappa_score: 0.7001753360607832


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
# Detailed prediction data print
# Just extras
df_predicted = pd.Series(predicitons).map(species_categories).to_frame(name='y_hat_cat')
df_predicted["y_hat"] = pd.Series(predicitons).reset_index(drop=True)
df_predicted["y"] = pd.Series(y_test).reset_index(drop=True)
df_predicted["correct"] = df_predicted["y_hat"] == df_predicted["y"]
print(f"{df_predicted} \n")


                                     y_hat_cat  y_hat  y  correct
0    Chinstrap penguin (Pygoscelis antarctica)      1  1     True
1    Chinstrap penguin (Pygoscelis antarctica)      1  1     True
2    Chinstrap penguin (Pygoscelis antarctica)      1  1     True
3    Chinstrap penguin (Pygoscelis antarctica)      1  1     True
4    Chinstrap penguin (Pygoscelis antarctica)      1  1     True
..                                         ...    ... ..      ...
109          Gentoo penguin (Pygoscelis papua)      2  2     True
110          Gentoo penguin (Pygoscelis papua)      2  2     True
111          Gentoo penguin (Pygoscelis papua)      2  2     True
112  Chinstrap penguin (Pygoscelis antarctica)      1  1     True
113          Gentoo penguin (Pygoscelis papua)      2  1    False

[114 rows x 4 columns] 

