In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# Load seaborn's bundled "penguins" example dataset into a DataFrame.
# NOTE(review): load_dataset presumably fetches the file over the network on
# first use and caches it afterwards -- confirm before running offline.
df = sns.load_dataset("penguins")

# Preview the raw frame (pandas truncates the display to head/tail rows).
df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


In [13]:
# Count missing values per column.
# NOTE(review): the output recorded under this cell came from execution
# count 13, i.e. after the imputation cell had already run, which is why the
# four measurement columns show 0. On a fresh top-to-bottom run they each
# report 2 missing values (344 rows, 342 non-null per df.info()).
df.isna().sum()

species               0
island                0
bill_length_mm        0
bill_depth_mm         0
flipper_length_mm     0
body_mass_g           0
sex                  11
dtype: int64

In [3]:
# Summarise column dtypes and non-null counts to see where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [4]:
# Numeric measurement columns that contain missing values.
impute = [
    'bill_length_mm',
    'bill_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
]

# SimpleImputer with default settings fills each gap with the column mean.
# NOTE(review): the imputer is fitted on the whole frame, before the
# train/test split performed later in the notebook, so held-out rows
# contribute to the fill values. Rows 3 and 339 have all four measurements
# missing and therefore become entirely mean-valued rows.
si = SimpleImputer()
si.fit(df[impute])
df[impute] = si.transform(df[impute])

# Show the frame after imputation.
df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.10000,18.70000,181.000000,3750.000000,Male
1,Adelie,Torgersen,39.50000,17.40000,186.000000,3800.000000,Female
2,Adelie,Torgersen,40.30000,18.00000,195.000000,3250.000000,Female
3,Adelie,Torgersen,43.92193,17.15117,200.915205,4201.754386,
4,Adelie,Torgersen,36.70000,19.30000,193.000000,3450.000000,Female
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,43.92193,17.15117,200.915205,4201.754386,
340,Gentoo,Biscoe,46.80000,14.30000,215.000000,4850.000000,Female
341,Gentoo,Biscoe,50.40000,15.70000,222.000000,5750.000000,Male
342,Gentoo,Biscoe,45.20000,14.80000,212.000000,5200.000000,Female


In [5]:
# Remove the `sex` column, which still contains missing values and is not
# used as a feature. DataFrame.drop returns a new frame, so the result must
# be assigned back; otherwise the column silently stays in `df`.
# errors="ignore" keeps the cell safe to re-run after the column is gone.
df = df.drop(columns=["sex"], errors="ignore")

# Show the frame without the dropped column.
df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
0,Adelie,Torgersen,39.10000,18.70000,181.000000,3750.000000
1,Adelie,Torgersen,39.50000,17.40000,186.000000,3800.000000
2,Adelie,Torgersen,40.30000,18.00000,195.000000,3250.000000
3,Adelie,Torgersen,43.92193,17.15117,200.915205,4201.754386
4,Adelie,Torgersen,36.70000,19.30000,193.000000,3450.000000
...,...,...,...,...,...,...
339,Gentoo,Biscoe,43.92193,17.15117,200.915205,4201.754386
340,Gentoo,Biscoe,46.80000,14.30000,215.000000,4850.000000
341,Gentoo,Biscoe,50.40000,15.70000,222.000000,5750.000000
342,Gentoo,Biscoe,45.20000,14.80000,212.000000,5200.000000


In [6]:
# Encode the species names as numeric codes for use as the classification
# target; the codes come back as floats (0.0, 1.0, 2.0 in the reports below).
# The encoder is given a descriptive name rather than `ord`, which would
# shadow Python's built-in ord() for the rest of the kernel session.
species_encoder = OrdinalEncoder()
df[['species']] = species_encoder.fit_transform(df[['species']])

In [7]:
# Features: the four numeric body measurements. Target: the species column.
feature_columns = [
    'bill_length_mm',
    'bill_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
]
X = df[feature_columns]
y = df['species']

# Hold out 25% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)


In [8]:
# Pipeline: standardise the features, then fit a logistic-regression
# classifier with default settings. Scaling lives inside the pipeline, so it
# is learned from the training rows only.
model = make_pipeline(
    StandardScaler(),
    LogisticRegression(),
)
model.fit(xtrain, ytrain)

In [9]:
# Predict species codes for the held-out rows.
y_pred = model.predict(xtest)

# Confusion matrix: rows are the true classes, columns the predicted ones.
print(confusion_matrix(y_true=ytest, y_pred=y_pred))

[[42  0  0]
 [ 1 13  0]
 [ 0  0 30]]


In [12]:
# Per-class precision, recall and F1 on the held-out rows, plus averages.
report = classification_report(ytest, y_pred)
print(report)

              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99        42
         1.0       1.00      0.93      0.96        14
         2.0       1.00      1.00      1.00        30

    accuracy                           0.99        86
   macro avg       0.99      0.98      0.98        86
weighted avg       0.99      0.99      0.99        86

