<a href="https://colab.research.google.com/github/OlegFalomkin/Tutorial_cheat_sheets/blob/main/Simple_classification_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [172]:
import seaborn as sns
import pandas as pd
import numpy as np


In [173]:
penguins = sns.load_dataset('penguins')

In [174]:
penguins.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [175]:
penguins.head(5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [176]:
penguins['island'].unique()

array(['Torgersen', 'Biscoe', 'Dream'], dtype=object)

In [177]:
y = penguins['island']
X = penguins.drop('island', axis=1)

In [178]:
species = y.unique()

In [179]:
species

array(['Torgersen', 'Biscoe', 'Dream'], dtype=object)

In [180]:
spec_d = {}
for i,s in enumerate(species):
  spec_d[s] = i

In [181]:
spec_d

{'Torgersen': 0, 'Biscoe': 1, 'Dream': 2}

In [182]:
y = y.apply(lambda x: spec_d[x])

In [183]:
y

0      0
1      0
2      0
3      0
4      0
      ..
339    1
340    1
341    1
342    1
343    1
Name: island, Length: 344, dtype: int64

In [184]:
from sklearn.model_selection import train_test_split

In [185]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

In [186]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer


In [187]:
lg_model = OneVsRestClassifier(LogisticRegression())


In [188]:
X.columns

Index(['species', 'bill_length_mm', 'bill_depth_mm', 'flipper_length_mm',
       'body_mass_g', 'sex'],
      dtype='object')

In [189]:
cat_atribs = ['species','sex']
num_values =['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm','body_mass_g']

In [190]:
cat_pipeline = make_pipeline(OneHotEncoder(handle_unknown="ignore"))
num_pipeline = make_pipeline(SimpleImputer(strategy='mean'),StandardScaler())
prep = ColumnTransformer([
        ("num", num_pipeline, num_values),
        ("cat", cat_pipeline, cat_atribs),
])

In [193]:
pipe_lg = Pipeline([('prep',prep), ('est',lg_model)])

In [194]:
pipe_lg.fit(X_train, y_train)

In [195]:
preds = pipe_lg.predict(X_test)

In [196]:
preds

array([1, 2, 1, 1, 1, 2, 2, 2, 2, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2,
       1, 1, 2, 1, 2, 1, 1, 2, 2, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2,
       2, 1, 1, 1, 2, 2, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 2, 2, 1, 2, 1,
       2, 2, 2])

In [197]:
np.array(y_test)

array([0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 2, 0, 1, 1, 1, 1, 0, 0, 2, 2, 1, 1, 1, 0,
       2, 1, 1, 1, 2, 2, 2, 1, 2, 0, 2, 1, 2, 1, 2, 0, 0, 2, 2, 1, 2, 1,
       2, 2, 0])

In [198]:
accuracy_score(y_test, preds)

0.6811594202898551

In [199]:
pipe_lg.score(X_test, y_test)

0.6811594202898551