In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df=sns.load_dataset('penguins')

In [3]:
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [4]:
df.species.unique()

array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [6]:
df.shape

(344, 7)

In [7]:
df.isnull().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

In [8]:
df.sex.unique()

array(['Male', 'Female', nan], dtype=object)

In [9]:
X=df.drop('species',axis=1)
y=df['species']

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

In [12]:
cat_cols=[col for col in df.columns if df[col].dtype=='object']
num_cols=[col for col in df.columns if df[col].dtype!='object']

In [13]:
cat_cols

['species', 'island', 'sex']

In [14]:
del cat_cols[0]

In [15]:
from sklearn.impute import SimpleImputer

In [16]:
from sklearn.preprocessing import OneHotEncoder,StandardScaler

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [18]:
num_pipeline=Pipeline(steps=[('impute',SimpleImputer(strategy='median')),('scale',StandardScaler())])
cat_pipeline=Pipeline(steps=[('impute',SimpleImputer(strategy='most_frequent')),('encode',OneHotEncoder())])

In [19]:
preprocessor=ColumnTransformer([('num_pipeline',num_pipeline,num_cols),('cat_pipeline',cat_pipeline,cat_cols)])

In [20]:
X_train=preprocessor.fit_transform(X_train)
X_test=preprocessor.transform(X_test)

In [21]:
from sklearn.tree import DecisionTreeClassifier

In [22]:
from sklearn.model_selection import GridSearchCV

In [23]:
params={
    'criterion':['gini','entropy','log_loss'],
    'max_depth':[1,2,3,4,6,8],
    'splitter':['best','random'],
    'max_features':['sqrt','log2']
}

In [24]:
grid=GridSearchCV(estimator=DecisionTreeClassifier(),param_grid=params,cv=5,scoring='accuracy',n_jobs=-1)

In [25]:
grid.fit(X_train,y_train)

In [26]:
grid.best_params_

{'criterion': 'log_loss',
 'max_depth': 6,
 'max_features': 'log2',
 'splitter': 'best'}

In [27]:
model=DecisionTreeClassifier(criterion='log_loss',max_depth=6,max_features='log2',splitter='best')

In [28]:
model.fit(X_train,y_train)

In [29]:
y_pred=model.predict(X_test)

In [30]:
y_pred

array(['Adelie', 'Chinstrap', 'Adelie', 'Gentoo', 'Gentoo', 'Adelie',
       'Adelie', 'Gentoo', 'Gentoo', 'Gentoo', 'Adelie', 'Adelie',
       'Adelie', 'Chinstrap', 'Adelie', 'Gentoo', 'Adelie', 'Gentoo',
       'Gentoo', 'Adelie', 'Gentoo', 'Gentoo', 'Adelie', 'Adelie',
       'Gentoo', 'Adelie', 'Chinstrap', 'Chinstrap', 'Adelie', 'Gentoo',
       'Chinstrap', 'Chinstrap', 'Adelie', 'Adelie', 'Gentoo',
       'Chinstrap', 'Adelie', 'Gentoo', 'Gentoo', 'Adelie', 'Gentoo',
       'Adelie', 'Adelie', 'Adelie', 'Gentoo', 'Adelie', 'Adelie',
       'Gentoo', 'Chinstrap', 'Chinstrap', 'Adelie', 'Adelie', 'Adelie',
       'Gentoo', 'Chinstrap', 'Gentoo', 'Adelie', 'Gentoo', 'Chinstrap',
       'Adelie', 'Gentoo', 'Adelie', 'Adelie', 'Chinstrap', 'Adelie',
       'Adelie', 'Gentoo', 'Chinstrap', 'Adelie'], dtype=object)

In [31]:
from sklearn.metrics import accuracy_score

In [32]:
accuracy_score(y_test,y_pred)

0.9565217391304348

In [33]:
import pickle

In [34]:
pickle.dump(model,open('model.pkl','wb'))

In [35]:
pickle.dump(preprocessor,open('preprocessor.pkl','wb'))