# Training the Penguin Model

This notebook trains a decision-tree classifier that predicts a penguin's species from its physical measurements, then exports the fitted model as a pickle file.

## Imports

In [1]:
import pickle  # Save models

from numpy import nan  # cat's footstep
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import ConfusionMatrixDisplay, classification_report

try:
    # plot_confusion_matrix was removed in scikit-learn 1.2; keep the name
    # importable on older releases without breaking the import cell on newer ones.
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    plot_confusion_matrix = None

## Data sourcing

In [2]:
# Read the penguin measurements CSV (path is relative to this notebook's
# working directory) into a DataFrame.
pngn = pd.read_csv(filepath_or_buffer='../data/penguins_size.csv')

FileNotFoundError: [Errno 2] No such file or directory: '../data/penguins_size.csv'

In [None]:
# Peek at two rows of the freshly loaded data. The fixed seed makes the
# displayed rows identical on every Restart & Run All.
pngn.sample(2, random_state=18)

## Cleaning

In [None]:
# Simplify sex column: treat a '.' entry as missing so that dropna() below
# removes those rows too. The guard lets this cell be re-run after 'sex'
# has already been dropped.

if 'sex' in pngn.columns:
    pngn = pngn.assign(sex=pngn['sex'].replace('.', nan))

# Remove NA rows, then drop the categorical columns.
# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone, and a plain drop() would raise KeyError.

pngn = pngn.dropna().drop(columns=['sex', 'island'], errors='ignore')

In [None]:
# Peek at three rows of the current frame. The fixed seed makes the
# displayed rows identical on every Restart & Run All.
pngn.sample(3, random_state=18)

## Data splitting

In [None]:
# Separate the feature columns from the label column.
# X: every column except 'species'.
# y: a one-column DataFrame holding 'species'.

X = pngn.drop(columns='species')
y = pngn.loc[:, ['species']]

In [None]:
# Hold out 30% of the rows for testing. Stratifying on y keeps the species
# proportions the same in both splits; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=18,
    stratify=y,
)

## Modelling

In [None]:
# Decision tree limited to four levels, with a fixed seed for repeatable fits.
model = DecisionTreeClassifier(random_state=18, max_depth=4)

In [None]:
# Fit the tree on the training split.
model.fit(X=X_train, y=y_train)

## Evaluation

In [None]:
# Mean accuracy on the training split.
model.score(X=X_train, y=y_train)

In [None]:
# Mean accuracy on the held-out test split.
model.score(X=X_test, y=y_test)

In [None]:
# Predicted species for every row of the test split.
y_pred = model.predict(X=X_test)

In [None]:
# Per-class precision, recall and F1 on the test split. The report is a
# plain-text table, so it is printed rather than displayed.
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Plot the confusion matrix for the test split.
# ConfusionMatrixDisplay.from_estimator (scikit-learn >= 1.0) supersedes
# plot_confusion_matrix, which was deprecated in 1.0 and removed in 1.2.
if hasattr(ConfusionMatrixDisplay, 'from_estimator'):
    ConfusionMatrixDisplay.from_estimator(model, X_test, y_test)
else:
    # Fallback for scikit-learn releases older than 1.0.
    plot_confusion_matrix(model, X_test, y_test)

## Exporting the model

In [None]:
# Serialise the fitted classifier to disk with pickle.
# Only unpickle this file from a trusted source: loading a pickle can run
# arbitrary code.

with open('../models/penguin_classifier', mode='wb') as model_file:
    pickle.dump(model, model_file)