In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides library deprecation notices (e.g. renamed or
# removed keyword arguments), so API drift will only show up as hard errors.
warnings.simplefilter("ignore")

In [2]:
# Read the penguin table from CSV and preview its first five rows.
df = pd.read_csv(filepath_or_buffer = "penguins_analyzed.csv")
df.head(n = 5)

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,43.92193,17.15117,200.915205,4201.754386,MALE
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [3]:
# (rows, columns) of the loaded frame, as a quick sanity check of the load.
df.shape

(344, 7)

In [4]:
# Target is the species column; the feature matrix is every other column.
y = df["species"]
X = df.drop(columns = ["species"])

# ***Libraries***

In [9]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_absolute_error

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

In [10]:
# Encode the species names as integer class codes.
# The encoder is fitted on df["species"] rather than on `y`, so the cell is
# idempotent: re-running it never refits on already-encoded integers, and
# label.classes_ keeps the species names needed by label.inverse_transform.
label = LabelEncoder()
y = label.fit_transform(df["species"])

# ***Train_Test_Split***

In [11]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 0,
)

# ***LogisticRegression***

In [12]:
# Preprocessing by column position in X: 0 (island) and 5 (sex) are one-hot
# encoded with the first level dropped; 1-4 (the numeric measurements) are
# standardised. sparse_threshold = 0 forces a dense result from the
# ColumnTransformer, so the encoder no longer needs the `sparse` keyword
# (deprecated in scikit-learn 1.2, removed in 1.4).
step1 = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(drop = "first"), [0, 5]),
    ("scaler", StandardScaler(), [1, 2, 3, 4])
], remainder = "passthrough", sparse_threshold = 0)

model = LogisticRegression()

# The step called "scaler" holds the whole ColumnTransformer (encoding + scaling).
pipe = Pipeline([
    ("scaler", step1),
    ("model", model)
])

pipe.fit(X_train, y_train)
pred_test = pipe.predict(X_test)
print("Training Accuracy :", pipe.score(X_train, y_train))
print("Testing Accuracy :", pipe.score(X_test, y_test))
# cross_val_score refits a clone of the full pipeline on each of the 5 folds,
# so the preprocessing is learned inside every fold.
print("CV Score :", cross_val_score(pipe, X_train, y_train, cv = 5).mean())
# NOTE(review): MAE is taken over the integer class codes, which are nominal
# labels; its size depends on the arbitrary code order, so only "0.0 means no
# test errors" is meaningful here.
print("MAE :", mean_absolute_error(y_test, pred_test))

Training Accuracy : 0.9963636363636363
Testing Accuracy : 1.0
CV Score : 0.9927272727272728
MAE : 0.0


In [13]:
# Rows are the true class codes, columns the predicted class codes.
confusion_matrix(y_true = y_test, y_pred = pred_test)

array([[34,  0,  0],
       [ 0, 10,  0],
       [ 0,  0, 25]], dtype=int64)

In [14]:
# Per-class precision / recall / F1 for the test-set predictions in `pred_test`.
report = classification_report(y_true = y_test, y_pred = pred_test)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        34
           1       1.00      1.00      1.00        10
           2       1.00      1.00      1.00        25

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69



# ***SVM - SVC***

In [15]:
# Preprocessing by column position in X: 0 (island) and 5 (sex) are one-hot
# encoded with the first level dropped; 1-4 (the numeric measurements) are
# standardised. sparse_threshold = 0 forces a dense result from the
# ColumnTransformer, so the encoder no longer needs the `sparse` keyword
# (deprecated in scikit-learn 1.2, removed in 1.4).
step1 = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(drop = "first"), [0, 5]),
    ("scaler", StandardScaler(), [1, 2, 3, 4])
], remainder = "passthrough", sparse_threshold = 0)

# Linear-kernel support vector classifier with regularisation strength C = 0.1.
model = SVC(C = 0.1, kernel = "linear")

# The step called "scaler" holds the whole ColumnTransformer (encoding + scaling).
pipe = Pipeline([
    ("scaler", step1),
    ("model", model)
])

pipe.fit(X_train, y_train)
pred_test = pipe.predict(X_test)
print("Training Accuracy :", pipe.score(X_train, y_train))
print("Testing Accuracy :", pipe.score(X_test, y_test))
# cross_val_score refits a clone of the full pipeline on each of the 5 folds.
print("CV Score :", cross_val_score(pipe, X_train, y_train, cv = 5).mean())
# NOTE(review): MAE is taken over the integer class codes, which are nominal
# labels; its size depends on the arbitrary code order, so only "0.0 means no
# test errors" is meaningful here.
print("MAE :", mean_absolute_error(y_test, pred_test))

Training Accuracy : 0.9927272727272727
Testing Accuracy : 1.0
CV Score : 0.9927272727272728
MAE : 0.0


In [16]:
# Rows are the true class codes, columns the predicted class codes.
confusion_matrix(y_true = y_test, y_pred = pred_test)

array([[34,  0,  0],
       [ 0, 10,  0],
       [ 0,  0, 25]], dtype=int64)

In [17]:
# Per-class precision / recall / F1 for the test-set predictions in `pred_test`.
report = classification_report(y_true = y_test, y_pred = pred_test)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        34
           1       1.00      1.00      1.00        10
           2       1.00      1.00      1.00        25

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69



# ***KNeighborsClassifier***

In [18]:
# Preprocessing by column position in X: 0 (island) and 5 (sex) are one-hot
# encoded with the first level dropped; 1-4 (the numeric measurements) are
# standardised. sparse_threshold = 0 forces a dense result from the
# ColumnTransformer, so the encoder no longer needs the `sparse` keyword
# (deprecated in scikit-learn 1.2, removed in 1.4).
step1 = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(drop = "first"), [0, 5]),
    ("scaler", StandardScaler(), [1, 2, 3, 4])
], remainder = "passthrough", sparse_threshold = 0)

# k-nearest-neighbours classifier voting over the 2 closest training rows.
model = KNeighborsClassifier(n_neighbors = 2)

# The step called "scaler" holds the whole ColumnTransformer (encoding + scaling).
pipe = Pipeline([
    ("scaler", step1),
    ("model", model)
])

pipe.fit(X_train, y_train)
pred_test = pipe.predict(X_test)
print("Training Accuracy :", pipe.score(X_train, y_train))
print("Testing Accuracy :", pipe.score(X_test, y_test))
# cross_val_score refits a clone of the full pipeline on each of the 5 folds.
print("CV Score :", cross_val_score(pipe, X_train, y_train, cv = 5).mean())
# NOTE(review): MAE is taken over the integer class codes, which are nominal
# labels; its size depends on the arbitrary code order, so it is not a
# meaningful error magnitude for this classifier.
print("MAE :", mean_absolute_error(y_test, pred_test))

Training Accuracy : 0.9963636363636363
Testing Accuracy : 0.9855072463768116
CV Score : 0.9963636363636365
MAE : 0.014492753623188406


In [19]:
# Rows are the true class codes, columns the predicted class codes.
confusion_matrix(y_true = y_test, y_pred = pred_test)

array([[33,  1,  0],
       [ 0, 10,  0],
       [ 0,  0, 25]], dtype=int64)

In [20]:
# Per-class precision / recall / F1 for the test-set predictions in `pred_test`.
report = classification_report(y_true = y_test, y_pred = pred_test)
print(report)

              precision    recall  f1-score   support

           0       1.00      0.97      0.99        34
           1       0.91      1.00      0.95        10
           2       1.00      1.00      1.00        25

    accuracy                           0.99        69
   macro avg       0.97      0.99      0.98        69
weighted avg       0.99      0.99      0.99        69



# ***DecisionTreeClassifier***

In [21]:
# Preprocessing by column position in X: 0 (island) and 5 (sex) are one-hot
# encoded with the first level dropped; 1-4 (the numeric measurements) are
# standardised. sparse_threshold = 0 forces a dense result from the
# ColumnTransformer, so the encoder no longer needs the `sparse` keyword
# (deprecated in scikit-learn 1.2, removed in 1.4).
step1 = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(drop = "first"), [0, 5]),
    ("scaler", StandardScaler(), [1, 2, 3, 4])
], remainder = "passthrough", sparse_threshold = 0)

# Entropy-based decision tree capped at depth 8; the seed fixes tie-breaking.
model = DecisionTreeClassifier(max_depth = 8, criterion = "entropy", random_state = 1)

# The step called "scaler" holds the whole ColumnTransformer (encoding + scaling).
pipe = Pipeline([
    ("scaler", step1),
    ("model", model)
])

pipe.fit(X_train, y_train)
pred_test = pipe.predict(X_test)
print("Training Accuracy :", pipe.score(X_train, y_train))
print("Testing Accuracy :", pipe.score(X_test, y_test))
# cross_val_score refits a clone of the full pipeline on each of the 5 folds.
print("CV Score :", cross_val_score(pipe, X_train, y_train, cv = 5).mean())
# NOTE(review): MAE is taken over the integer class codes, which are nominal
# labels; its size depends on the arbitrary code order, so it is not a
# meaningful error magnitude for this classifier.
print("MAE :", mean_absolute_error(y_test, pred_test))

Training Accuracy : 1.0
Testing Accuracy : 0.9855072463768116
CV Score : 0.9781818181818182
MAE : 0.014492753623188406


In [22]:
# Rows are the true class codes, columns the predicted class codes.
confusion_matrix(y_true = y_test, y_pred = pred_test)

array([[34,  0,  0],
       [ 0,  9,  1],
       [ 0,  0, 25]], dtype=int64)

In [23]:
# Per-class precision / recall / F1 for the test-set predictions in `pred_test`.
report = classification_report(y_true = y_test, y_pred = pred_test)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        34
           1       1.00      0.90      0.95        10
           2       0.96      1.00      0.98        25

    accuracy                           0.99        69
   macro avg       0.99      0.97      0.98        69
weighted avg       0.99      0.99      0.99        69



# ***RandomForestClassifier***

In [24]:
# Preprocessing by column position in X: 0 (island) and 5 (sex) are one-hot
# encoded with the first level dropped; 1-4 (the numeric measurements) are
# standardised. sparse_threshold = 0 forces a dense result from the
# ColumnTransformer, so the encoder no longer needs the `sparse` keyword
# (deprecated in scikit-learn 1.2, removed in 1.4).
step1 = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(drop = "first"), [0, 5]),
    ("scaler", StandardScaler(), [1, 2, 3, 4])
], remainder = "passthrough", sparse_threshold = 0)

# Small random forest of 4 trees; the seed makes the bootstrap samples reproducible.
model = RandomForestClassifier(n_estimators = 4, random_state = 0)

# The step called "scaler" holds the whole ColumnTransformer (encoding + scaling).
pipe = Pipeline([
    ("scaler", step1),
    ("model", model)
])

pipe.fit(X_train, y_train)
pred_test = pipe.predict(X_test)
print("Training Accuracy :", pipe.score(X_train, y_train))
print("Testing Accuracy :", pipe.score(X_test, y_test))
# cross_val_score refits a clone of the full pipeline on each of the 5 folds.
print("CV Score :", cross_val_score(pipe, X_train, y_train, cv = 5).mean())
# NOTE(review): MAE is taken over the integer class codes, which are nominal
# labels; its size depends on the arbitrary code order, so it is not a
# meaningful error magnitude for this classifier.
print("MAE :", mean_absolute_error(y_test, pred_test))

Training Accuracy : 0.9963636363636363
Testing Accuracy : 0.9855072463768116
CV Score : 0.9709090909090909
MAE : 0.014492753623188406


In [25]:
# Rows are the true class codes, columns the predicted class codes.
confusion_matrix(y_true = y_test, y_pred = pred_test)

array([[34,  0,  0],
       [ 1,  9,  0],
       [ 0,  0, 25]], dtype=int64)

In [26]:
# Per-class precision / recall / F1 for the test-set predictions in `pred_test`.
report = classification_report(y_true = y_test, y_pred = pred_test)
print(report)

              precision    recall  f1-score   support

           0       0.97      1.00      0.99        34
           1       1.00      0.90      0.95        10
           2       1.00      1.00      1.00        25

    accuracy                           0.99        69
   macro avg       0.99      0.97      0.98        69
weighted avg       0.99      0.99      0.99        69



# ***AdaBoostClassifier***

In [27]:
# Preprocessing by column position in X: 0 (island) and 5 (sex) are one-hot
# encoded with the first level dropped; 1-4 (the numeric measurements) are
# standardised. sparse_threshold = 0 forces a dense result from the
# ColumnTransformer, so the encoder no longer needs the `sparse` keyword
# (deprecated in scikit-learn 1.2, removed in 1.4).
step1 = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(drop = "first"), [0, 5]),
    ("scaler", StandardScaler(), [1, 2, 3, 4])
], remainder = "passthrough", sparse_threshold = 0)

# AdaBoost ensemble of 6 boosting rounds with learning rate 0.8.
model = AdaBoostClassifier(n_estimators = 6, random_state = 0, learning_rate = 0.8)

# The step called "scaler" holds the whole ColumnTransformer (encoding + scaling).
pipe = Pipeline([
    ("scaler", step1),
    ("model", model)
])

pipe.fit(X_train, y_train)
pred_test = pipe.predict(X_test)
print("Training Accuracy :", pipe.score(X_train, y_train))
print("Testing Accuracy :", pipe.score(X_test, y_test))
# cross_val_score refits a clone of the full pipeline on each of the 5 folds.
print("CV Score :", cross_val_score(pipe, X_train, y_train, cv = 5).mean())
# NOTE(review): MAE is taken over the integer class codes, which are nominal
# labels; its size depends on the arbitrary code order, so it is not a
# meaningful error magnitude for this classifier.
print("MAE :", mean_absolute_error(y_test, pred_test))

Training Accuracy : 0.9854545454545455
Testing Accuracy : 0.9855072463768116
CV Score : 0.9745454545454546
MAE : 0.014492753623188406


In [28]:
# Rows are the true class codes, columns the predicted class codes.
confusion_matrix(y_true = y_test, y_pred = pred_test)

array([[34,  0,  0],
       [ 1,  9,  0],
       [ 0,  0, 25]], dtype=int64)

In [29]:
# Per-class precision / recall / F1 for the test-set predictions in `pred_test`.
report = classification_report(y_true = y_test, y_pred = pred_test)
print(report)

              precision    recall  f1-score   support

           0       0.97      1.00      0.99        34
           1       1.00      0.90      0.95        10
           2       1.00      1.00      1.00        25

    accuracy                           0.99        69
   macro avg       0.99      0.97      0.98        69
weighted avg       0.99      0.99      0.99        69



# ***GradientBoostingClassifier***

In [30]:
# Preprocessing by column position in X: 0 (island) and 5 (sex) are one-hot
# encoded with the first level dropped; 1-4 (the numeric measurements) are
# standardised. sparse_threshold = 0 forces a dense result from the
# ColumnTransformer, so the encoder no longer needs the `sparse` keyword
# (deprecated in scikit-learn 1.2, removed in 1.4).
step1 = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(drop = "first"), [0, 5]),
    ("scaler", StandardScaler(), [1, 2, 3, 4])
], remainder = "passthrough", sparse_threshold = 0)

# Gradient boosting with 7 stages and learning rate 0.9.
model = GradientBoostingClassifier(n_estimators = 7, random_state = 0, learning_rate = 0.9)

# The step called "scaler" holds the whole ColumnTransformer (encoding + scaling).
pipe = Pipeline([
    ("scaler", step1),
    ("model", model)
])

pipe.fit(X_train, y_train)
pred_test = pipe.predict(X_test)
print("Training Accuracy :", pipe.score(X_train, y_train))
print("Testing Accuracy :", pipe.score(X_test, y_test))
# cross_val_score refits a clone of the full pipeline on each of the 5 folds.
print("CV Score :", cross_val_score(pipe, X_train, y_train, cv = 5).mean())
# NOTE(review): MAE is taken over the integer class codes, which are nominal
# labels; its size depends on the arbitrary code order, so only "0.0 means no
# test errors" is meaningful here.
print("MAE :", mean_absolute_error(y_test, pred_test))

Training Accuracy : 1.0
Testing Accuracy : 1.0
CV Score : 0.9818181818181818
MAE : 0.0


In [31]:
# Rows are the true class codes, columns the predicted class codes.
confusion_matrix(y_true = y_test, y_pred = pred_test)

array([[34,  0,  0],
       [ 0, 10,  0],
       [ 0,  0, 25]], dtype=int64)

In [32]:
# Per-class precision / recall / F1 for the test-set predictions in `pred_test`.
report = classification_report(y_true = y_test, y_pred = pred_test)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        34
           1       1.00      1.00      1.00        10
           2       1.00      1.00      1.00        25

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69



# ***XGBClassifier***

In [34]:
# Preprocessing by column position in X: 0 (island) and 5 (sex) are one-hot
# encoded with the first level dropped; 1-4 (the numeric measurements) are
# standardised. sparse_threshold = 0 forces a dense result from the
# ColumnTransformer, so the encoder no longer needs the `sparse` keyword
# (deprecated in scikit-learn 1.2, removed in 1.4).
step1 = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(drop = "first"), [0, 5]),
    ("scaler", StandardScaler(), [1, 2, 3, 4])
], remainder = "passthrough", sparse_threshold = 0)

# XGBoost classifier with 8 boosting rounds, learning rate 0.1 and gamma 0.
model = XGBClassifier(n_estimators = 8, random_state = 0, learning_rate = 0.1, gamma = 0)

# The step called "scaler" holds the whole ColumnTransformer (encoding + scaling).
pipe = Pipeline([
    ("scaler", step1),
    ("model", model)
])

pipe.fit(X_train, y_train)
pred_test = pipe.predict(X_test)
print("Training Accuracy :", pipe.score(X_train, y_train))
print("Testing Accuracy :", pipe.score(X_test, y_test))
# cross_val_score refits a clone of the full pipeline on each of the 5 folds.
print("CV Score :", cross_val_score(pipe, X_train, y_train, cv = 5).mean())
# NOTE(review): MAE is taken over the integer class codes, which are nominal
# labels; its size depends on the arbitrary code order, so only "0.0 means no
# test errors" is meaningful here.
print("MAE :", mean_absolute_error(y_test, pred_test))

Training Accuracy : 0.9963636363636363
Testing Accuracy : 1.0
CV Score : 0.9818181818181818
MAE : 0.0


In [35]:
# Rows are the true class codes, columns the predicted class codes.
confusion_matrix(y_true = y_test, y_pred = pred_test)

array([[34,  0,  0],
       [ 0, 10,  0],
       [ 0,  0, 25]], dtype=int64)

In [36]:
# Per-class precision / recall / F1 for the test-set predictions in `pred_test`.
report = classification_report(y_true = y_test, y_pred = pred_test)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        34
           1       1.00      1.00      1.00        10
           2       1.00      1.00      1.00        25

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69



# ***FINAL MODEL***

In [37]:
# Final model: the same gradient-boosting pipeline, refitted so that `pipe`
# holds it for the prediction and pickling cells that follow.
# Preprocessing by column position in X: 0 (island) and 5 (sex) are one-hot
# encoded with the first level dropped; 1-4 (the numeric measurements) are
# standardised. sparse_threshold = 0 forces a dense result from the
# ColumnTransformer, so the encoder no longer needs the `sparse` keyword
# (deprecated in scikit-learn 1.2, removed in 1.4).
step1 = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(drop = "first"), [0, 5]),
    ("scaler", StandardScaler(), [1, 2, 3, 4])
], remainder = "passthrough", sparse_threshold = 0)

# Gradient boosting with 7 stages and learning rate 0.9.
model = GradientBoostingClassifier(n_estimators = 7, random_state = 0, learning_rate = 0.9)

# The step called "scaler" holds the whole ColumnTransformer (encoding + scaling).
pipe = Pipeline([
    ("scaler", step1),
    ("model", model)
])

pipe.fit(X_train, y_train)
pred_test = pipe.predict(X_test)
print("Training Accuracy :", pipe.score(X_train, y_train))
print("Testing Accuracy :", pipe.score(X_test, y_test))
# cross_val_score refits a clone of the full pipeline on each of the 5 folds.
print("CV Score :", cross_val_score(pipe, X_train, y_train, cv = 5).mean())
# NOTE(review): MAE is taken over the integer class codes, which are nominal
# labels; its size depends on the arbitrary code order, so only "0.0 means no
# test errors" is meaningful here.
print("MAE :", mean_absolute_error(y_test, pred_test))

Training Accuracy : 1.0
Testing Accuracy : 1.0
CV Score : 0.9818181818181818
MAE : 0.0


In [45]:
# Species names in LabelEncoder code order: position i is the species that the
# models predict as integer i. This is the mapping that produced `y`, whereas
# df["species"].unique() only lists names in order of first appearance.
label.classes_

array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object)

In [46]:
# One hand-written observation, with the same columns in the same order as X
# (the pipeline selects columns by position).
final = pd.DataFrame({
    'island': "Torgersen" ,
    'culmen_length_mm': 39.1, 
    'culmen_depth_mm': 18.7, 
    'flipper_length_mm': 181.0,
    'body_mass_g': 3750.0, 
    'sex': "MALE"
    }, index = [0])

# The pipeline predicts an integer class code; decode it with the fitted
# LabelEncoder instead of a hand-maintained if/elif table, so the name always
# follows the encoder's mapping and an unexpected code raises instead of being
# reported as the last species.
species = pipe.predict(final)[0]
species_name = label.inverse_transform([species])[0]
print(f"This penguins belongs to {species_name} species")

This penguins belongs to Adelie species


In [47]:
import pickle

# Serialise the fitted pipeline (preprocessing + model) to a pickle file.
with open("penguin_gradient_model.pkl", mode = "wb") as model_file:
    pickle.dump(pipe, model_file)

In [48]:
# Reload the pipeline from the pickle file written by the previous cell.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook
# (or another trusted source) produced.
with open("penguin_gradient_model.pkl", "rb") as file:
    loaded = pickle.load(file)

# Round-trip check: the restored pipeline must reproduce the in-memory
# pipeline's predictions on the held-out rows.
assert np.array_equal(loaded.predict(X_test), pipe.predict(X_test)), \
    "reloaded pipeline does not match the fitted pipeline"