In [1]:
import numpy as np
import pandas as pd

import warnings

# Silence every warning category for the rest of the notebook.
# NOTE(review): this also hides scikit-learn fit-failure / convergence / deprecation
# warnings, so invalid grid-search candidates fail without any visible message.
warnings.simplefilter(action = "ignore")

In [24]:
# Read the penguin dataset from the local CSV and preview the first five rows.
# NOTE(review): the ColumnTransformer cells below pick features by position
# ([0, 5] one-hot encoded, [1, 2, 3, 4] scaled), so they assume the frame holds
# exactly `species` plus the six feature columns in this order — confirm the CSV
# carries no extra saved-index column.
df = pd.read_csv(filepath_or_buffer = "penguins_analyzed.csv")
df.head(n = 5)

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,43.92193,17.15117,200.915205,4201.754386,MALE
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [10]:
# `species` is the prediction target; every other column is kept as a feature.
y = df["species"]
X = df.drop(columns = ["species"])

# Train Test Split

In [15]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_absolute_error

In [16]:
# Encode the species names as integer class ids.
# The encoder is fitted on the original `species` column rather than on `y` itself:
# `y = f(y)` made this cell self-referential, so running it a second time would refit
# the encoder on the integer codes and `label.classes_` would no longer hold the
# species names needed to decode predictions.
label = LabelEncoder()
y = label.fit_transform(df["species"])

***Finding Best Random State***

In [18]:
# Measure how much a logistic-regression baseline depends on the train/test split.
# For each of 100 split seeds we record: train accuracy, test accuracy, mean 5-fold
# CV accuracy on the training part, and the MAE between true and predicted class ids.
# The row index of `ff` is the seed that produced that row.
#
# NOTE(review): choosing the split seed by its test score makes the reported test
# accuracy optimistic; treat the chosen seed as a convenience, not as evidence.
train = []
test = []
cv = []
mae = []

# The preprocessing + model definition does not depend on the seed, so it is built
# once. `fit` retrains from scratch on every call and `cross_val_score` works on
# clones, so the per-seed results are the same as when rebuilding it each iteration.
# Dense output is forced with `sparse_threshold = 0` instead of
# `OneHotEncoder(sparse = False)`: the `sparse` argument was removed in
# scikit-learn 1.4, while `sparse_threshold` is accepted by old and new releases.
step = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(drop = "first"), [0, 5]),
    ("scaler", StandardScaler(), [1, 2, 3, 4])
], remainder = "passthrough", sparse_threshold = 0)

pipe = Pipeline([
    ("step", step),
    ("model", LogisticRegression())
])

for i in range(100):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = i)

    pipe.fit(X_train, y_train)
    pred_test = pipe.predict(X_test)

    train_accuracy = pipe.score(X_train, y_train)
    # Same value as pipe.score(X_test, y_test), without predicting the test set twice.
    test_accuracy = accuracy_score(y_test, pred_test)

    # NOTE(review): the class ids are nominal, so the MAE magnitude is not meaningful
    # beyond "0 == no misclassification". Kept because the next cell sorts by it.
    mae_score = mean_absolute_error(y_test, pred_test)
    cv_score = cross_val_score(pipe, X_train, y_train, cv = 5).mean()

    train.append(train_accuracy)
    test.append(test_accuracy)
    cv.append(cv_score)
    mae.append(mae_score)

ff = pd.DataFrame({
    "Train Model": train,
    "Test Model": test,
    "CV": cv,
    "MAE": mae
})

In [21]:
# Rank the split seeds (row index) from lowest to highest test-set MAE.
ff.sort_values("MAE")

Unnamed: 0,Train Model,Test Model,CV,MAE
0,0.996364,1.000000,0.992727,0.000000
33,0.996364,1.000000,0.985455,0.000000
34,0.996364,1.000000,0.989091,0.000000
35,0.992727,1.000000,0.985455,0.000000
75,0.992727,1.000000,0.992727,0.000000
...,...,...,...,...
30,0.996364,0.971014,0.996364,0.043478
87,0.996364,0.971014,0.996364,0.043478
4,0.996364,0.971014,0.996364,0.043478
14,1.000000,0.971014,0.996364,0.043478


In [25]:
# Fixed 80/20 split shared by every grid search below (seed 0).
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state = 0, test_size = 0.2)

# ***GridSearchCV - SVC***

In [26]:
# Grid-search an SVC over the regularisation strength C and the kernel type,
# using 5-fold CV accuracy on the training split.
#
# Preprocessing: one-hot encode columns 0 and 5, standardise columns 1-4.
# Dense output is forced with `sparse_threshold = 0` instead of
# `OneHotEncoder(sparse = False)`: the `sparse` argument was removed in
# scikit-learn 1.4, while `sparse_threshold` is accepted by old and new releases.
step = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(drop = "first"), [0, 5]),
    ("scaler", StandardScaler(), [1, 2, 3, 4])
], remainder = "passthrough", sparse_threshold = 0)

pipe = Pipeline([
    ("step", step),
    ("model", SVC())
])

# C must be strictly positive. The original grid contained C = 0, so every candidate
# using it failed to fit and was scored NaN (silently, because warnings are ignored).
# It is replaced by 1, presumably the intended value in this 0.001 ... 100 sweep.
svc_param_grid = {
    "model__C": [0.001, 0.1, 1, 10, 100],
    "model__kernel": ["linear", "poly", "sigmoid", "rbf"]
}

svc_grid = GridSearchCV(pipe, svc_param_grid, cv = 5, scoring = "accuracy")

svc_grid.fit(X_train, y_train)

svc_grid.best_params_

{'model__C': 0.1, 'model__kernel': 'linear'}

# ***KNeighborsClassifier***

In [27]:
# Grid-search a k-nearest-neighbours classifier over k = 1..19,
# using 5-fold CV accuracy on the training split.
#
# Preprocessing: one-hot encode columns 0 and 5, standardise columns 1-4.
# Dense output is forced with `sparse_threshold = 0` instead of
# `OneHotEncoder(sparse = False)`: the `sparse` argument was removed in
# scikit-learn 1.4, while `sparse_threshold` is accepted by old and new releases.
step = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(drop = "first"), [0, 5]),
    ("scaler", StandardScaler(), [1, 2, 3, 4])
], remainder = "passthrough", sparse_threshold = 0)

pipe = Pipeline([
    ("step", step),
    ("model", KNeighborsClassifier())
])

knn_param_grid = {"model__n_neighbors": list(range(1, 20))}

knn_grid = GridSearchCV(pipe, knn_param_grid, cv = 5, scoring = "accuracy")

knn_grid.fit(X_train, y_train)

knn_grid.best_params_

{'model__n_neighbors': 2}

# ***DecisionTreeClassifier***

In [29]:
# Grid-search a decision tree over max_depth = 1..39 and the split criterion,
# using 5-fold CV accuracy on the training split.
#
# Preprocessing: one-hot encode columns 0 and 5, standardise columns 1-4.
# Dense output is forced with `sparse_threshold = 0` instead of
# `OneHotEncoder(sparse = False)`: the `sparse` argument was removed in
# scikit-learn 1.4, while `sparse_threshold` is accepted by old and new releases.
step = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(drop = "first"), [0, 5]),
    ("scaler", StandardScaler(), [1, 2, 3, 4])
], remainder = "passthrough", sparse_threshold = 0)

# The tree is seeded so that tie-breaking between equally good splits — and with it
# the reported best_params_ — is the same on every Restart & Run All.
pipe = Pipeline([
    ("step", step),
    ("model", DecisionTreeClassifier(random_state = 0))
])

decision_param_grid = {"model__max_depth": list(range(1, 40)), "model__criterion": ["gini", "entropy"]}

decision_grid = GridSearchCV(pipe, decision_param_grid, cv = 5, scoring = "accuracy")

decision_grid.fit(X_train, y_train)

decision_grid.best_params_

{'model__criterion': 'entropy', 'model__max_depth': 8}

# ***RandomForestClassifier***

In [30]:
# Grid-search a random forest over n_estimators = 1..19,
# using 5-fold CV accuracy on the training split.
#
# Preprocessing: one-hot encode columns 0 and 5, standardise columns 1-4.
# Dense output is forced with `sparse_threshold = 0` instead of
# `OneHotEncoder(sparse = False)`: the `sparse` argument was removed in
# scikit-learn 1.4, while `sparse_threshold` is accepted by old and new releases.
step = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(drop = "first"), [0, 5]),
    ("scaler", StandardScaler(), [1, 2, 3, 4])
], remainder = "passthrough", sparse_threshold = 0)

# The forest is seeded: bootstrap sampling and feature sub-sampling are random, so
# without a seed the selected n_estimators changes from run to run.
pipe = Pipeline([
    ("step", step),
    ("model", RandomForestClassifier(random_state = 0))
])

random_param_grid = {"model__n_estimators": list(range(1, 20))}

random_grid = GridSearchCV(pipe, random_param_grid, cv = 5, scoring = "accuracy")

random_grid.fit(X_train, y_train)

random_grid.best_params_

{'model__n_estimators': 4}

# ***AdaBoostClassifier***

In [31]:
# Grid-search AdaBoost over n_estimators = 1..19,
# using 5-fold CV accuracy on the training split.
#
# Preprocessing: one-hot encode columns 0 and 5, standardise columns 1-4.
# Dense output is forced with `sparse_threshold = 0` instead of
# `OneHotEncoder(sparse = False)`: the `sparse` argument was removed in
# scikit-learn 1.4, while `sparse_threshold` is accepted by old and new releases.
step = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(drop = "first"), [0, 5]),
    ("scaler", StandardScaler(), [1, 2, 3, 4])
], remainder = "passthrough", sparse_threshold = 0)

# Seeded so the selected n_estimators is reproducible across runs.
pipe = Pipeline([
    ("step", step),
    ("model", AdaBoostClassifier(random_state = 0))
])

ada_param_grid = {"model__n_estimators": list(range(1, 20))}

ada_grid = GridSearchCV(pipe, ada_param_grid, cv = 5, scoring = "accuracy")

ada_grid.fit(X_train, y_train)

ada_grid.best_params_

{'model__n_estimators': 6}

# ***GradientBoostingClassifier***

In [32]:
# Grid-search gradient boosting over n_estimators = 1..9 and the learning rate,
# using 5-fold CV accuracy on the training split.
#
# Preprocessing: one-hot encode columns 0 and 5, standardise columns 1-4.
# Dense output is forced with `sparse_threshold = 0` instead of
# `OneHotEncoder(sparse = False)`: the `sparse` argument was removed in
# scikit-learn 1.4, while `sparse_threshold` is accepted by old and new releases.
step = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(drop = "first"), [0, 5]),
    ("scaler", StandardScaler(), [1, 2, 3, 4])
], remainder = "passthrough", sparse_threshold = 0)

# Seeded so the selected hyper-parameters are reproducible across runs.
pipe = Pipeline([
    ("step", step),
    ("model", GradientBoostingClassifier(random_state = 0))
])

# learning_rate = 0 was dropped from the grid: with a zero rate no boosting stage can
# change the prediction (and some scikit-learn versions reject the value outright),
# so those candidates only cost time.
gradient_param_grid = {
    "model__n_estimators": list(range(1, 10)),
    "model__learning_rate": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
}

gradient_grid = GridSearchCV(pipe, gradient_param_grid, cv = 5, scoring = "accuracy")

gradient_grid.fit(X_train, y_train)

gradient_grid.best_params_

{'model__learning_rate': 0.9, 'model__n_estimators': 7}

# ***XGBClassifier***

In [33]:
# Grid-search XGBoost over n_estimators = 1..9, the learning rate and gamma,
# using 5-fold CV accuracy on the training split.
#
# Preprocessing: one-hot encode columns 0 and 5, standardise columns 1-4.
# Dense output is forced with `sparse_threshold = 0` instead of
# `OneHotEncoder(sparse = False)`: the `sparse` argument was removed in
# scikit-learn 1.4, while `sparse_threshold` is accepted by old and new releases.
step = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(drop = "first"), [0, 5]),
    ("scaler", StandardScaler(), [1, 2, 3, 4])
], remainder = "passthrough", sparse_threshold = 0)

pipe = Pipeline([
    ("step", step),
    ("model", XGBClassifier())
])

# learning_rate = 0 was dropped from the grid: a zero step size scales every tree's
# contribution to nothing, so those 54 candidates (x 5 folds) could never win and
# only slowed the search down.
xgb_param_grid = {
    "model__n_estimators": list(range(1, 10)),
    "model__learning_rate": [0.1, 0.2, 0.3, 0.4, 0.5],
    "model__gamma": [0, 0.1, 0.2, 0.3, 0.4, 0.5]
}

xgb_grid = GridSearchCV(pipe, xgb_param_grid, cv = 5, scoring = "accuracy")

xgb_grid.fit(X_train, y_train)

xgb_grid.best_params_

{'model__gamma': 0, 'model__learning_rate': 0.1, 'model__n_estimators': 8}