In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_excel('../../datasets/Pumpkin_Seeds_Dataset.xlsx')

In [3]:
df.columns

Index(['Area', 'Perimeter', 'Major_Axis_Length', 'Minor_Axis_Length',
       'Convex_Area', 'Equiv_Diameter', 'Eccentricity', 'Solidity', 'Extent',
       'Roundness', 'Aspect_Ration', 'Compactness', 'Class'],
      dtype='object')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Area               2500 non-null   int64  
 1   Perimeter          2500 non-null   float64
 2   Major_Axis_Length  2500 non-null   float64
 3   Minor_Axis_Length  2500 non-null   float64
 4   Convex_Area        2500 non-null   int64  
 5   Equiv_Diameter     2500 non-null   float64
 6   Eccentricity       2500 non-null   float64
 7   Solidity           2500 non-null   float64
 8   Extent             2500 non-null   float64
 9   Roundness          2500 non-null   float64
 10  Aspect_Ration      2500 non-null   float64
 11  Compactness        2500 non-null   float64
 12  Class              2500 non-null   object 
dtypes: float64(10), int64(2), object(1)
memory usage: 254.0+ KB


In [17]:
df['Class'].unique()

array([0, 1])

In [18]:
df['Class'].value_counts(normalize=True)

Class
0    0.52
1    0.48
Name: proportion, dtype: float64

In [7]:
df.head()

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Convex_Area,Equiv_Diameter,Eccentricity,Solidity,Extent,Roundness,Aspect_Ration,Compactness,Class
0,56276,888.242,326.1485,220.2388,56831,267.6805,0.7376,0.9902,0.7453,0.8963,1.4809,0.8207,Çerçevelik
1,76631,1068.146,417.1932,234.2289,77280,312.3614,0.8275,0.9916,0.7151,0.844,1.7811,0.7487,Çerçevelik
2,71623,1082.987,435.8328,211.0457,72663,301.9822,0.8749,0.9857,0.74,0.7674,2.0651,0.6929,Çerçevelik
3,66458,992.051,381.5638,222.5322,67118,290.8899,0.8123,0.9902,0.7396,0.8486,1.7146,0.7624,Çerçevelik
4,66107,998.146,383.8883,220.4545,67117,290.1207,0.8187,0.985,0.6752,0.8338,1.7413,0.7557,Çerçevelik


In [16]:
# Não há necessidade do LabelEncoder, apenas no target

from sklearn.calibration import LabelEncoder


labelencoder = LabelEncoder()
df['Class'] = labelencoder.fit_transform(df['Class'])

df.head()

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Convex_Area,Equiv_Diameter,Eccentricity,Solidity,Extent,Roundness,Aspect_Ration,Compactness,Class
0,56276,888.242,326.1485,220.2388,56831,267.6805,0.7376,0.9902,0.7453,0.8963,1.4809,0.8207,0
1,76631,1068.146,417.1932,234.2289,77280,312.3614,0.8275,0.9916,0.7151,0.844,1.7811,0.7487,0
2,71623,1082.987,435.8328,211.0457,72663,301.9822,0.8749,0.9857,0.74,0.7674,2.0651,0.6929,0
3,66458,992.051,381.5638,222.5322,67118,290.8899,0.8123,0.9902,0.7396,0.8486,1.7146,0.7624,0
4,66107,998.146,383.8883,220.4545,67117,290.1207,0.8187,0.985,0.6752,0.8338,1.7413,0.7557,0


In [8]:
df.describe().T.to_latex(
    buf='./pumpkins_describe.tex',
    decimal='.',
    float_format="%.2f"
)

In [9]:
# Scaler
scaler = RobustScaler()

features = pd.DataFrame(scaler.fit_transform(df.drop('Class', axis=1)), columns=df.columns[:-1])
target = df['Class']

In [10]:
len(target)

2500

In [19]:
# Metrics: Accuracy, Precision, Recall, F1
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score


def get_metrics():
    metrics = {
        'accuracy': accuracy_score,
        'precision': precision_score,
        'recall': recall_score,
        'f1': f1_score,
        'roc': roc_auc_score
    }
    return metrics


def get_metrics_df(y_true, y_pred, name):
    metrics = get_metrics()
    df = pd.DataFrame()
    for metric_name, metric in metrics.items():
        df[metric_name] = [metric(y_true, y_pred)]
    return df

In [20]:
SEEDS = [24, 42, 206, 602, 412, 214, 754, 457, 2023, 3202]

In [25]:
run_counter = 0
results = dict()

# Splitting dataset
X = df.drop(columns='Class')
y = df['Class']

for seed in SEEDS:
    run_counter += 1
    print(f'%{run_counter} - Running for seed: {seed}')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=seed)

    models_with_best_params = {
        'GaussianNB': GaussianNB(var_smoothing=0.01519911082952933),
        'DT': DecisionTreeClassifier(criterion='gini', max_depth=5, min_samples_leaf=1, min_samples_split=3),
        'KNN': KNeighborsClassifier(metric='manhattan', n_neighbors=9, weights='distance'),
        'RF': RandomForestClassifier(criterion='gini', max_depth=11, min_samples_leaf=1, min_samples_split=2, n_estimators=100),
    }

    for name, model in models_with_best_params.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        results[f"{model.__class__.__name__}_{seed}"] = get_metrics_df(y_test, y_pred, name)

%1 - Running for seed: 24
%2 - Running for seed: 42
%3 - Running for seed: 206
%4 - Running for seed: 602
%5 - Running for seed: 412
%6 - Running for seed: 214
%7 - Running for seed: 754
%8 - Running for seed: 457
%9 - Running for seed: 2023
%10 - Running for seed: 3202


In [26]:
results.keys()

dict_keys(['GaussianNB_24', 'DecisionTreeClassifier_24', 'KNeighborsClassifier_24', 'RandomForestClassifier_24', 'GaussianNB_42', 'DecisionTreeClassifier_42', 'KNeighborsClassifier_42', 'RandomForestClassifier_42', 'GaussianNB_206', 'DecisionTreeClassifier_206', 'KNeighborsClassifier_206', 'RandomForestClassifier_206', 'GaussianNB_602', 'DecisionTreeClassifier_602', 'KNeighborsClassifier_602', 'RandomForestClassifier_602', 'GaussianNB_412', 'DecisionTreeClassifier_412', 'KNeighborsClassifier_412', 'RandomForestClassifier_412', 'GaussianNB_214', 'DecisionTreeClassifier_214', 'KNeighborsClassifier_214', 'RandomForestClassifier_214', 'GaussianNB_754', 'DecisionTreeClassifier_754', 'KNeighborsClassifier_754', 'RandomForestClassifier_754', 'GaussianNB_457', 'DecisionTreeClassifier_457', 'KNeighborsClassifier_457', 'RandomForestClassifier_457', 'GaussianNB_2023', 'DecisionTreeClassifier_2023', 'KNeighborsClassifier_2023', 'RandomForestClassifier_2023', 'GaussianNB_3202', 'DecisionTreeClassif

In [27]:
gaussian_keys = [key for key in results.keys() if 'Gaussian' in key]
dt_keys = [key for key in results.keys() if 'DecisionTreeClassifier' in key]
knn_keys = [key for key in results.keys() if 'KNeighborsClassifier' in key]
rf_keys = [key for key in results.keys() if 'RandomForestClassifier' in key]

In [28]:
print(len(gaussian_keys))
assert len(gaussian_keys) == len(dt_keys) == len(knn_keys) == len(rf_keys)

10


In [29]:
# Gaussian accuracy mean
from tabulate import tabulate


# Gaussian accuracy mean
gaussian_accuracy_mean = np.mean([results[key]['accuracy'].values[0] for key in gaussian_keys])

# Gaussian accuracy std
gaussian_accuracy_std = np.std([results[key]['accuracy'].values[0] for key in gaussian_keys])

# Gaussian precision mean
gaussian_precision_mean = np.mean([results[key]['precision'].values[0] for key in gaussian_keys])

# Gaussian precision std
gaussian_precision_std = np.std([results[key]['precision'].values[0] for key in gaussian_keys])

# Gaussian recall mean
gaussian_recall_mean = np.mean([results[key]['recall'].values[0] for key in gaussian_keys])

# Gaussian recall std
gaussian_recall_std = np.std([results[key]['recall'].values[0] for key in gaussian_keys])

# Gaussian f1 mean
gaussian_f1_mean = np.mean([results[key]['f1'].values[0] for key in gaussian_keys])

# Gaussian f1 std
gaussian_f1_std = np.std([results[key]['f1'].values[0] for key in gaussian_keys])

# Gaussian roc mean
gaussian_roc_mean = np.mean([results[key]['roc'].values[0] for key in gaussian_keys])

# Gaussian roc std
gaussian_roc_std = np.std([results[key]['roc'].values[0] for key in gaussian_keys])


# Print tabulate
print(tabulate(
    [
        ['GaussianNB', 
         gaussian_accuracy_mean, gaussian_accuracy_std, 
         gaussian_precision_mean, gaussian_precision_std, 
         gaussian_recall_mean, gaussian_recall_std, 
         gaussian_f1_mean, gaussian_f1_std, 
         gaussian_roc_mean, gaussian_roc_std],
    ],
    headers=['Model', 'Accuracy Mean', 'Accuracy Std', 
             'Precision Mean', 'Precision Std', 
             'Recall Mean', 'Recall Std', 
             'F1 Mean', 'F1 Std', 
             'ROC Mean', 'ROC Std'],
    tablefmt='psql',
    floatfmt=(".4"),
))


# DT accuracy mean
dt_accuracy_mean = np.mean([results[key]['accuracy'].values[0] for key in dt_keys])

# DT accuracy std
dt_accuracy_std = np.std([results[key]['accuracy'].values[0] for key in dt_keys])

# DT precision mean
dt_precision_mean = np.mean([results[key]['precision'].values[0] for key in dt_keys])

# DT precision std
dt_precision_std = np.std([results[key]['precision'].values[0] for key in dt_keys])

# DT recall mean
dt_recall_mean = np.mean([results[key]['recall'].values[0] for key in dt_keys])

# DT recall std
dt_recall_std = np.std([results[key]['recall'].values[0] for key in dt_keys])

# DT f1 mean
dt_f1_mean = np.mean([results[key]['f1'].values[0] for key in dt_keys])

# DT f1 std
dt_f1_std = np.std([results[key]['f1'].values[0] for key in dt_keys])

# DT roc mean
dt_roc_mean = np.mean([results[key]['roc'].values[0] for key in dt_keys])

# DT roc std
dt_roc_std = np.std([results[key]['roc'].values[0] for key in dt_keys])


# Print tabulate
print(tabulate(
    [
        ['DecisionTreeClassifier', dt_accuracy_mean, dt_accuracy_std, 
         dt_precision_mean, dt_precision_std, 
         dt_recall_mean, dt_recall_std, 
         dt_f1_mean, dt_f1_std, 
         dt_roc_mean, dt_roc_std],
    ],
    headers=['Model', 'Accuracy Mean', 'Accuracy Std', 
             'Precision Mean', 'Precision Std', 
             'Recall Mean', 'Recall Std', 
             'F1 Mean', 'F1 Std', 
             'ROC Mean', 'ROC Std'],
    tablefmt='psql',
    floatfmt=(".4"),
))


# KNN accuracy mean
knn_accuracy_mean = np.mean([results[key]['accuracy'].values[0] for key in knn_keys])

# KNN accuracy std
knn_accuracy_std = np.std([results[key]['accuracy'].values[0] for key in knn_keys])

# KNN precision mean
knn_precision_mean = np.mean([results[key]['precision'].values[0] for key in knn_keys])

# KNN precision std
knn_precision_std = np.std([results[key]['precision'].values[0] for key in knn_keys])

# KNN recall mean
knn_recall_mean = np.mean([results[key]['recall'].values[0] for key in knn_keys])

# KNN recall std
knn_recall_std = np.std([results[key]['recall'].values[0] for key in knn_keys])

# KNN f1 mean
knn_f1_mean = np.mean([results[key]['f1'].values[0] for key in knn_keys])

# KNN f1 std
knn_f1_std = np.std([results[key]['f1'].values[0] for key in knn_keys])

# KNN roc mean
knn_roc_mean = np.mean([results[key]['roc'].values[0] for key in knn_keys])

# KNN roc std
knn_roc_std = np.std([results[key]['roc'].values[0] for key in knn_keys])


# Print tabulate
print(tabulate(
    [
        ['KNeighborsClassifier', knn_accuracy_mean, knn_accuracy_std, 
         knn_precision_mean, knn_precision_std, 
         knn_recall_mean, knn_recall_std, 
         knn_f1_mean, knn_f1_std, 
         knn_roc_mean, knn_roc_std],
    ],
    headers=['Model', 'Accuracy Mean', 'Accuracy Std', 
             'Precision Mean', 'Precision Std', 
             'Recall Mean', 'Recall Std', 
             'F1 Mean', 'F1 Std', 
             'ROC Mean', 'ROC Std'],
    tablefmt='psql',
    floatfmt=(".4"),
))


# RF accuracy mean
rf_accuracy_mean = np.mean([results[key]['accuracy'].values[0] for key in rf_keys])

# RF accuracy std
rf_accuracy_std = np.std([results[key]['accuracy'].values[0] for key in rf_keys])

# RF precision mean
rf_precision_mean = np.mean([results[key]['precision'].values[0] for key in rf_keys])

# RF precision std
rf_precision_std = np.std([results[key]['precision'].values[0] for key in rf_keys])

# RF recall mean
rf_recall_mean = np.mean([results[key]['recall'].values[0] for key in rf_keys])

# RF recall std
rf_recall_std = np.std([results[key]['recall'].values[0] for key in rf_keys])

# RF f1 mean
rf_f1_mean = np.mean([results[key]['f1'].values[0] for key in rf_keys])

# RF f1 std
rf_f1_std = np.std([results[key]['f1'].values[0] for key in rf_keys])

# RF roc mean
rf_roc_mean = np.mean([results[key]['roc'].values[0] for key in rf_keys])

# RF roc std
rf_roc_std = np.std([results[key]['roc'].values[0] for key in rf_keys])


# Print tabulate
print(tabulate(
    [
        ['RandomForestClassifier', rf_accuracy_mean, rf_accuracy_std, 
         rf_precision_mean, rf_precision_std, 
         rf_recall_mean, rf_recall_std, 
         rf_f1_mean, rf_f1_std, 
         rf_roc_mean, rf_roc_std],
    ],
    headers=['Model', 'Accuracy Mean', 'Accuracy Std', 
             'Precision Mean', 'Precision Std', 
             'Recall Mean', 'Recall Std', 
             'F1 Mean', 'F1 Std', 
             'ROC Mean', 'ROC Std'],
    tablefmt='psql',
    floatfmt=(".4"),
))

+------------+-----------------+----------------+------------------+-----------------+---------------+--------------+-----------+----------+------------+-----------+
| Model      |   Accuracy Mean |   Accuracy Std |   Precision Mean |   Precision Std |   Recall Mean |   Recall Std |   F1 Mean |   F1 Std |   ROC Mean |   ROC Std |
|------------+-----------------+----------------+------------------+-----------------+---------------+--------------+-----------+----------+------------+-----------|
| GaussianNB |          0.6052 |        0.01968 |           0.6481 |         0.04702 |        0.3991 |      0.01794 |    0.4933 |  0.02145 |     0.5983 |   0.01911 |
+------------+-----------------+----------------+------------------+-----------------+---------------+--------------+-----------+----------+------------+-----------+
+------------------------+-----------------+----------------+------------------+-----------------+---------------+--------------+-----------+----------+------------+-----

In [11]:
# # Hiperparameters with GridSearchCV for Models: GaussianNB, Decision Tree, KNN and Random Forest
# from sklearn.model_selection import GridSearchCV


# # GaussianNB
# params_gnb = {'var_smoothing': np.logspace(0, -9, num=100)}
# grid_gnb = GridSearchCV(estimator=GaussianNB(), param_grid=params_gnb, cv=5, verbose=1, n_jobs=-1)
# grid_gnb.fit(X_train, y_train)
# print(f"Model: GaussianNB | Best params: {grid_gnb.best_params_}\n")

# # Decision Tree
# params_dt = {'criterion': ['gini', 'entropy'],
#                 'max_depth': [3, 5, 7, 9, 11, 13, 15],
#                 'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9],
#                 'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9]}
# grid_dt = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=params_dt, cv=5, verbose=1, n_jobs=-1)
# grid_dt.fit(X_train, y_train)
# print(f"Model: DT | Best params: {grid_dt.best_params_}\n")

# # KNN
# params_knn = {'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
#                 'weights': ['uniform', 'distance'],
#                 'metric': ['euclidean', 'manhattan', 'minkowski']}
# grid_knn = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=params_knn, cv=5, verbose=1, n_jobs=-1)
# grid_knn.fit(X_train, y_train)
# print(f"Model: KNN | Best params: {grid_knn.best_params_}\n")

# # Random Forest
# params_rf = {'n_estimators': [100, 200, 300, 400, 500],
#              'criterion': ['gini', 'entropy'],
#                 'max_depth': [3, 5, 7, 9, 11, 13, 15],
#                 'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9],
#                 'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9]}
# grid_rf = GridSearchCV(estimator=RandomForestClassifier(), param_grid=params_rf, cv=5, verbose=1, n_jobs=-1)
# grid_rf.fit(X_train, y_train)
# print(f"Model: RF | Best params: {grid_rf.best_params_}\n")