<a href="https://colab.research.google.com/github/RochaErik/AlgorithmComparison/blob/main/AlgorithmComparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [86]:
!pip install catboost
!pip install lightgbm
!pip install xgboost



In [87]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

---

# **(Unbalanced) Wine Dataset**

---

In [3]:
wine_df = pd.read_csv('/content/drive/MyDrive/DatasetSeminario/Wine/wine.data', header=None)

In [4]:
wine_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [5]:
wine_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       178 non-null    int64  
 1   1       178 non-null    float64
 2   2       178 non-null    float64
 3   3       178 non-null    float64
 4   4       178 non-null    float64
 5   5       178 non-null    int64  
 6   6       178 non-null    float64
 7   7       178 non-null    float64
 8   8       178 non-null    float64
 9   9       178 non-null    float64
 10  10      178 non-null    float64
 11  11      178 non-null    float64
 12  12      178 non-null    float64
 13  13      178 non-null    int64  
dtypes: float64(11), int64(3)
memory usage: 19.6 KB


In [6]:
wine_df.nunique()

0       3
1     126
2     133
3      79
4      63
5      53
6      97
7     132
8      39
9     101
10    132
11     78
12    122
13    121
dtype: int64

In [7]:
# Unbalanced dataset

wine_df[0].value_counts()

2    71
1    59
3    48
Name: 0, dtype: int64

In [8]:
wine_df.isna().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
dtype: int64

In [9]:
X = wine_df.iloc[:, 1:]
y = wine_df.iloc[:, 0]

In [10]:
X.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [11]:
X.shape

(178, 13)

In [12]:
y.head()

0    1
1    1
2    1
3    1
4    1
Name: 0, dtype: int64

In [13]:
y.shape

(178,)

In [14]:
# Since version 1.3.2 XGBoost needs target columns to start with 0 value

le = LabelEncoder()
y = le.fit_transform(y)

In [88]:
names = [
          'AdaBoost',
          'CatBoost',
          'LightGBM',
          'XGBoost'
        ]

classifiers = [
                AdaBoostClassifier(),
                CatBoostClassifier(silent=True),
                LGBMClassifier(verbosity=-1),
                XGBClassifier()
              ]

In [89]:
rkf = RepeatedKFold(n_splits=10, n_repeats=10, random_state=42)

In [17]:
# wine_scores_mean = []
# wine_scores_std = []

# for name, clf in zip(names, classifiers):
#   results = cross_val_score(clf, X, y, cv=rkf)
#   wine_scores_mean.append(results.mean()*100)
#   wine_scores_std.append(results.std()*100)
#   print(f'--------- {name} on Wine Dataset ---------')
#   print(results)
#   print('Accuracy: %.2f%% (%.2f%%)' % (results.mean()*100, results.std()*100))
#   print('------------------------------')

In [18]:
# wine_scores_mean

In [19]:
# wine_scores_std

In [90]:
Algo_results = pd.DataFrame()
Algo_results['Name'] = names

In [21]:
# Algo_results['Wine'] = wine_scores_mean

In [22]:
# Algo_results

---

# **(Unbalanced) Breast Cancer Dataset**

---

In [23]:
breast_cancer_df = pd.read_csv('/content/drive/MyDrive/DatasetSeminario/BreastCancer/breastCancer.data', header=None, na_values='?')

In [24]:
breast_cancer_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no


In [25]:
print(breast_cancer_df.shape)

(286, 10)


In [26]:
column_names = ['Class', 'age', 'menopause', 'tumor_size', 'inv_nodes', 'node_caps', 'deg_malig', 'breast', 'breast_quad', 'irradiat']
breast_cancer_df.columns = column_names

In [27]:
breast_cancer_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Class        286 non-null    object
 1   age          286 non-null    object
 2   menopause    286 non-null    object
 3   tumor_size   286 non-null    object
 4   inv_nodes    286 non-null    object
 5   node_caps    278 non-null    object
 6   deg_malig    286 non-null    int64 
 7   breast       286 non-null    object
 8   breast_quad  285 non-null    object
 9   irradiat     286 non-null    object
dtypes: int64(1), object(9)
memory usage: 22.5+ KB


In [28]:
breast_cancer_df.nunique()

Class           2
age             6
menopause       3
tumor_size     11
inv_nodes       7
node_caps       2
deg_malig       3
breast          2
breast_quad     5
irradiat        2
dtype: int64

In [29]:
# Unbalanced dataset

breast_cancer_df['Class'].value_counts()

no-recurrence-events    201
recurrence-events        85
Name: Class, dtype: int64

In [30]:
breast_cancer_df.isna().sum()

Class          0
age            0
menopause      0
tumor_size     0
inv_nodes      0
node_caps      8
deg_malig      0
breast         0
breast_quad    1
irradiat       0
dtype: int64

In [31]:
# Checking the proportion of NaN values

missing_count = breast_cancer_df['node_caps'].isna().sum()
total_count = len(breast_cancer_df['node_caps'])
missing_ratio = missing_count / total_count
missing_ratio

0.027972027972027972

In [32]:
# The proportion of NaN values was low so we opt to remove the rows

breast_cancer_df.dropna(inplace=True)

In [33]:
breast_cancer_df.isna().sum()

Class          0
age            0
menopause      0
tumor_size     0
inv_nodes      0
node_caps      0
deg_malig      0
breast         0
breast_quad    0
irradiat       0
dtype: int64

In [34]:
age_dummies = {
                '10-19': 0,
                '20-29': 1,
                '30-39': 2,
                '40-49': 3,
                '50-59': 4,
                '60-69': 5,
                '70-79': 6,
                '80-89': 7,
                '90-99': 8
              }
breast_cancer_df['age'].replace(age_dummies, inplace=True)

In [35]:
breast_cancer_df['age']

0      2
1      3
2      3
3      5
4      3
      ..
281    2
282    2
283    5
284    3
285    4
Name: age, Length: 277, dtype: int64

In [36]:
size_dummies = {
                '0-4': 0,
                '5-9': 1,
                '10-14': 2,
                '15-19': 3,
                '20-24': 4,
                '25-29': 5,
                '30-34': 6,
                '35-39': 7,
                '40-44': 8,
                '45-49': 9,
                '50-54': 10,
                '55-59': 11
                }
breast_cancer_df['tumor_size'].replace(size_dummies, inplace=True)

In [37]:
breast_cancer_df['tumor_size']

0      6
1      4
2      4
3      3
4      0
      ..
281    6
282    4
283    4
284    6
285    6
Name: tumor_size, Length: 277, dtype: int64

In [38]:
invnodes_dummies = {
                '0-2': 0,
                '3-5': 1,
                '6-8': 2,
                '9-11': 3,
                '12-14': 4,
                '15-17': 5,
                '18-20': 6,
                '21-23': 7,
                '24-26': 8,
                '27-29': 9,
                '30-32': 10,
                '33-35': 11,
                '36-39': 12,
              }
breast_cancer_df['inv_nodes'].replace(invnodes_dummies, inplace=True)

In [39]:
breast_cancer_df['inv_nodes']

0      0
1      0
2      0
3      0
4      0
      ..
281    0
282    0
283    0
284    1
285    1
Name: inv_nodes, Length: 277, dtype: int64

In [40]:
# Important: This isnt recommended for CatBoost. One-hot encoding during preprocessing affects both the training speed and
# the resulting quality.

dummies = pd.get_dummies(breast_cancer_df[['menopause', 'node_caps', 'breast', 'breast_quad', 'irradiat']], drop_first=True)
breast_cancer_df = pd.concat([breast_cancer_df, dummies], axis=1)
breast_cancer_df.head(5)

Unnamed: 0,Class,age,menopause,tumor_size,inv_nodes,node_caps,deg_malig,breast,breast_quad,irradiat,menopause_lt40,menopause_premeno,node_caps_yes,breast_right,breast_quad_left_low,breast_quad_left_up,breast_quad_right_low,breast_quad_right_up,irradiat_yes
0,no-recurrence-events,2,premeno,6,0,no,3,left,left_low,no,0,1,0,0,1,0,0,0,0
1,no-recurrence-events,3,premeno,4,0,no,2,right,right_up,no,0,1,0,1,0,0,0,1,0
2,no-recurrence-events,3,premeno,4,0,no,2,left,left_low,no,0,1,0,0,1,0,0,0,0
3,no-recurrence-events,5,ge40,3,0,no,2,right,left_up,no,0,0,0,1,0,1,0,0,0
4,no-recurrence-events,3,premeno,0,0,no,2,right,right_low,no,0,1,0,1,0,0,1,0,0


In [41]:
breast_cancer_df.drop(['menopause', 'node_caps', 'breast', 'breast_quad', 'irradiat'], axis=1, inplace=True)

In [42]:
breast_cancer_df.head(5)

Unnamed: 0,Class,age,tumor_size,inv_nodes,deg_malig,menopause_lt40,menopause_premeno,node_caps_yes,breast_right,breast_quad_left_low,breast_quad_left_up,breast_quad_right_low,breast_quad_right_up,irradiat_yes
0,no-recurrence-events,2,6,0,3,0,1,0,0,1,0,0,0,0
1,no-recurrence-events,3,4,0,2,0,1,0,1,0,0,0,1,0
2,no-recurrence-events,3,4,0,2,0,1,0,0,1,0,0,0,0
3,no-recurrence-events,5,3,0,2,0,0,0,1,0,1,0,0,0
4,no-recurrence-events,3,0,0,2,0,1,0,1,0,0,1,0,0


In [43]:
X = breast_cancer_df.iloc[:, 1:]
y = breast_cancer_df.iloc[:, 0]

In [44]:
X.head()

Unnamed: 0,age,tumor_size,inv_nodes,deg_malig,menopause_lt40,menopause_premeno,node_caps_yes,breast_right,breast_quad_left_low,breast_quad_left_up,breast_quad_right_low,breast_quad_right_up,irradiat_yes
0,2,6,0,3,0,1,0,0,1,0,0,0,0
1,3,4,0,2,0,1,0,1,0,0,0,1,0
2,3,4,0,2,0,1,0,0,1,0,0,0,0
3,5,3,0,2,0,0,0,1,0,1,0,0,0
4,3,0,0,2,0,1,0,1,0,0,1,0,0


In [45]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 277 entries, 0 to 285
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   age                    277 non-null    int64
 1   tumor_size             277 non-null    int64
 2   inv_nodes              277 non-null    int64
 3   deg_malig              277 non-null    int64
 4   menopause_lt40         277 non-null    uint8
 5   menopause_premeno      277 non-null    uint8
 6   node_caps_yes          277 non-null    uint8
 7   breast_right           277 non-null    uint8
 8   breast_quad_left_low   277 non-null    uint8
 9   breast_quad_left_up    277 non-null    uint8
 10  breast_quad_right_low  277 non-null    uint8
 11  breast_quad_right_up   277 non-null    uint8
 12  irradiat_yes           277 non-null    uint8
dtypes: int64(4), uint8(9)
memory usage: 13.3 KB


In [46]:
y.head()

0    no-recurrence-events
1    no-recurrence-events
2    no-recurrence-events
3    no-recurrence-events
4    no-recurrence-events
Name: Class, dtype: object

In [47]:
# Since version 1.3.2 XGBoost needs target columns to start with 0 value

le = LabelEncoder()
y = le.fit_transform(y)

In [48]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [49]:
# breast_cancer_mean = []
# breast_cancer_std = []

# for name, clf in zip(names, classifiers):
#   results = cross_val_score(clf, X, y, cv=rkf)
#   breast_cancer_mean.append(results.mean()*100)
#   breast_cancer_std.append(results.std()*100)
#   print(f'--------- {name} on Breast Cancer Dataset ---------')
#   print(results)
#   print('Accuracy: %.2f%% (%.2f%%)' % (results.mean()*100, results.std()*100))
#   print('------------------------------')

In [50]:
# breast_cancer_mean

NameError: ignored

In [None]:
# breast_cancer_std

In [None]:
# Algo_results['Breast_Cancer'] = breast_cancer_mean

---

# **Sonar Dataset** #

---

In [51]:
sonar_df = pd.read_csv('/content/drive/MyDrive/DatasetSeminario/Sonar/Sonar.csv', header=None)

In [52]:
sonar_df.shape

(208, 61)

In [53]:
sonar_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,0.02,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,...,0.0027,0.0065,0.0159,0.0072,0.0167,0.018,0.0084,0.009,0.0032,R
1,0.0453,0.0523,0.0843,0.0689,0.1183,0.2583,0.2156,0.3481,0.3337,0.2872,...,0.0084,0.0089,0.0048,0.0094,0.0191,0.014,0.0049,0.0052,0.0044,R
2,0.0262,0.0582,0.1099,0.1083,0.0974,0.228,0.2431,0.3771,0.5598,0.6194,...,0.0232,0.0166,0.0095,0.018,0.0244,0.0316,0.0164,0.0095,0.0078,R
3,0.01,0.0171,0.0623,0.0205,0.0205,0.0368,0.1098,0.1276,0.0598,0.1264,...,0.0121,0.0036,0.015,0.0085,0.0073,0.005,0.0044,0.004,0.0117,R
4,0.0762,0.0666,0.0481,0.0394,0.059,0.0649,0.1209,0.2467,0.3564,0.4459,...,0.0031,0.0054,0.0105,0.011,0.0015,0.0072,0.0048,0.0107,0.0094,R


In [54]:
sonar_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208 entries, 0 to 207
Data columns (total 61 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       208 non-null    float64
 1   1       208 non-null    float64
 2   2       208 non-null    float64
 3   3       208 non-null    float64
 4   4       208 non-null    float64
 5   5       208 non-null    float64
 6   6       208 non-null    float64
 7   7       208 non-null    float64
 8   8       208 non-null    float64
 9   9       208 non-null    float64
 10  10      208 non-null    float64
 11  11      208 non-null    float64
 12  12      208 non-null    float64
 13  13      208 non-null    float64
 14  14      208 non-null    float64
 15  15      208 non-null    float64
 16  16      208 non-null    float64
 17  17      208 non-null    float64
 18  18      208 non-null    float64
 19  19      208 non-null    float64
 20  20      208 non-null    float64
 21  21      208 non-null    float64
 22  22

In [55]:
# A little unbalanced dataset

sonar_df[60].value_counts()

M    111
R     97
Name: 60, dtype: int64

In [56]:
X = sonar_df.iloc[:, :-1]
y = sonar_df.iloc[:, -1]

In [57]:
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,50,51,52,53,54,55,56,57,58,59
0,0.02,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,...,0.0232,0.0027,0.0065,0.0159,0.0072,0.0167,0.018,0.0084,0.009,0.0032
1,0.0453,0.0523,0.0843,0.0689,0.1183,0.2583,0.2156,0.3481,0.3337,0.2872,...,0.0125,0.0084,0.0089,0.0048,0.0094,0.0191,0.014,0.0049,0.0052,0.0044
2,0.0262,0.0582,0.1099,0.1083,0.0974,0.228,0.2431,0.3771,0.5598,0.6194,...,0.0033,0.0232,0.0166,0.0095,0.018,0.0244,0.0316,0.0164,0.0095,0.0078
3,0.01,0.0171,0.0623,0.0205,0.0205,0.0368,0.1098,0.1276,0.0598,0.1264,...,0.0241,0.0121,0.0036,0.015,0.0085,0.0073,0.005,0.0044,0.004,0.0117
4,0.0762,0.0666,0.0481,0.0394,0.059,0.0649,0.1209,0.2467,0.3564,0.4459,...,0.0156,0.0031,0.0054,0.0105,0.011,0.0015,0.0072,0.0048,0.0107,0.0094


In [58]:
y.head()

0    R
1    R
2    R
3    R
4    R
Name: 60, dtype: object

In [59]:
# Since version 1.3.2 XGBoost needs target columns to start with 0 value

le = LabelEncoder()
y = le.fit_transform(y)

In [60]:
# sonar_mean = []
# sonar_std = []

# for name, clf in zip(names, classifiers):
#   results = cross_val_score(clf, X, y, cv=rkf)
#   sonar_mean.append(results.mean()*100)
#   sonar_std.append(results.std()*100)
#   print(f'--------- {name} on Sonar Dataset ---------')
#   print(results)
#   print('Accuracy: %.2f%% (%.2f%%)' % (results.mean()*100, results.std()*100))
#   print('------------------------------')

In [61]:
# sonar_mean

NameError: ignored

In [None]:
# sonar_std

In [None]:
Algo_results['Sonar'] = sonar_mean

In [None]:
Algo_results

---

# **(Unbalanced) Ionosphere Dataset**

---

In [100]:
ionosphere_df = pd.read_csv('/content/drive/MyDrive/DatasetSeminario/Ionosphere/ionosphere.data', header=None)

In [101]:
ionosphere_df.shape

(351, 35)

In [102]:
ionosphere_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,1,0,0.99539,-0.05889,0.85243,0.02306,0.83398,-0.37708,1.0,0.0376,...,-0.51171,0.41078,-0.46168,0.21266,-0.3409,0.42267,-0.54487,0.18641,-0.453,g
1,1,0,1.0,-0.18829,0.93035,-0.36156,-0.10868,-0.93597,1.0,-0.04549,...,-0.26569,-0.20468,-0.18401,-0.1904,-0.11593,-0.16626,-0.06288,-0.13738,-0.02447,b
2,1,0,1.0,-0.03365,1.0,0.00485,1.0,-0.12062,0.88965,0.01198,...,-0.4022,0.58984,-0.22145,0.431,-0.17365,0.60436,-0.2418,0.56045,-0.38238,g
3,1,0,1.0,-0.45161,1.0,1.0,0.71216,-1.0,0.0,0.0,...,0.90695,0.51613,1.0,1.0,-0.20099,0.25682,1.0,-0.32382,1.0,b
4,1,0,1.0,-0.02401,0.9414,0.06531,0.92106,-0.23255,0.77152,-0.16399,...,-0.65158,0.1329,-0.53206,0.02431,-0.62197,-0.05707,-0.59573,-0.04608,-0.65697,g


In [103]:
ionosphere_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 351 entries, 0 to 350
Data columns (total 35 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       351 non-null    int64  
 1   1       351 non-null    int64  
 2   2       351 non-null    float64
 3   3       351 non-null    float64
 4   4       351 non-null    float64
 5   5       351 non-null    float64
 6   6       351 non-null    float64
 7   7       351 non-null    float64
 8   8       351 non-null    float64
 9   9       351 non-null    float64
 10  10      351 non-null    float64
 11  11      351 non-null    float64
 12  12      351 non-null    float64
 13  13      351 non-null    float64
 14  14      351 non-null    float64
 15  15      351 non-null    float64
 16  16      351 non-null    float64
 17  17      351 non-null    float64
 18  18      351 non-null    float64
 19  19      351 non-null    float64
 20  20      351 non-null    float64
 21  21      351 non-null    float64
 22  22

In [104]:
ionosphere_df.nunique()

0       2
1       1
2     219
3     269
4     204
5     259
6     231
7     260
8     244
9     267
10    246
11    269
12    238
13    266
14    234
15    270
16    254
17    280
18    254
19    266
20    248
21    265
22    248
23    264
24    256
25    273
26    256
27    281
28    244
29    266
30    243
31    263
32    245
33    263
34      2
dtype: int64

In [105]:
ionosphere_df.isna().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
30    0
31    0
32    0
33    0
34    0
dtype: int64

In [106]:
# This is a unbalanced dataset

ionosphere_df[34].value_counts()

g    225
b    126
Name: 34, dtype: int64

In [154]:
X = ionosphere_df.iloc[:, :-1]
y = ionosphere_df.iloc[:, -1]

NameError: ignored

In [78]:
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
0,1,0,0.99539,-0.05889,0.85243,0.02306,0.83398,-0.37708,1.0,0.0376,...,0.56811,-0.51171,0.41078,-0.46168,0.21266,-0.3409,0.42267,-0.54487,0.18641,-0.453
1,1,0,1.0,-0.18829,0.93035,-0.36156,-0.10868,-0.93597,1.0,-0.04549,...,-0.20332,-0.26569,-0.20468,-0.18401,-0.1904,-0.11593,-0.16626,-0.06288,-0.13738,-0.02447
2,1,0,1.0,-0.03365,1.0,0.00485,1.0,-0.12062,0.88965,0.01198,...,0.57528,-0.4022,0.58984,-0.22145,0.431,-0.17365,0.60436,-0.2418,0.56045,-0.38238
3,1,0,1.0,-0.45161,1.0,1.0,0.71216,-1.0,0.0,0.0,...,1.0,0.90695,0.51613,1.0,1.0,-0.20099,0.25682,1.0,-0.32382,1.0
4,1,0,1.0,-0.02401,0.9414,0.06531,0.92106,-0.23255,0.77152,-0.16399,...,0.03286,-0.65158,0.1329,-0.53206,0.02431,-0.62197,-0.05707,-0.59573,-0.04608,-0.65697


In [79]:
y.head()

0    g
1    b
2    g
3    b
4    g
Name: 34, dtype: object

In [80]:
# Since version 1.3.2 XGBoost needs target columns to start with 0 value

le = LabelEncoder()
y = le.fit_transform(y)

In [81]:
# ionosphere_mean = []
# ionosphere_std = []

# for name, clf in zip(names, classifiers):
#   results = cross_val_score(clf, X, y, cv=rkf)
#   ionosphere_mean.append(results.mean()*100)
#   ionosphere_std.append(results.std()*100)
#   print(f'--------- {name} on Ionosphere Dataset ---------')
#   print(results)
#   print('Accuracy: %.2f%% (%.2f%%)' % (results.mean()*100, results.std()*100))
#   print('------------------------------')

--------- AdaBoost on Ionosphere Dataset ---------
[0.88888889 0.91428571 0.97142857 0.91428571 0.97142857 0.94285714
 0.94285714 1.         0.88571429 0.94285714 0.88888889 0.85714286
 0.97142857 1.         0.97142857 0.94285714 0.94285714 0.94285714
 0.85714286 0.91428571 0.94444444 0.97142857 0.88571429 0.88571429
 0.97142857 0.97142857 0.91428571 0.94285714 0.91428571 0.94285714
 0.91666667 0.94285714 0.97142857 0.88571429 0.85714286 0.94285714
 1.         0.97142857 0.88571429 0.97142857 0.91666667 0.97142857
 0.97142857 0.85714286 0.85714286 0.91428571 0.94285714 0.97142857
 0.97142857 0.97142857 0.91666667 0.91428571 0.94285714 1.
 0.97142857 0.91428571 0.94285714 0.85714286 0.97142857 0.88571429
 0.86111111 0.91428571 0.94285714 0.88571429 0.91428571 0.91428571
 0.91428571 1.         1.         0.91428571 0.80555556 1.
 0.97142857 0.85714286 0.94285714 0.91428571 0.97142857 0.85714286
 1.         0.91428571 0.97222222 0.97142857 0.88571429 0.91428571
 0.85714286 0.94285714 0.94

In [82]:
ionosphere_mean

[93.02777777777777, 93.4531746031746, 93.7063492063492, 92.43015873015872]

In [83]:
ionosphere_std

[4.4762520399909125, 4.589216591001476, 4.414603619768994, 5.314317716237489]

In [84]:
Algo_results['Ionosphere'] = ionosphere_mean

In [85]:
Algo_results

Unnamed: 0,Name,Ionosphere
0,AdaBoost,93.027778
1,CatBoost,93.453175
2,LightGBM,93.706349
3,XGBoost,92.430159


---

# **(Unbalanced) Tic-Tac-Toe Dataset**

---

In [129]:
tictactoe_df = pd.read_csv('/content/drive/MyDrive/DatasetSeminario/TicTacToe/TicTacToe.data', header=None)

In [130]:
tictactoe_df.shape

(958, 10)

In [131]:
tictactoe_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,x,x,x,x,o,o,x,o,o,positive
1,x,x,x,x,o,o,o,x,o,positive
2,x,x,x,x,o,o,o,o,x,positive
3,x,x,x,x,o,o,o,b,b,positive
4,x,x,x,x,o,o,b,o,b,positive


In [132]:
tictactoe_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 958 entries, 0 to 957
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       958 non-null    object
 1   1       958 non-null    object
 2   2       958 non-null    object
 3   3       958 non-null    object
 4   4       958 non-null    object
 5   5       958 non-null    object
 6   6       958 non-null    object
 7   7       958 non-null    object
 8   8       958 non-null    object
 9   9       958 non-null    object
dtypes: object(10)
memory usage: 75.0+ KB


In [133]:
tictactoe_df.nunique()

0    3
1    3
2    3
3    3
4    3
5    3
6    3
7    3
8    3
9    2
dtype: int64

In [134]:
tictactoe_df[9].value_counts()

positive    626
negative    332
Name: 9, dtype: int64

In [135]:
dummies = {
            'x': 0,
            'o': 1,
            'b': 2,
          }
tictactoe_df = tictactoe_df.iloc[:, 0: 9].replace(dummies)

In [136]:
tictactoe_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0,0,0,0,1,1,0,1,1
1,0,0,0,0,1,1,1,0,1
2,0,0,0,0,1,1,1,1,0
3,0,0,0,0,1,1,1,2,2
4,0,0,0,0,1,1,2,1,2


In [138]:
X = tictactoe_df.iloc[:, :-1]
y = tictactoe_df.iloc[:, -1]

In [139]:
# Since version 1.3.2 XGBoost needs target columns to start with 0 value

le = LabelEncoder()
y = le.fit_transform(y)

In [140]:
tictactoe_mean = []
tictactoe_std = []

for name, clf in zip(names, classifiers):
  results = cross_val_score(clf, X, y, cv=rkf)
  tictactoe_mean.append(results.mean()*100)
  tictactoe_std.append(results.std()*100)
  print(f'--------- {name} on Tictactoe Dataset ---------')
  print(results)
  print('Accuracy: %.2f%% (%.2f%%)' % (results.mean()*100, results.std()*100))
  print('------------------------------')

--------- AdaBoost on Tictactoe Dataset ---------
[0.6875     0.78125    0.67708333 0.71875    0.75       0.73958333
 0.66666667 0.75       0.75789474 0.70526316 0.8125     0.73958333
 0.80208333 0.64583333 0.80208333 0.69791667 0.66666667 0.82291667
 0.63157895 0.70526316 0.77083333 0.66666667 0.67708333 0.72916667
 0.77083333 0.78125    0.71875    0.79166667 0.76842105 0.69473684
 0.71875    0.72916667 0.75       0.79166667 0.79166667 0.75
 0.75       0.63541667 0.83157895 0.74736842 0.79166667 0.75
 0.70833333 0.76041667 0.6875     0.80208333 0.78125    0.73958333
 0.76842105 0.62105263 0.67708333 0.625      0.79166667 0.6875
 0.77083333 0.625      0.76041667 0.85416667 0.77894737 0.77894737
 0.78125    0.64583333 0.70833333 0.72916667 0.73958333 0.69791667
 0.76041667 0.875      0.71578947 0.77894737 0.76041667 0.71875
 0.67708333 0.75       0.67708333 0.78125    0.70833333 0.78125
 0.76842105 0.73684211 0.72916667 0.76041667 0.77083333 0.71875
 0.65625    0.77083333 0.70833333 0.7

In [141]:
tictactoe_mean

[73.7074561403509, 51.491228070175445, 55.961513157894736, 45.98881578947368]

In [142]:
tictactoe_std

[5.225564359226885, 5.42277316668588, 5.240957540685767, 5.121433257470645]

In [None]:
Algo_results['TicTacToe'] = tictactoe_mean

In [None]:
Algo_results

---

# **(Unbalanced) Bupa**

---

In [146]:
bupa_df = pd.read_csv('/content/drive/MyDrive/DatasetSeminario/Bupa/Bupa.data', header=None)

In [147]:
bupa_df.shape

(345, 7)

In [148]:
bupa_df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,85,92,45,27,31,0.0,1
1,85,64,59,32,23,0.0,2
2,86,54,33,16,54,0.0,2
3,91,78,34,24,36,0.0,2
4,87,70,12,28,10,0.0,2


In [149]:
bupa_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 345 entries, 0 to 344
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       345 non-null    int64  
 1   1       345 non-null    int64  
 2   2       345 non-null    int64  
 3   3       345 non-null    int64  
 4   4       345 non-null    int64  
 5   5       345 non-null    float64
 6   6       345 non-null    int64  
dtypes: float64(1), int64(6)
memory usage: 19.0 KB


In [151]:
bupa_df.nunique()

0    26
1    78
2    67
3    47
4    94
5    16
6     2
dtype: int64

In [152]:
bupa_df.isna().sum()

0    0
1    0
2    0
3    0
4    0
5    0
6    0
dtype: int64

In [153]:
# Unbalanced dataset

bupa_df[6].value_counts()

2    200
1    145
Name: 6, dtype: int64

In [155]:
X = bupa_df.iloc[:, :-1]
y = bupa_df.iloc[:, -1]

In [156]:
X.head()

Unnamed: 0,0,1,2,3,4,5
0,85,92,45,27,31,0.0
1,85,64,59,32,23,0.0
2,86,54,33,16,54,0.0
3,91,78,34,24,36,0.0
4,87,70,12,28,10,0.0


In [157]:
y.head()

0    1
1    2
2    2
3    2
4    2
Name: 6, dtype: int64

In [158]:
# Since version 1.3.2 XGBoost needs target columns to start with 0 value

le = LabelEncoder()
y = le.fit_transform(y)

In [159]:
bupa_mean = []
bupa_std = []

for name, clf in zip(names, classifiers):
  results = cross_val_score(clf, X, y, cv=rkf)
  bupa_mean.append(results.mean()*100)
  bupa_std.append(results.std()*100)
  print(f'--------- {name} on Bupa Dataset ---------')
  print(results)
  print('Accuracy: %.2f%% (%.2f%%)' % (results.mean()*100, results.std()*100))
  print('------------------------------')

--------- AdaBoost on Bupa Dataset ---------
[0.8        0.8        0.65714286 0.74285714 0.71428571 0.64705882
 0.67647059 0.79411765 0.70588235 0.64705882 0.71428571 0.77142857
 0.71428571 0.65714286 0.65714286 0.79411765 0.55882353 0.76470588
 0.70588235 0.82352941 0.74285714 0.62857143 0.71428571 0.74285714
 0.68571429 0.73529412 0.79411765 0.82352941 0.82352941 0.58823529
 0.62857143 0.74285714 0.8        0.62857143 0.71428571 0.79411765
 0.70588235 0.67647059 0.67647059 0.55882353 0.74285714 0.77142857
 0.74285714 0.71428571 0.77142857 0.79411765 0.70588235 0.73529412
 0.73529412 0.70588235 0.68571429 0.8        0.62857143 0.82857143
 0.71428571 0.67647059 0.64705882 0.70588235 0.85294118 0.73529412
 0.62857143 0.62857143 0.77142857 0.68571429 0.8        0.61764706
 0.61764706 0.85294118 0.76470588 0.76470588 0.71428571 0.68571429
 0.82857143 0.74285714 0.71428571 0.76470588 0.70588235 0.85294118
 0.61764706 0.70588235 0.88571429 0.6        0.74285714 0.71428571
 0.74285714 0.735

In [160]:
bupa_mean

[72.25462184873949, 74.40336134453781, 71.8016806722689, 70.57983193277309]

In [161]:
bupa_std

[7.026728304847883, 6.450640429588332, 6.215299205426823, 6.773062326505143]

In [162]:
Algo_results['Bupa'] = bupa_mean

In [163]:
Algo_results

Unnamed: 0,Name,Bupa
0,AdaBoost,72.254622
1,CatBoost,74.403361
2,LightGBM,71.801681
3,XGBoost,70.579832
