<a href="https://colab.research.google.com/github/RochaErik/AlgorithmComparison/blob/main/AlgorithmComparison3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

---

# **Comparison of algorithm performance on the balanced and unbalanced datasets**

---

In [None]:
!pip install catboost
!pip install lightgbm
!pip install xgboost

Collecting catboost
  Downloading catboost-1.2.1-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.1


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats as stats
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [None]:
# Read the two result tables that are compared row by row further down.
# NOTE(review): the "Strat" file presumably holds the stratified/balanced runs -- confirm.
unbalanced_df, balanced_df = (
    pd.read_csv(results_csv)
    for results_csv in (
        '/content/drive/MyDrive/DatasetSeminario/Algo_results/AlgoResults.csv',
        '/content/drive/MyDrive/DatasetSeminario/Algo_results/StratAlgoResults.csv',
    )
)

In [None]:
# Show (rows, columns) of each table; the paired test below reads them row by row.
for results_frame in (unbalanced_df, balanced_df):
    print(results_frame.shape)

(9, 5)
(9, 5)


In [None]:
# Display names used in the report; position i labels row i of the result tables.
datasets = [
    'Wine',
    'Breast Cancer',
    'Sonar',
    'Ionosphere',
    'TicTacToe',
    'Bupa',
    'Pima',
    'Heart',
    'Liver',
]

In [None]:
# Significance level shared by every per-dataset test (hoisted: it never changes).
alpha = 0.05

# NOTE(review): the shapes printed above are (9, 5), i.e. 5 paired values per
# dataset. An exact two-sided Wilcoxon signed-rank test on 5 pairs cannot yield a
# p-value below 2 / 2**5 = 0.0625, so `p_value < alpha` can never hold with
# alpha = 0.05 -- confirm whether more paired observations (e.g. per-fold
# scores) should be compared instead.
for i, name in enumerate(datasets):
  # Paired samples for one dataset: row i of each table.
  # Assumes the CSV row order matches `datasets` -- TODO confirm.
  before = unbalanced_df.iloc[i, :]
  after = balanced_df.iloc[i, :]

  print (f'Comparison between {name} models results')

  # If every pair is identical there is no non-zero difference to rank, so the
  # signed-rank test has no data to work with (SciPy releases that validate this
  # case raise ValueError). Report the tie instead of calling the test.
  if np.array_equal(before.to_numpy(), after.to_numpy()):
      print(f'All paired results are identical: There is nothing to test between {name} models.')
      print ('- - - - - - - - - - - - - - - - - - - - -')
      continue

  # Performing the Wilcoxon signed-rank test
  statistic, p_value = stats.wilcoxon(before, after)

  # Printing the test statistic and p-value
  print(f"Test Statistic: {statistic}")
  print(f"P-value: {p_value}")

  # Interpreting the results
  if p_value < alpha:
      print(f'Reject the null hypothesis: There is a significant difference between {name} models.')
  else:
      print(f'Fail to reject the null hypothesis: There is no significant difference between {name} models.')
  print ('- - - - - - - - - - - - - - - - - - - - -')

Comparison between Wine models results
Test Statistic: 7.0
P-value: 1.0
Fail to reject the null hypothesis: There is no significant difference between Wine models.
- - - - - - - - - - - - - - - - - - - - -
Comparison between Breast Cancer models results
Test Statistic: 6.0
P-value: 0.8125
Fail to reject the null hypothesis: There is no significant difference between Breast Cancer models.
- - - - - - - - - - - - - - - - - - - - -
Comparison between Sonar models results
Test Statistic: 1.0
P-value: 0.125
Fail to reject the null hypothesis: There is no significant difference between Sonar models.
- - - - - - - - - - - - - - - - - - - - -
Comparison between Ionosphere models results
Test Statistic: 1.0
P-value: 0.125
Fail to reject the null hypothesis: There is no significant difference between Ionosphere models.
- - - - - - - - - - - - - - - - - - - - -
Comparison between TicTacToe models results
Test Statistic: 3.0
P-value: 0.3125
Fail to reject the null hypothesis: There is no significa

---

# **Evaluating algorithms with hyperparameter tuning**

---

In [None]:
pip install hyperopt



In [None]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# **Wine Dataset**

In [None]:
# header=None: the first line is read as data and columns get integer labels.
wine_df = pd.read_csv(
    '/content/drive/MyDrive/DatasetSeminario/Wine/wine.data',
    header=None,
)

In [None]:
# Column 0 is taken as the target; every remaining column is a feature.
X, y = wine_df.iloc[:, 1:], wine_df.iloc[:, 0]

In [None]:
# Fit the encoder on the raw class labels, then replace y with its integer codes.
le = LabelEncoder().fit(y)
y = le.transform(y)

In [None]:
# Hold out 20% of the samples; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
import hyperopt.pyll
from hyperopt.pyll import scope

# Define the hyperparameter search space for each algorithm

# def optimize_adaboost(params):
#     clf = AdaBoostClassifier(**params)
#     clf.fit(X_train, y_train)
#     y_pred = clf.predict(X_test)
#     return -accuracy_score(y_test, y_pred)

def _negative_cv_accuracy(clf, n_splits=5):
    """Return minus the mean cross-validated accuracy of ``clf`` on the training split.

    Hyperparameters are scored with cross-validation on ``X_train``/``y_train``
    only. Scoring them on ``X_test``/``y_test`` would leak the hold-out set into
    model selection, and on a small test set many configurations tie at 100%.

    Args:
        clf: Unfitted scikit-learn compatible classifier.
        n_splits: Number of cross-validation folds passed as ``cv``.

    Returns:
        float: Negated mean accuracy, so that a minimiser maximises accuracy.
    """
    fold_scores = cross_val_score(clf, X_train, y_train, cv=n_splits, scoring='accuracy')
    return -float(fold_scores.mean())


def optimize_adaboost(params):
    """Objective for AdaBoost: negated cross-validated accuracy.

    Args:
        params: Sampled search-space dict with the keys ``'estimator'`` (kwargs
            for the base ``DecisionTreeClassifier``), ``'n_estimators'``,
            ``'learning_rate'`` and ``'random_state'``.

    Returns:
        float: Loss to minimise (minus the mean accuracy).
    """
    estimator_params = params['estimator']
    estimator = DecisionTreeClassifier(**estimator_params)

    clf = AdaBoostClassifier(
        estimator=estimator,
        n_estimators=params['n_estimators'],
        learning_rate=params['learning_rate'],
        random_state=params['random_state'],
    )
    return _negative_cv_accuracy(clf)


def optimize_gradientboost(params):
    """Objective for GradientBoostingClassifier; ``params`` are its constructor kwargs."""
    return _negative_cv_accuracy(GradientBoostingClassifier(**params))


def optimize_catboost(params):
    """Objective for CatBoostClassifier; ``params`` are its constructor kwargs."""
    return _negative_cv_accuracy(CatBoostClassifier(**params))


def optimize_lightgbm(params):
    """Objective for LGBMClassifier; ``params`` are its constructor kwargs."""
    return _negative_cv_accuracy(LGBMClassifier(**params))


def optimize_xgboost(params):
    """Objective for XGBClassifier; ``params`` are its constructor kwargs."""
    return _negative_cv_accuracy(XGBClassifier(**params))

# Define the hyperparameter search space for each algorithm

# space_adaboost = {
#     'n_estimators': 1 + scope.int(hp.quniform('n_estimators', 5, 1500, 50)),
#     'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.1)),
#     'estimator': {
#         'max_depth': scope.int(hp.quniform('max_depth', 1, 6, 1)),  # Decision tree depth
#         'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 8, 2)),  # Min samples required to split
#         'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 5, 1)),  # Min samples required in a leaf node
#         'max_features': hp.choice('max_features', [None, 'sqrt', 'log2']),
#     },
#     'random_state': 42
# }

# space_gradientboost = {
#     'n_estimators': hp.choice('n_estimators', range(50, 200)),
#     'learning_rate': hp.loguniform('learning_rate', -3, 0),
#     'max_depth': hp.choice('max_depth', range(1, 11)),
# }

# space_catboost = {
#     'iterations': hp.choice('iterations', range(50, 200)),
#     'learning_rate': hp.loguniform('learning_rate', -3, 0),
#     'silent': True
# }

# space_lightgbm = {
#     'n_estimators': hp.choice('n_estimators', range(50, 200)),
#     'learning_rate': hp.loguniform('learning_rate', -3, 0),
#     'max_depth': hp.choice('max_depth', range(1, 11)),
#     'verbosity': -1
# }

# Search space for LGBMClassifier. Keys are constructor arguments; each hyperopt
# label (first argument of hp.*) repeats its key so that the dict returned by
# fmin uses the real parameter names.
space_lightgbm = {
    'class_weight': hp.choice('class_weight', [None, 'balanced']),
    'boosting_type': hp.choice('boosting_type', ['gbdt', 'dart', 'goss']),
    # quniform samples floats on a grid; scope.int casts them to the ints LightGBM expects
    'num_leaves': scope.int(hp.quniform('num_leaves', 30, 100, 5)),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.1)),
    'min_child_samples': scope.int(hp.quniform('min_child_samples', 20, 200, 10)),
    'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.0),
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0),
    # label previously read 'colsample_by_tree', which did not match the parameter name
    'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0),
    # constants: passed unchanged to every trial
    'verbosity': -1,
    'random_state': 42
}

# space_xgboost = {
#     'n_estimators': hp.choice('n_estimators', range(50, 200)),
#     'learning_rate': hp.loguniform('learning_rate', -3, 0),
#     'max_depth': hp.choice('max_depth', range(1, 11)),
# }

from hyperopt import space_eval

# Define optimization functions and algorithm names
optimizers = [
    # (optimize_adaboost, space_adaboost, 'AdaBoost'),
    # (optimize_gradientboost, space_gradientboost, 'Gradient Boosting'),
    # (optimize_catboost, space_catboost, 'CatBoost'),
    (optimize_lightgbm, space_lightgbm, 'LightGBM'),
    # (optimize_xgboost, space_xgboost, 'XGBoost'),
]

# Perform hyperparameter tuning for each algorithm
for optimize_fn, space, algorithm_name in optimizers:
    trials = Trials()
    best = fmin(fn=optimize_fn, space=space, algo=tpe.suggest, max_evals=50, trials=trials)

    # fmin reports hp.choice parameters as indices into their option lists, leaves
    # quniform values as floats and omits the constant entries. space_eval maps
    # the result back through `space`, giving values that can be passed straight
    # to the classifier constructor.
    best_params = space_eval(space, best)

    print(f"Best hyperparameters for {algorithm_name}:")
    print(best_params)


100%|██████████| 50/50 [00:03<00:00, 14.38trial/s, best loss: -1.0]
Best hyperparameters for LightGBM:
{'boosting_type': 1, 'class_weight': 0, 'colsample_by_tree': 0.7120419182084488, 'learning_rate': 0.06977031230342506, 'min_child_samples': 20.0, 'num_leaves': 60.0, 'reg_alpha': 0.8045671848719448, 'reg_lambda': 0.6867854908439323}


In [None]:
# 10-fold stratified cross-validation repeated 10 times (100 fits per model);
# the fixed random_state makes the fold assignment repeatable.
# RepeatedStratifiedKFold comes from sklearn.model_selection and must be imported
# alongside the other scikit-learn imports, otherwise this line raises NameError.
rskf = RepeatedStratifiedKFold(
    n_splits=10,
    n_repeats=10,
    random_state=42,
)

In [None]:
# Per-model accumulators for the Wine dataset: raw fold scores, mean and standard
# deviation (both in percent) and the model label, all in the same order.
wine_scores, wine_scores_mean, wine_scores_std, model_names = [], [], [], []

# NOTE(review): `names` and `classifiers` are not defined in the visible cells --
# confirm they are created before this cell runs.
for name, clf in zip(names, classifiers):
  results = cross_val_score(clf, X, y, cv=rskf)
  mean_pct = results.mean()*100
  std_pct = results.std()*100

  wine_scores.append(results)
  wine_scores_mean.append(mean_pct)
  wine_scores_std.append(std_pct)
  model_names.append(name)

  print(f'--------- {name} on Wine Dataset ---------')
  print(results)
  print('Accuracy: %.2f%% (%.2f%%)' % (mean_pct, std_pct))
  print('------------------------------')