# IMPORT LIBRARIES

In [176]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score # root_mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

from skopt.space import Real, Integer
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# library for hyperparameter tuning
import optuna
from skopt import BayesSearchCV
from optimizers.PSO import particle_swarm_optimization

# library for save model
import joblib

import warnings
warnings.filterwarnings('ignore')

# GET DATA

In [4]:
games = pd.read_csv("../data/games_normalized_clean.csv")

# Models

In [6]:
games_labels = games["Estimated owners"]
games = games.drop("Estimated owners", axis = 1)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(games, games_labels, test_size = 0.2, random_state= 42)

# Models

## HistGradientBoosting

In [49]:
model = HistGradientBoostingClassifier(max_iter = 100, max_depth = 100, random_state=42)
model.fit(X_train, y_train)

HistGradientBoostingClassifier(max_depth=100, random_state=42)

In [50]:
y_predict = model.predict(X_test)

In [51]:
accuracy_score(y_test, y_predict)

0.6611495216146379

## CatBoost

In [100]:
model = CatBoostClassifier(iterations=1000, 
                           depth= 10, 
                           learning_rate=0.1, 
                           loss_function='MultiClass', 
                           cat_features=[],
                           random_seed=42,
                           verbose=100)

In [101]:
model.fit(X_train, y_train)

0:	learn: 1.2708429	total: 353ms	remaining: 5m 52s
100:	learn: 0.6374149	total: 34.9s	remaining: 5m 10s
200:	learn: 0.5767463	total: 1m 17s	remaining: 5m 7s
300:	learn: 0.5279396	total: 1m 52s	remaining: 4m 21s
400:	learn: 0.4857458	total: 2m 29s	remaining: 3m 43s
500:	learn: 0.4529652	total: 3m 6s	remaining: 3m 5s
600:	learn: 0.4242904	total: 3m 40s	remaining: 2m 26s
700:	learn: 0.3981516	total: 4m 16s	remaining: 1m 49s
800:	learn: 0.3756271	total: 4m 52s	remaining: 1m 12s
900:	learn: 0.3552434	total: 5m 26s	remaining: 35.9s
999:	learn: 0.3365368	total: 6m 1s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1f8f70c7dc8>

In [102]:
y_pred = model.predict(X_test)

In [103]:
accuracy_score(y_test, y_pred)

0.6354494028912634

## Hyper-parameter Optimizer

## Bayesian-Search

### Gaussian Process

#### HistGradientBoosting

In [114]:
param_space = {
    'learning_rate': Real(0.01, 0.2),
    'max_iter': Integer(50, 500),
    'max_depth': Integer(3, 15),
    'min_samples_leaf': Integer(1, 20),
}

In [118]:
bayes_search = BayesSearchCV(
    estimator = HistGradientBoostingClassifier(),
    search_spaces = param_space,
    n_iter = 32, 
    scoring = 'accuracy',
    cv = 5,
    n_jobs = 4,
    verbose = 0,
    random_state = 42,
    optimizer_kwargs = {'base_estimator': 'GP'}
)

In [119]:
bayes_search.fit(X_train, y_train)

BayesSearchCV(cv=5, estimator=HistGradientBoostingClassifier(), n_iter=32,
              n_jobs=4, optimizer_kwargs={'base_estimator': 'GP'},
              random_state=42, scoring='accuracy',
              search_spaces={'learning_rate': Real(low=0.01, high=0.2, prior='uniform', transform='normalize'),
                             'max_depth': Integer(low=3, high=15, prior='uniform', transform='normalize'),
                             'max_iter': Integer(low=50, high=500, prior='uniform', transform='normalize'),
                             'min_samples_leaf': Integer(low=1, high=20, prior='uniform', transform='normalize')})

In [120]:
# Best parameters
print("Best score:", bayes_search.best_score_)
print("Best parameters:", bayes_search.best_params_)

Best score: 0.6560683406962067
Best parameters: OrderedDict([('learning_rate', 0.01), ('max_depth', 5), ('max_iter', 500), ('min_samples_leaf', 5)])


In [121]:
# Evaluar el conjunto de datos de prueba
print("Test accuracy:", bayes_search.score(X_test, y_test))

Test accuracy: 0.6631049654305469


In [128]:
joblib.dump(bayes_search, './models/bayes_search_hist_gradient_boosting.joblib')

['./models/bayes_search_hist_gradient_boosting.joblib']

### Tree Structure Parzen

#### HistGradientBoosting

In [None]:
def objective(trial):
    max_iter = trial.suggest_int('max_iter', 100, 500)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1)
    max_depth = trial.suggest_int('max_depth', 3, 15) 
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 5)
    max_bins = trial.suggest_int('max_bins', 8, 64)

    model = HistGradientBoostingClassifier(
        max_iter=max_iter,
        learning_rate=learning_rate,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        max_bins=max_bins,
        random_state=42
    )
    
    score = cross_val_score(model, games, games_labels, cv=5, scoring='accuracy')
    
    return np.mean(score)

In [147]:
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler())

[I 2024-11-13 16:58:59,664] A new study created in memory with name: no-name-5fe600e1-4e86-4c76-9c27-cab9a1b1fbe1


In [148]:
# Ready to optimize
study.optimize(objective, n_trials=50)

[I 2024-11-13 17:00:24,036] Trial 0 finished with value: 0.5844924863272235 and parameters: {'max_iter': 198, 'learning_rate': 0.06885694936128044, 'max_depth': 4, 'min_samples_leaf': 1, 'max_bins': 58}. Best is trial 0 with value: 0.5844924863272235.
[I 2024-11-13 17:01:31,796] Trial 1 finished with value: 0.6216345755150383 and parameters: {'max_iter': 125, 'learning_rate': 0.06484953367802157, 'max_depth': 9, 'min_samples_leaf': 1, 'max_bins': 45}. Best is trial 1 with value: 0.6216345755150383.
[I 2024-11-13 17:02:26,942] Trial 2 finished with value: 0.613365204652152 and parameters: {'max_iter': 409, 'learning_rate': 0.08905679339440613, 'max_depth': 4, 'min_samples_leaf': 1, 'max_bins': 56}. Best is trial 1 with value: 0.6216345755150383.
[I 2024-11-13 17:03:19,103] Trial 3 finished with value: 0.6304626172965586 and parameters: {'max_iter': 110, 'learning_rate': 0.0682204173230295, 'max_depth': 10, 'min_samples_leaf': 4, 'max_bins': 11}. Best is trial 3 with value: 0.63046261729

In [149]:
# Imprimir los mejores resultados
print("Mejores hiperparámetros:", study.best_params)
print("Mejor valor de la función objetivo (precisión):", study.best_value)

Mejores hiperparámetros: {'max_iter': 304, 'learning_rate': 0.010296400176459743, 'max_depth': 5, 'min_samples_leaf': 5, 'max_bins': 14}
Mejor valor de la función objetivo (precisión): 0.6387458785494109


## Bagging - Adaboost

In [129]:
base_estimator = DecisionTreeClassifier(max_depth = 10)

In [130]:
adaboost = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=100, random_state=42)

In [131]:
bagging_adaboost = BaggingClassifier(
    base_estimator = adaboost,
    n_estimators = 20,
    random_state = 42,
    n_jobs = 4
)

In [132]:
bagging_adaboost.fit(X_train, y_train)

BaggingClassifier(base_estimator=AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=10),
                                                    n_estimators=100,
                                                    random_state=42),
                  n_estimators=20, n_jobs=4, random_state=42)

In [133]:
# Predicción
y_pred = bagging_adaboost.predict(X_test)

In [134]:
# Calcular precisión
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6390809414065228


In [135]:
joblib.dump(bagging_adaboost, './models/bagging_adaboost.joblib')

['./models/bagging_adaboost.joblib']

## PSO

### Random Forest

In [177]:
def objective_function(hyperparameters):
    n_estimators, max_depth = int(hyperparameters[0]), int(hyperparameters[1])
    
    model = RandomForestClassifier(
        n_estimators = n_estimators, 
        max_depth = max_depth, 
        random_state = 42
    )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return -accuracy  # Minimize function value (negative accuracy)


In [178]:
# Parameters to optimize
bounds = [(10, 200), (1, 20)]  # Example of parameter bounds
dim = 2  # Quantity of parameters

In [179]:
best_hyperparameters, best_score = particle_swarm_optimization(objective_function, bounds, dim)
print("Mejores hiperparámetros:", best_hyperparameters)
print("Mejor precisión obtenida:", best_score)

Iteración 1/10, Mejor precisión: 0.6553530274460507
Iteración 2/10, Mejor precisión: 0.6554228647251903
Iteración 3/10, Mejor precisión: 0.6594035896361478
Iteración 4/10, Mejor precisión: 0.6594035896361478
Iteración 5/10, Mejor precisión: 0.659892450590125
Iteración 6/10, Mejor precisión: 0.659892450590125
Iteración 7/10, Mejor precisión: 0.6601717997066834
Iteración 8/10, Mejor precisión: 0.6603813115441023
Iteración 9/10, Mejor precisión: 0.6604511488232419
Iteración 10/10, Mejor precisión: 0.6604511488232419
Mejores hiperparámetros: [119.32477184  16.38262364]
Mejor precisión obtenida: 0.6604511488232419


In [180]:
# Train the model with the best hyperparameters
best_n_estimators, best_max_depth = int(best_hyperparameters[0]), int(best_hyperparameters[1])

best_model = RandomForestClassifier(
    n_estimators=best_n_estimators, 
    max_depth=best_max_depth, 
    random_state=42
)

In [181]:
best_model.fit(X_train, y_train)

RandomForestClassifier(max_depth=16, n_estimators=119, random_state=42)

In [182]:
# Guardar el modelo entrenado
joblib.dump(best_model, './models/random_forest_pso.joblib')

['./models/random_forest_pso.joblib']

### HistGradientBoosting

In [183]:
def objective_function(hyperparameters):
    max_iter = int(hyperparameters[0])
    max_depth = int(hyperparameters[1])
    learning_rate = hyperparameters[2]
    min_samples_leaf = int(hyperparameters[3])
    max_bins = int(hyperparameters[4])
    
    model = HistGradientBoostingClassifier(
        max_iter=max_iter,
        learning_rate=learning_rate,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        max_bins=max_bins,
        random_state=42
    )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return -accuracy  # Minimize the function value (negative accuracy)

In [184]:
# Parámetros para PSO
bounds = [(100, 200), (3, 15), (0.01, 0.1), (1, 5), (8, 64)] 
dim = 5 

In [None]:
best_hyperparameters, best_score = particle_swarm_optimization(objective_function, bounds, dim)
print("Mejores hiperparámetros:", best_hyperparameters)
print("Mejor precisión obtenida:", best_score)