# IMPORT LIBRARIES

In [4]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score # root_mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

from skopt.space import Real, Integer
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# library for hyperparameter tuning
import optuna
from skopt import BayesSearchCV
from optimizers.PSO import particle_swarm_optimization

# library for save model
import joblib

import warnings
warnings.filterwarnings('ignore')

# GET DATA

In [5]:
games = pd.read_csv("../data/games_normalized_clean.csv")

# Models

In [6]:
games_labels = games["Estimated owners"]
games = games.drop("Estimated owners", axis = 1)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(games, games_labels, test_size = 0.2, random_state= 42)

# Models

## HistGradientBoosting

In [8]:
model = HistGradientBoostingClassifier(max_iter = 100, max_depth = 100, random_state=42)
model.fit(X_train, y_train)

HistGradientBoostingClassifier(max_depth=100, random_state=42)

In [9]:
y_predict = model.predict(X_test)

In [10]:
accuracy_score(y_test, y_predict)

0.6594035896361478

## CatBoost

In [11]:
model = CatBoostClassifier(iterations=1000, 
                           depth= 10, 
                           learning_rate=0.1, 
                           loss_function='MultiClass', 
                           cat_features=[],
                           random_seed=42,
                           verbose=100)

In [12]:
model.fit(X_train, y_train)

0:	learn: 1.2686582	total: 439ms	remaining: 7m 18s
100:	learn: 0.6374868	total: 38.5s	remaining: 5m 42s
200:	learn: 0.5747659	total: 1m 19s	remaining: 5m 14s
300:	learn: 0.5229620	total: 1m 54s	remaining: 4m 26s
400:	learn: 0.4826596	total: 2m 30s	remaining: 3m 44s
500:	learn: 0.4483201	total: 3m 6s	remaining: 3m 5s
600:	learn: 0.4178366	total: 3m 42s	remaining: 2m 27s
700:	learn: 0.3907541	total: 4m 23s	remaining: 1m 52s
800:	learn: 0.3668014	total: 4m 58s	remaining: 1m 14s
900:	learn: 0.3451251	total: 5m 34s	remaining: 36.8s
999:	learn: 0.3263820	total: 6m 10s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x2311a55e348>

In [13]:
y_pred = model.predict(X_test)

In [14]:
accuracy_score(y_test, y_pred)

0.6360081011243802

## Hyper-parameter Optimizer

## Bayesian-Search

### Gaussian Process

#### HistGradientBoosting

In [15]:
param_space = {
    'learning_rate': Real(0.01, 0.2),
    'max_iter': Integer(50, 500),
    'max_depth': Integer(3, 15),
    'min_samples_leaf': Integer(1, 20),
}

In [16]:
bayes_search = BayesSearchCV(
    estimator = HistGradientBoostingClassifier(),
    search_spaces = param_space,
    n_iter = 32, 
    scoring = 'accuracy',
    cv = 5,
    n_jobs = 4,
    verbose = 0,
    random_state = 42,
    optimizer_kwargs = {'base_estimator': 'GP'}
)

In [17]:
bayes_search.fit(X_train, y_train)

BayesSearchCV(cv=5, estimator=HistGradientBoostingClassifier(), n_iter=32,
              n_jobs=4, optimizer_kwargs={'base_estimator': 'GP'},
              random_state=42, scoring='accuracy',
              search_spaces={'learning_rate': Real(low=0.01, high=0.2, prior='uniform', transform='normalize'),
                             'max_depth': Integer(low=3, high=15, prior='uniform', transform='normalize'),
                             'max_iter': Integer(low=50, high=500, prior='uniform', transform='normalize'),
                             'min_samples_leaf': Integer(low=1, high=20, prior='uniform', transform='normalize')})

In [18]:
# Best parameters
print("Best score:", bayes_search.best_score_)
print("Best parameters:", bayes_search.best_params_)

Best score: 0.6562778409483683
Best parameters: OrderedDict([('learning_rate', 0.05079206891199923), ('max_depth', 3), ('max_iter', 393), ('min_samples_leaf', 15)])


In [19]:
# Evaluar el conjunto de datos de prueba
print("Test accuracy:", bayes_search.score(X_test, y_test))

Test accuracy: 0.6626161044765696


In [20]:
joblib.dump(bayes_search, './models/bayes_search_hist_gradient_boosting.joblib')

['./models/bayes_search_hist_gradient_boosting.joblib']

### Tree Structure Parzen

#### HistGradientBoosting

In [21]:
def objective(trial):
    max_iter = trial.suggest_int('max_iter', 100, 500)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1)
    max_depth = trial.suggest_int('max_depth', 3, 15) 
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 5)
    max_bins = trial.suggest_int('max_bins', 8, 64)

    model = HistGradientBoostingClassifier(
        max_iter=max_iter,
        learning_rate=learning_rate,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        max_bins=max_bins,
        random_state=42
    )
    
    score = cross_val_score(model, games, games_labels, cv=5, scoring='accuracy')
    
    return np.mean(score)

In [22]:
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler())

[I 2024-11-19 15:45:25,488] A new study created in memory with name: no-name-19a221e2-dcf7-4c61-bc43-2abc86e72337


In [23]:
# Ready to optimize
study.optimize(objective, n_trials=50)

[I 2024-11-19 15:46:18,964] Trial 0 finished with value: 0.6187291447217966 and parameters: {'max_iter': 483, 'learning_rate': 0.06615169772779768, 'max_depth': 3, 'min_samples_leaf': 5, 'max_bins': 33}. Best is trial 0 with value: 0.6187291447217966.
[I 2024-11-19 15:48:11,384] Trial 1 finished with value: 0.6265234855141201 and parameters: {'max_iter': 371, 'learning_rate': 0.023792707572074932, 'max_depth': 13, 'min_samples_leaf': 2, 'max_bins': 56}. Best is trial 1 with value: 0.6265234855141201.
[I 2024-11-19 15:49:09,982] Trial 2 finished with value: 0.6213551737205488 and parameters: {'max_iter': 303, 'learning_rate': 0.06354918105440405, 'max_depth': 11, 'min_samples_leaf': 3, 'max_bins': 55}. Best is trial 1 with value: 0.6265234855141201.
[I 2024-11-19 15:51:03,934] Trial 3 finished with value: 0.5868531687838658 and parameters: {'max_iter': 231, 'learning_rate': 0.015760934632558343, 'max_depth': 5, 'min_samples_leaf': 3, 'max_bins': 51}. Best is trial 1 with value: 0.626523

In [24]:
# Imprimir los mejores resultados
print("Mejores hiperparámetros:", study.best_params)
print("Mejor valor de la función objetivo (precisión):", study.best_value)

Mejores hiperparámetros: {'max_iter': 149, 'learning_rate': 0.011032974006294428, 'max_depth': 11, 'min_samples_leaf': 5, 'max_bins': 8}
Mejor valor de la función objetivo (precisión): 0.6421402727259398


## Bagging - Adaboost

In [25]:
base_estimator = DecisionTreeClassifier(max_depth = 10)

In [26]:
adaboost = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=100, random_state=42)

In [27]:
bagging_adaboost = BaggingClassifier(
    base_estimator = adaboost,
    n_estimators = 20,
    random_state = 42,
    n_jobs = 4
)

In [28]:
bagging_adaboost.fit(X_train, y_train)

BaggingClassifier(base_estimator=AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=10),
                                                    n_estimators=100,
                                                    random_state=42),
                  n_estimators=20, n_jobs=4, random_state=42)

In [29]:
# Predicción
y_pred = bagging_adaboost.predict(X_test)

In [30]:
# Calcular precisión
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6364271247992178


In [31]:
joblib.dump(bagging_adaboost, './models/bagging_adaboost.joblib')

['./models/bagging_adaboost.joblib']

## PSO

### Random Forest

In [32]:
def objective_function(hyperparameters):
    n_estimators, max_depth = int(hyperparameters[0]), int(hyperparameters[1])
    
    model = RandomForestClassifier(
        n_estimators = n_estimators, 
        max_depth = max_depth, 
        random_state = 42
    )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return -accuracy  # Minimize function value (negative accuracy)


In [33]:
# Parameters to optimize
bounds = [(10, 200), (1, 20)]  # Example of parameter bounds
dim = 2  # Quantity of parameters

In [34]:
best_hyperparameters, best_score = particle_swarm_optimization(objective_function, bounds, dim)
print("Mejores hiperparámetros:", best_hyperparameters)
print("Mejor precisión obtenida:", best_score)

Iteración 1/10, Mejor precisión: 0.6570989594245408
Iteración 2/10, Mejor precisión: 0.6571687967036804
Iteración 3/10, Mejor precisión: 0.6571687967036804
Iteración 4/10, Mejor precisión: 0.6571687967036804
Iteración 5/10, Mejor precisión: 0.6594035896361478
Iteración 6/10, Mejor precisión: 0.6594035896361478
Iteración 7/10, Mejor precisión: 0.6594035896361478
Iteración 8/10, Mejor precisión: 0.6596829387527062
Iteración 9/10, Mejor precisión: 0.659892450590125
Iteración 10/10, Mejor precisión: 0.6608701724980794
Mejores hiperparámetros: [78.65873191 14.66472286]
Mejor precisión obtenida: 0.6608701724980794


In [35]:
# Train the model with the best hyperparameters
best_n_estimators, best_max_depth = int(best_hyperparameters[0]), int(best_hyperparameters[1])

best_model = RandomForestClassifier(
    n_estimators=best_n_estimators, 
    max_depth=best_max_depth, 
    random_state=42
)

In [36]:
best_model.fit(X_train, y_train)

RandomForestClassifier(max_depth=14, n_estimators=78, random_state=42)

In [37]:
# Guardar el modelo entrenado
joblib.dump(best_model, './models/random_forest_pso.joblib')

['./models/random_forest_pso.joblib']

### HistGradientBoosting

In [38]:
def objective_function(hyperparameters):
    max_iter = int(hyperparameters[0])
    max_depth = int(hyperparameters[1])
    learning_rate = hyperparameters[2]
    min_samples_leaf = int(hyperparameters[3])
    max_bins = int(hyperparameters[4])
    
    model = HistGradientBoostingClassifier(
        max_iter=max_iter,
        learning_rate=learning_rate,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        max_bins=max_bins,
        random_state=42
    )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return -accuracy  # Minimize the function value (negative accuracy)

In [39]:
# Parámetros para PSO
bounds = [(100, 200), (3, 15), (0.01, 0.1), (1, 5), (8, 64)] 
dim = 5 

In [40]:
best_hyperparameters, best_score = particle_swarm_optimization(objective_function, bounds, dim)
print("Mejores hiperparámetros:", best_hyperparameters)
print("Mejor precisión obtenida:", best_score)

Iteración 1/10, Mejor precisión: 0.3299811439346323
Iteración 2/10, Mejor precisión: 0.6598226133109854
Iteración 3/10, Mejor precisión: 0.6640128500593617
Iteración 4/10, Mejor precisión: 0.6645017110133389
Iteración 5/10, Mejor precisión: 0.6645017110133389
Iteración 6/10, Mejor precisión: 0.6645017110133389
Iteración 7/10, Mejor precisión: 0.6645017110133389
Iteración 8/10, Mejor precisión: 0.6645715482924786
Iteración 9/10, Mejor precisión: 0.6645715482924786
Iteración 10/10, Mejor precisión: 0.6645715482924786
Mejores hiperparámetros: [1.70963685e+02 4.86906876e+00 1.00000000e-01 3.03569600e+00
 6.40000000e+01]
Mejor precisión obtenida: 0.6645715482924786


In [48]:
max_depth, max_iter, learning_rate, min_samples_leaf, max_bins = best_hyperparameters

In [51]:
hist_gradient_boosting_pso = HistGradientBoostingClassifier(
    max_iter= int(max_iter),
    learning_rate= learning_rate,
    max_depth= int(max_depth),
    min_samples_leaf = int(min_samples_leaf),
    max_bins= int(max_bins),
    random_state=42
)

In [52]:
hist_gradient_boosting_pso.fit(X_train, y_train)

HistGradientBoostingClassifier(max_bins=64, max_depth=170, max_iter=4,
                               min_samples_leaf=3, random_state=42)

In [55]:
joblib.dump(hist_gradient_boosting_pso, './models/hist_gradient_boosting_pso.joblib')

['./models/hist_gradient_boosting_pso.joblib']

# Zip Models

In [41]:
import zipfile

In [56]:
with zipfile.ZipFile('./models/models.zip', 'w') as zipf:
    zipf.write('./models/random_forest_pso.joblib')
    zipf.write('./models/bayes_search_hist_gradient_boosting.joblib')
    zipf.write('./models/hist_gradient_boosting_pso.joblib')
    zipf.write('./models/label_encoder.joblib')
    zipf.write('./models/pca_model.joblib')