# Bibliotecas

In [None]:
# Import required libraries

# linear algebra and data processing libraries
import numpy as np
import pandas as pd

# scikit-learn libraries
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

# Graphics Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

# Utils
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Preparação e Limpeza dos Dados

Nesta etapa, realizamos o processamento inicial do dataset `boardgames.csv`:

1.  **Carregamento:** Leitura dos dados brutos.
2.  **Seleção de Features:** Filtragem das colunas relevantes para a análise.
3.  **Engenharia de Atributos:** Criação da variável `rating_category`, que classifica os jogos com base na nota média (*average*).

In [14]:
# Load the raw board-game dataset from the current working directory.
games_df = pd.read_csv('boardgames.csv')

# Columns retained for the analysis; every other column of the CSV is dropped.
cols_to_keep = [
    'primary',
    'yearpublished',
    'minplayers',
    'maxplayers',
    'minplaytime',
    'minage',
    'boardgamecategory',
    'boardgamemechanic',
    'boardgamefamily',
    'boardgamedesigner',
    'boardgameartist',
    'boardgamepublisher',
    'usersrated',
    'bayesaverage',
    'playingtime',
    'averageweight',
    'average',
]

# Take an explicit copy so the column assignment below writes to an
# independent DataFrame instead of a selection of the original frame.
# Warnings are silenced globally in this notebook, so a chained-assignment
# warning raised here would otherwise go unnoticed.
games_df = games_df[cols_to_keep].copy()

# Bin edges and labels for the rating classes. With right=True and
# include_lowest=True the intervals are:
#   [0, 4] -> 'bad', (4, 6.2] -> 'mediocre', (6.2, 7.5] -> 'good',
#   (7.5, 10.1] -> 'excelent'
# NOTE(review): the label spelling 'excelent' is kept as-is because it is a
# runtime value that shows up in downstream reports.
bins = [0, 4, 6.2, 7.5, 10.1]
labels = ['bad', 'mediocre', 'good', 'excelent']

# Derive the categorical 'rating_category' column from the 'average' column.
# Values outside the bin range (or missing) become NaN.
games_df['rating_category'] = pd.cut(
    games_df['average'],
    bins=bins,
    labels=labels,
    right=True,
    include_lowest=True,
)

# Report the total number of rows loaded (no train/test split has been made
# at this point, despite the wording of the message).
print("Nr. rows - train: ", len(games_df))

Nr. rows - train:  21632


# Supervised Learning

In [None]:
# Numeric columns used for modelling. "bayesaverage" is included only so the
# target can be derived from it; it is removed from the model inputs below.
features = ["yearpublished","minplayers","maxplayers","playingtime","minage","usersrated","averageweight","bayesaverage"]

# Keep only these columns and drop every row with a missing value in any of
# them. This rebinds games_df, so columns outside `features` are no longer
# available on it after this line.
games_df = games_df[features].dropna()

# Same bin edges / labels as the rating categories: the intervals are
# [0, 4], (4, 6.2], (6.2, 7.5], (7.5, 10.1] (pd.cut closes bins on the right
# by default; include_lowest=True also closes the first one on the left).
bins   = [0, 4, 6.2, 7.5, 10.1]
labels = ['bad', 'mediocre', 'good', 'excelent']

# Classification target: the binned "bayesaverage" (not "average").
games_df["target_class"] = pd.cut(
    games_df["bayesaverage"],
    bins=bins,
    labels=labels,
    include_lowest=True
)

# Rows whose "bayesaverage" falls outside the bins get NaN and are discarded.
games_df = games_df.dropna(subset=["target_class"])

# The column the target was derived from must not be offered as a predictor.
X = games_df.drop(columns=["bayesaverage", "target_class"])
y = games_df["target_class"]

# 80/20 hold-out split, stratified so each class keeps its proportion in
# both partitions; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Baseline candidates. The linear model and the SVM are wrapped in a pipeline
# so the scaler is fitted on the training data only; the random forest is
# used without scaling.
models = {
    "LogReg": Pipeline([
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=3000))
    ]),
    "SVM": Pipeline([
        ("scaler", StandardScaler()),
        ("clf", SVC())
    ]),
    "RF": RandomForestClassifier(random_state=42)
}

# Fit each baseline and report its hold-out performance.
# Rows of the report and rows/columns of the confusion matrix follow the
# sorted label order (bad, excelent, good, mediocre), not the ordinal order.
for name, model in models.items():
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    print("\n===", name, "===")
    print("Accuracy:", accuracy_score(y_test, pred))
    # zero_division=0: a class that is never predicted has an undefined
    # precision; state the fallback explicitly instead of relying on the
    # default "warn" mode, whose warning is suppressed in this notebook.
    print(classification_report(y_test, pred, digits=3, zero_division=0))
    print(confusion_matrix(y_test, pred))

# Hyper-parameter grid for the random forest (2 * 3 * 2 * 2 = 24 candidates).
param_grid = {
    "n_estimators": [200, 400],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2]
}

# 5-fold cross-validated grid search on the training partition only.
# Candidates are ranked by macro-averaged F1, which weights every class
# equally regardless of its support; n_jobs=-1 uses all available cores.
gs = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    n_jobs=-1,
    scoring="f1_macro"
)
gs.fit(X_train, y_train)

# best_estimator_ is the winning configuration refitted on all of X_train.
best = gs.best_estimator_
pred = best.predict(X_test)

# Hold-out evaluation of the tuned random forest.
print("\n=== Best RF (GridSearch) ===")
print("Best params:", gs.best_params_)
print(classification_report(y_test, pred, digits=3, zero_division=0))
print(confusion_matrix(y_test, pred))



=== LogReg ===
Accuracy: 0.9579385255373237
              precision    recall  f1-score   support

         bad      0.000     0.000     0.000         2
    excelent      0.667     0.286     0.400        21
        good      0.851     0.562     0.677       336
    mediocre      0.964     0.995     0.980      3968

    accuracy                          0.958      4327
   macro avg      0.621     0.461     0.514      4327
weighted avg      0.954     0.958     0.953      4327

[[   0    0    0    2]
 [   0    6   15    0]
 [   0    3  189  144]
 [   0    0   18 3950]]

=== SVM ===
Accuracy: 0.9657961636237578
              precision    recall  f1-score   support

         bad      0.000     0.000     0.000         2
    excelent      0.900     0.429     0.581        21
        good      0.860     0.676     0.757       336
    mediocre      0.973     0.994     0.983      3968

    accuracy                          0.966      4327
   macro avg      0.683     0.524     0.580      4327
weigh