In [5]:
# XGBoost classification - halvinggridsearch

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

from sklearn.preprocessing import LabelEncoder

# Numeric columns that are standardised. Declared once so the transform applied
# to the training split and to the test split can never drift apart.
NUMERIC_COLUMNS = [
    'added', 'screenshots_count',
    'achievements_count', 'reddit_count', 'twitch_count', 'youtube_count',
    'ratings_count', 'suggestions_count', 'additions_count',
    'game_series_count', 'reviews_count',
    'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales',
]

data = pd.read_json('./content/video_games_newer_all_data_with_rawg_final_for_train_v2.json')

# Keep only rows whose sales figures are at or below the per-region caps.
# Rows with a missing value in any of these columns are dropped as well,
# because a comparison against NaN is False.
within_caps = (
    (data.NA_Sales <= 6)
    & (data.EU_Sales <= 2)
    & (data.JP_Sales <= 0.7)
    & (data.Other_Sales <= 2)
    & (data.Global_Sales <= 6)
)
data = data[within_caps]
data = data.drop(columns=['Year_of_Release', 'Global_Sales'])

scaler = StandardScaler()

# Binarise the target: 1 when User_Score >= 7.5, 0 when it is below 7.5.
# Missing scores satisfy neither comparison and are left as NaN.
user_score = data['User_Score']
data['User_Score'] = np.where(
    user_score >= 7.5, 1, np.where(user_score < 7.5, 0, user_score)
)

training, test = train_test_split(data, test_size=0.25, random_state=32)
# Work on explicit copies so the in-place scaling below writes to independent
# frames rather than to slices of `data` (avoids SettingWithCopyWarning).
training = training.copy()
test = test.copy()

features = training.drop(['User_Score'], axis=1).columns

# Scale BEFORE slicing out the feature frames. Selecting `training[features]`
# yields a copy, so scaling afterwards (as was done previously) never reached
# the data that was actually passed to the model.
# The scaler is fitted on the training split only and re-used for the test split.
training[NUMERIC_COLUMNS] = scaler.fit_transform(training[NUMERIC_COLUMNS])
test[NUMERIC_COLUMNS] = scaler.transform(test[NUMERIC_COLUMNS])

training_features, training_labels = training[features], training['User_Score']
test_features, test_labels = test[features], test['User_Score']

# Hyper-parameter grid; the `classifier__` prefix routes each entry to the
# pipeline step named 'classifier'.
# NOTE(review): XGBoost's scikit-learn wrapper documents this parameter as
# `reg_lambda` (`lambda` is the native alias) - consider renaming the key.
params = {
    'classifier__max_depth': [3, 6, 10],
    'classifier__learning_rate': [0.001, 0.005, 0.01],
    'classifier__n_estimators': [100, 500, 1000],
    'classifier__colsample_bytree': [0.1, 0.3, 0.7],
    'classifier__lambda': [1],
    'classifier__gamma': [0]
}

xgb_pipeline = Pipeline([('classifier', XGBClassifier())])
clf = HalvingGridSearchCV(
    estimator=xgb_pipeline,
    param_grid=params,
    scoring='f1_macro',
    # Successive halving subsamples rows at every iteration; fixing the seed
    # makes the selected parameters reproducible between runs.
    random_state=32,
    verbose=1
)
clf.fit(training_features, training_labels)
print("Best parameters:", clf.best_params_)
# best_score_ is expressed in the `scoring` metric (macro-averaged F1),
# not accuracy, so label it accordingly.
print("Best CV f1_macro: ", clf.best_score_)
# clf.score() uses the same `scoring` metric on the held-out test split.
f1_macro = clf.score(test_features, test_labels)
print(f'Score {f1_macro}')

n_iterations: 5
n_required_iterations: 5
n_possible_iterations: 5
min_resources_: 32
max_resources_: 2664
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 81
n_resources: 32
Fitting 5 folds for each of 81 candidates, totalling 405 fits
----------
iter: 1
n_candidates: 27
n_resources: 96
Fitting 5 folds for each of 27 candidates, totalling 135 fits
----------
iter: 2
n_candidates: 9
n_resources: 288
Fitting 5 folds for each of 9 candidates, totalling 45 fits
----------
iter: 3
n_candidates: 3
n_resources: 864
Fitting 5 folds for each of 3 candidates, totalling 15 fits
----------
iter: 4
n_candidates: 1
n_resources: 2592
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best parameters: {'classifier__colsample_bytree': 0.3, 'classifier__gamma': 0, 'classifier__lambda': 1, 'classifier__learning_rate': 0.005, 'classifier__max_depth': 3, 'classifier__n_estimators': 1000}
Best accuracy:  0.6395111697599114
Score 0.6274433838880681
