In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor


# Read the sales table and immediately discard the columns that are not used
# as model inputs: the four regional sales columns and the game title.
# NOTE(review): presumably the regional figures are dropped because they add
# up to the Global_Sales target used further down -- confirm.
unused_columns = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Name']
with open('./vgsales_extended.csv', mode='r') as file:
    df = pd.read_csv(file).drop(columns=unused_columns)

def _most_frequent(frame, column, limit):
    """Return the `limit` values of `column` with the most rows, most frequent first."""
    rows_per_value = frame.groupby(column).size()
    return rows_per_value.sort_values(ascending=False).head(limit).index.to_list()


# Rank publishers, platforms and developers by row count and echo each list.
# The lists are reused below to decide which category values keep their own label.
top_publishers = _most_frequent(df, 'Publisher', 10)
for publisher in top_publishers:
    print(f'-) {publisher}')

top_platforms = _most_frequent(df, 'Platform', 10)
for platform in top_platforms:
    print(f'-) {platform}')

top_developers = _most_frequent(df, 'Developer', 20)
for developer in top_developers:
    print(f'-) {developer}')

# Remove incomplete rows BEFORE bucketing rare categories.
# `Series.where(cond, 'other')` replaces every entry whose condition is False,
# and a missing value is never "in" the top-N list, so bucketing first would
# relabel missing Publisher/Developer entries as 'other' and the dropna on
# those two columns would silently do nothing.
df.dropna(subset=['Year_of_Release', 'Publisher', 'User_Count', 'Developer', 'Rating'], inplace=True)

# Keep the most frequent values of each high-cardinality column and merge
# everything else into a single 'other' label.
for column, frequent_values in (('Platform', top_platforms),
                                ('Publisher', top_publishers),
                                ('Developer', top_developers)):
    df[column] = df[column].where(df[column].isin(frequent_values), 'other')

# Split off the regression target; `df` keeps only the feature columns.
y = np.array(df['Global_Sales'])
df.drop(columns=['Global_Sales'], inplace=True)

# Columns stored as int64/float64 are treated as numeric; every other column
# is treated as categorical and one-hot encoded.
# NOTE(review): a numeric-looking column stored as text (object dtype) would
# end up in the categorical group -- confirm the dtypes of the loaded frame.
numeric_features = [j for j in df.columns if df[j].dtype in [np.int64, np.float64]]
categorical_features = [col for col in df.columns if col not in numeric_features]

pipe_numeric = Pipeline(steps=[('imputation', SimpleImputer(strategy='median')),
                               ('scaling', MinMaxScaler())])
# handle_unknown='ignore': a category that only occurs in the held-out rows is
# encoded as all zeros instead of raising when the test rows are transformed.
pipe_categorical = Pipeline(steps=[('encoding', OneHotEncoder(handle_unknown='ignore'))])

preprocessor_X = ColumnTransformer(transformers=[
    ('numerical', pipe_numeric, numeric_features),
    ('categorical', pipe_categorical, categorical_features)])

# Split the raw rows first so that the imputation medians, min/max ranges,
# one-hot vocabulary and target scaling are all learned from the training rows
# only; fitting them on the full table leaks test-set statistics into training.
train_df, test_df, train_y_raw, test_y_raw = train_test_split(df, y, random_state=42)

train_X = preprocessor_X.fit_transform(train_df)
test_X = preprocessor_X.transform(test_df)
# Full design matrix, encoded with the training-fitted preprocessor.
X = preprocessor_X.transform(df)

# RobustScaler works on 2-D input, hence the reshape; the result is flattened
# again because the regressors expect a 1-D target (a column vector makes
# scikit-learn emit DataConversionWarning on every fit).
transformer_y = RobustScaler()
train_y = transformer_y.fit_transform(train_y_raw.reshape(-1, 1)).ravel()
test_y = transformer_y.transform(test_y_raw.reshape(-1, 1)).ravel()
y = transformer_y.transform(y.reshape(-1, 1)).ravel()

-) Electronic Arts
-) Activision
-) Namco Bandai Games
-) Ubisoft
-) Konami Digital Entertainment
-) THQ
-) Nintendo
-) Sony Computer Entertainment
-) Sega
-) Take-Two Interactive
-) PS2
-) DS
-) PS3
-) Wii
-) X360
-) PSP
-) PS
-) PC
-) XB
-) GBA
-) Ubisoft
-) EA Sports
-) EA Canada
-) Konami
-) Capcom
-) EA Tiburon
-) Electronic Arts
-) Ubisoft Montreal
-) Visual Concepts
-) Omega Force
-) Traveller's Tales
-) Vicarious Visions
-) Activision
-) TT Games
-) Nintendo
-) THQ
-) Namco
-) Codemasters
-) Artificial Mind and Movement
-) Midway


In [None]:
# Hyper-parameter search for the random forest.
# The criterion names 'mse'/'mae' were renamed to 'squared_error'/
# 'absolute_error' in scikit-learn 1.0 and the old spellings are rejected from
# 1.2 onwards, so the current names are used here (requires scikit-learn >= 1.0).
param_grid_forest = {'max_depth': [3, 5, 10],
                     'criterion': ['squared_error', 'absolute_error'],
                     'n_estimators': [5, 10, 20, 100]}
grid_forest = GridSearchCV(RandomForestRegressor(random_state=42), param_grid_forest, n_jobs=-1)
# np.ravel: the regressor expects a 1-D target; this is a no-op when train_y
# is already flat and avoids DataConversionWarning when it is a column vector.
grid_forest.fit(train_X, np.ravel(train_y))
# Mean cross-validated score of the best parameter combination.
grid_forest.best_score_

In [None]:
# Hyper-parameter search for gradient boosting.
# The loss name 'ls' was renamed to 'squared_error' in scikit-learn 1.0 and the
# old spelling is rejected from 1.2 onwards (requires scikit-learn >= 1.0).
param_grid_gradient = {'max_depth': [3, 5, 10],
                       'loss': ['squared_error', 'huber'],
                       'n_estimators': [5, 10, 50]}
grid_gradient = GridSearchCV(GradientBoostingRegressor(random_state=42), param_grid_gradient, n_jobs=-1)
# np.ravel: the regressor expects a 1-D target; this is a no-op when train_y
# is already flat and avoids DataConversionWarning when it is a column vector.
grid_gradient.fit(train_X, np.ravel(train_y))
# Mean cross-validated score of the best parameter combination.
grid_gradient.best_score_

In [None]:
# Hyper-parameter search for AdaBoost, trying both a shallow tree and the best
# random forest found by `grid_forest` as the boosted base learner.
#
# The base-learner parameter is called `estimator` from scikit-learn 1.2 on and
# `base_estimator` in older releases (removed in 1.4); use whichever name the
# installed version exposes so the grid key is always valid.
ada_base_param = ('estimator'
                  if 'estimator' in AdaBoostRegressor().get_params(deep=False)
                  else 'base_estimator')
# Fixed: `max_dept` -> `max_depth` (TypeError on construction) and
# `best_estimator` -> `best_estimator_` (AttributeError; fitted attributes
# carry a trailing underscore).
param_grid_ada = {ada_base_param: [DecisionTreeRegressor(max_depth=3), grid_forest.best_estimator_],
                  'loss': ['linear', 'square'],
                  'n_estimators': [5, 10, 50]}
grid_ada = GridSearchCV(AdaBoostRegressor(random_state=42), param_grid_ada, n_jobs=-1)
# np.ravel: the regressor expects a 1-D target; this is a no-op when train_y
# is already flat and avoids DataConversionWarning when it is a column vector.
grid_ada.fit(train_X, np.ravel(train_y))
# Mean cross-validated score of the best parameter combination.
grid_ada.best_score_

In [None]:
# Report the held-out score of each search's refitted best model, in the order
# forest, gradient boosting, AdaBoost. `score` on a regressor returns R^2.
for search in (grid_forest, grid_gradient, grid_ada):
    estimator = search.best_estimator_
    test_score = estimator.score(test_X, test_y)
    print(test_score)