In [None]:
from sklearn.model_selection import train_test_split
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

In [None]:
# Read the sales table from the CSV next to the notebook and preview the top rows.
games_dataset = pd.read_csv(filepath_or_buffer='vgsales.csv')
games_dataset.head()

# Analyzing the dataset

In [None]:
# Number of distinct values in each categorical column
# (a missing value counts as one distinct value).
for categorical_column in ('Genre', 'Platform', 'Publisher'):
    distinct_values = games_dataset[categorical_column].unique()
    print(f'{categorical_column}:', len(distinct_values))

In [None]:
# Box plot of the release years in the raw data
games_dataset['Year'].plot(kind='box')

In [None]:
# Summary statistics of the dataset as loaded, before any columns or rows are removed.
games_dataset.describe()

In [None]:
# Correlation before normalization
#
# Only numeric columns can be correlated. They are selected explicitly because
# the frame still contains non-numeric columns at this point, and on
# pandas >= 2.0 DataFrame.corr() raises on those instead of skipping them.
correlations = games_dataset.select_dtypes(include='number').corr()

# The figure's side length scales with the total number of dataset columns.
n_columns = len(games_dataset.columns)
fig, ax = plt.subplots(figsize=(n_columns, n_columns))

colormap = sns.color_palette("BrBG", 10)

# Draw on the axes created above rather than relying on the implicit current axes.
sns.heatmap(correlations,
    cmap=colormap,
    annot=True,
    fmt=".2f",
    ax=ax)

plt.show()

# Deleting unnecessary columns

In [None]:
# Remove the Name and Rank columns together with the per-region sales columns
# in a single in-place drop, then preview what is left.
columns_to_remove = ['Name', 'Rank', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']
games_dataset.drop(columns=columns_to_remove, inplace=True)
games_dataset.head()

In [None]:
# Dropping unnecessarily scarce data ( insignificant for model )
#
# Keep only rows whose Year is known and lies in the inclusive range below.
# A missing Year (NaN) never satisfies `between`, so those rows are removed as
# well instead of reaching the models with an undefined year.
MIN_YEAR = 1995
MAX_YEAR = 2019

games_dataset = games_dataset[games_dataset['Year'].between(MIN_YEAR, MAX_YEAR)]

In [None]:
# Box plot of the release years after the year filter
games_dataset['Year'].plot(kind='box')

# Normalizing useful data to int types

In [None]:
# Function to convert useful data to int types in order

def convert_to_int_range(df, column_name):
    """Encode one categorical column as consecutive integers starting at 1.

    Codes follow the order in which each distinct value first appears in
    ``df[column_name]``. Missing values (NaN/None) are treated as a single
    category of their own and receive a code like any other value.

    Only ``column_name`` is rewritten. Every other column is returned
    unchanged, so values elsewhere that happen to equal one of this column's
    categories (or that are missing) are not overwritten.

    Args:
        df: Source DataFrame; it is not modified.
        column_name: Label of the column to encode.

    Returns:
        A copy of ``df`` in which ``column_name`` holds int64 codes.

    Raises:
        KeyError: If ``column_name`` is not a column of ``df``.
    """
    missing_key = object()  # stand-in dictionary key, because NaN != NaN
    code_by_value = {}
    codes = []

    for value in df[column_name]:
        key = missing_key if pd.isna(value) else value
        # setdefault hands out the next free code the first time a value is seen.
        codes.append(code_by_value.setdefault(key, len(code_by_value) + 1))

    encoded = df.copy()
    # Assign positionally (plain array) so no index alignment takes place.
    encoded[column_name] = np.asarray(codes, dtype='int64')
    return encoded

In [None]:
# Integer-encode the categorical columns one after another (order preserved),
# then renumber the rows from 0 and preview the result.
for categorical_column in ('Platform', 'Publisher', 'Genre'):
    games_dataset = convert_to_int_range(games_dataset, categorical_column)

games_dataset = games_dataset.reset_index(drop=True)
games_dataset.head()

In [None]:
# Correlation after normalization

# Square figure whose side length equals the number of columns.
n_columns = len(games_dataset.columns)
fig, ax = plt.subplots(figsize=(n_columns, n_columns))

correlations = games_dataset.corr()
colormap = sns.color_palette("BrBG", 10)

# `ax` is the axes created just above, i.e. the current axes.
sns.heatmap(data=correlations, annot=True, fmt=".2f", cmap=colormap, ax=ax)

plt.show()

# Preparing and training models

In [None]:
# Features: every column except the target. Target: the Global_Sales column.
X = games_dataset.drop(columns=['Global_Sales']).to_numpy()
y = games_dataset['Global_Sales'].to_numpy()

In [None]:
# Split the dataset into training data and test data (20% held out, fixed seed)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=12345,
    test_size=0.2,
)

In [None]:
# Function for training and evaluating models

def train_model(classifier, feature_vector_train, label, feature_vector_valid,
                label_valid=None, model_path='datagames.pickle'):
    """Fit a model, pickle it to disk and score it on a validation set.

    Args:
        classifier: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        feature_vector_train: Training features.
        label: Training targets.
        feature_vector_valid: Validation features to predict on.
        label_valid: True targets for ``feature_vector_valid``. When omitted,
            the notebook-level ``y_test`` is used, which keeps existing
            four-argument calls working.
        model_path: File the fitted estimator is pickled to. Each call
            overwrites whatever was stored at that path before.

    Returns:
        A two-element list ``[mean squared error, mean absolute error]`` of
        the predictions against ``label_valid``.
    """
    if label_valid is None:
        # Backward-compatible fallback to the global test labels.
        label_valid = y_test

    classifier.fit(feature_vector_train, label)

    with open(model_path, 'wb') as handle:
        pickle.dump(classifier, handle)

    predictions = classifier.predict(feature_vector_valid)

    # scikit-learn metrics take their arguments as (y_true, y_pred).
    return [
        metrics.mean_squared_error(label_valid, predictions),
        metrics.mean_absolute_error(label_valid, predictions),
    ]

In [None]:
# Model 1: linear regression
#
# Despite its name, `accuracy` holds the [mse, mae] pair returned by train_model.
linear_regression = linear_model.LinearRegression()
accuracy = train_model(linear_regression, X_train, y_train, X_test)

# Start the per-model score table used for the comparison plot.
accuracy_compare = {'LR': accuracy}
print("LR: ", accuracy)

In [None]:
# Model 2: random forest with 100 trees and a fixed seed
regressor = RandomForestRegressor(random_state=0, n_estimators=100)
accuracy = train_model(regressor, X_train, y_train, X_test)

# Record the [mse, mae] pair under this model's label.
accuracy_compare.update({'random forrest tree': accuracy})
print('random forrest tree', accuracy)

# Comparing models

In [None]:
# One column per model, one row per error metric, drawn as grouped bars.
df_compare = pd.DataFrame(data=accuracy_compare, index=['mse', 'mae'])
df_compare.plot.bar()

# Fine tuning

In [None]:
# Model 3: same random forest, grown to 300 trees
regressor = RandomForestRegressor(random_state=0, n_estimators=300)
accuracy = train_model(regressor, X_train, y_train, X_test)

# Record the [mse, mae] pair under this model's label.
accuracy_compare.update({'random forrest tree improved': accuracy})
print('random forrest tree improved', accuracy)

# Model evaluation

In [None]:
# R^2 of the current `regressor` on the held-out split
# (the same quantity a scikit-learn regressor's .score() reports).
holdout_predictions = regressor.predict(X_test)
metrics.r2_score(y_test, holdout_predictions)

# Manual test

In [None]:
# Input format: one row per game, given as [Platform, Year, Genre, Publisher]
# using the integer codes produced by the encoding step.
manual_sample = [[3, 2019, 2, 1]]
regressor.predict(manual_sample)

In [None]:
# Summary statistics of the dataset after the column drops, year filter and integer encoding.
games_dataset.describe()