In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn import utils

In [None]:
from Normalizer import Normalizer
from DataManager import DataManager

In [None]:
# Load the raw video-game sales table through the project's DataManager helper.
# NOTE(review): later cells call .head/.corr/.drop on the result, so get_data
# presumably returns a pandas DataFrame — confirm against DataManager.
Manager = DataManager()

games_dataset = Manager.get_data('vgsales.csv')
# Preview the first five rows.
games_dataset.head(5)

In [None]:
# Inspect the 'Genre' column; get_unique_col presumably returns its distinct values.
Manager.get_unique_col(games_dataset, 'Genre')

In [None]:
# Inspect the 'Platform' column; get_unique_col presumably returns its distinct values.
Manager.get_unique_col(games_dataset, 'Platform')

In [None]:
# Size of whatever get_unique_col returns for 'Publisher' — presumably the number
# of distinct publishers (relevant because this column is one-hot encoded later).
len(Manager.get_unique_col(games_dataset, 'Publisher'))

In [None]:
# Box plot of the 'Year' column on the unfiltered data (the year filter is applied in a later cell).
games_dataset['Year'].plot.box()


In [None]:
# Print a summary of the frame via the project helper.
# NOTE(review): presumably dtypes / non-null counts — confirm in DataManager.show_df_info.
Manager.show_df_info(games_dataset)

In [None]:
# Correlation before normalization
#
# Only numeric columns can be correlated. At this point the frame has not been
# one-hot encoded yet, so it presumably still holds text columns (e.g. Name,
# Genre); DataFrame.corr() raises on those with pandas >= 2.0, so restrict the
# input to numeric dtypes explicitly.
numeric_columns = games_dataset.select_dtypes(include='number')
correlations = numeric_columns.corr()

# Size the square figure by the number of columns actually plotted.
side = len(correlations.columns)
fig, ax = plt.subplots(figsize=(side, side))

colormap = sns.color_palette("BrBG", 10)

# Draw on the axes created above instead of relying on the implicit current axes.
sns.heatmap(correlations,
    cmap=colormap,
    annot=True,
    fmt=".2f",
    ax=ax)

plt.show()

# Deleting unnecessary columns

In [None]:
# Rename 'Platform' to 'Device' and drop the columns that are not used as model inputs.
# NOTE(review): the return values are discarded and later cells read
# games_dataset['Device'], so both helpers presumably mutate the frame in place —
# confirm in DataManager. If so, re-running this cell on the mutated frame may fail.

Manager.rename_cols(games_dataset, ['Platform'], ['Device'])

# Name/Rank plus the per-region sales columns are removed; Global_Sales is kept
# and becomes the target further down.
Manager.drop_cols(games_dataset, ['Name', 'Rank', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales'])

games_dataset.head(5)

In [None]:
# Discard the sparsely populated release years (insignificant for the model):
# anything released before FIRST_YEAR or after LAST_YEAR is removed.
FIRST_YEAR, LAST_YEAR = 1995, 2019

release_year = games_dataset['Year']
outside_window = (release_year < FIRST_YEAR) | (release_year > LAST_YEAR)

# Rows whose Year is missing compare False on both sides, so they are kept here.
games_dataset = games_dataset.drop(index=games_dataset[outside_window].index)

In [None]:
# Box plot of 'Year' again, this time after the out-of-range years were dropped.
games_dataset['Year'].plot.box()

# Normalizing useful data to int types

In [None]:
# Expand each categorical column into separate binary indicator columns,
# one per distinct value found in that column.
Norm = Normalizer()

# Processed in this order; each column's distinct values are read right before
# its own encoding call, i.e. after the previous column has been handled.
for categorical_column in ('Genre', 'Device', 'Publisher'):
    distinct_values = games_dataset[categorical_column].unique()
    Norm.one_hot_encoder(games_dataset, distinct_values, categorical_column)

In [None]:
# Preview the frame after the one-hot encoding step.
games_dataset.head(5)

In [None]:
# Remove missing data in two passes through the project helpers.
# NOTE(review): judging by the names, the first call drops null values (rows?)
# and the second drops null columns — confirm the exact semantics in DataManager.
games_dataset = Manager.remove_null_values(games_dataset)
games_dataset = Manager.remove_null_cols(games_dataset)

In [None]:
# Separate the target ('Global_Sales') from the remaining feature columns and
# hand both over as NumPy arrays.
target_column = 'Global_Sales'
y = games_dataset[target_column].to_numpy()
x = games_dataset.drop(columns=target_column).to_numpy()

In [None]:
# Scaling data
# fit_transform() both learns the per-feature statistics and applies them, so the
# separate scaler.fit(x) that used to precede it only repeated the same fit.
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

In [None]:
# Fit a PCA with default settings on the scaled features; its
# explained_variance_ratio_ is plotted in the next cell.
# NOTE: the name `pca` is rebound further down with n_components = 0.98.
pca = PCA()
pca.fit(x_scaled)

In [None]:
# Optional cap on how many leading components to plot; None means "all of them".
# `components` is also read later when the pipeline's PCA step is built.
components = None

# Cumulative explained variance, in percent, for every fitted component.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_ * 100)
available = len(cumulative_variance)

# Resolve the cap to a concrete count and clamp it to what the PCA provides, so
# the x and y series always have the same length (previously any non-None value
# made plt.plot fail with a shape mismatch).
components = available if components is None else min(components, available)

plt.plot(range(1, components+1), cumulative_variance[:components])
plt.xlabel("Number of components")
plt.ylabel("Explained variance (%)")

In [None]:
# Refit PCA with a fractional n_components.
# NOTE(review): per the scikit-learn docs a float in (0, 1) keeps as many
# components as needed to explain at least that share of the variance (98% here)
# — confirm for the installed version. This rebinds `pca` from the earlier cell.
pca = PCA(n_components = 0.98)
pca.fit(x_scaled)

In [None]:
# Running total of explained variance per retained component, expressed in percent.
cumulative_pct = np.cumsum(pca.explained_variance_ratio_) * 100
print(cumulative_pct)

In [None]:
# Project the scaled features onto the fitted principal components and show the
# resulting array's shape and contents.
x_pca = pca.transform(x_scaled)
print(x_pca.shape)
print(x_pca)

In [None]:
# Assemble a scale -> PCA -> logistic-regression pipeline.
# NOTE(review): `components` was resolved to the full component count in the
# variance-plot cell, so this PCA step keeps every component rather than the
# 0.98 variance threshold explored above — confirm that is intended.
# NOTE(review): the final step is a classifier although it is labelled
# 'regressor', and the pipeline is not fitted in the cells visible here.
_sc = StandardScaler()
_pca = PCA(n_components=components)
_model = LogisticRegression()

pipeline_steps = [
    ('std_scaler', _sc),
    ('pca', _pca),
    ('regressor', _model),
]
log_regress_model = Pipeline(steps=pipeline_steps)

In [None]:
# Convert the y values to integer class labels so a classifier will accept them.
# NOTE(review): y comes from the 'Global_Sales' column; if that column is
# continuous, LabelEncoder creates one class per distinct value, which turns a
# regression target into a many-class classification problem — confirm this is
# the intended modelling choice.
lab = preprocessing.LabelEncoder()
y = lab.fit_transform(y)

# View the transformed values.
print(y)

In [None]:
# Hold out 20% of the rows for evaluation. The split is done on the raw feature
# matrix `x`: `x_scaled` was standardized with statistics from the whole dataset,
# so splitting it would leak information about the test rows into training.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x, y, test_size=0.2, shuffle=True, random_state=12345)

# Learn the scaling statistics from the training rows only, then apply the same
# transformation to both partitions.
split_scaler = StandardScaler().fit(x_train_raw)
X_train = split_scaler.transform(x_train_raw)
X_test = split_scaler.transform(x_test_raw)

# Fit a default logistic regression and report its mean accuracy on the held-out rows.
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
log_reg.score(X_test,y_test)