In [None]:
import pandas as pd
import numpy as np
from pprint import pprint

from matplotlib import pyplot as plt


from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.feature_selection import SelectFromModel

In [None]:
df = pd.read_csv ('Final_Data_w_Colors.csv')
pd.set_option('display.max_columns', None)
df = df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'background_image','name', 'total_reviews','owners'])

In [None]:
df2 = pd.get_dummies(df, drop_first=True)

In [None]:
# Binning the review_score, it will be the target of our machine learning.
# The idea is we are seeing what features of our games dataset: Genre, Boxart, Release Month, etc. could help predict how well recieved the game is.
bins = [-0.1,.60,.70,.80,.90,1]
group_names = ['1','2','3','4','5']
df2['target'] = pd.cut(df2['review_score'], bins, labels = group_names)
df2 = df2.drop(columns=['review_score'])

In [None]:
# Splitting out the target to get X and Y data
X = df2.drop('target', axis=1)
y = df2['target']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# One hot encoding our target variable for machine learning.
num_classes = 6
y_train = to_categorical(y_train, num_classes)
y_test = to_categorical(y_test, num_classes)


In [None]:
# # Because the data is fairly sparse, a lasso regression would be appropriate model.
# lasso_reg = Lasso(max_iter=2000).fit(X_train_scaled, y_train)
# lasso_reg.score(X_test_scaled, y_test)

# lasso_sel = SelectFromModel(lasso_reg)
# lasso_sel.fit(X_train_scaled, y_train)

# SelectFromModel(estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
#                                 max_iter=1000, normalize=False, positive=False,
#                                 precompute=False, random_state=None,
#                                 selection='cyclic', tol=0.0001,
#                                 warm_start=False), max_features=None, norm_order=1, prefit=False, threshold=None)

# X_lasso_train, X_lasso_test, y_train, y_test = train_test_split(lasso_sel.transform(X), y, random_state=1)
# scaler = StandardScaler().fit(X_lasso_train)
# X_lasso_train_scaled = scaler.transform(X_lasso_train)
# X_lasso_test_scaled = scaler.transform(X_lasso_test)

# log_reg = LogisticRegression()
# log_reg.fit(X_lasso_train_scaled, y_train)
# print(f'Training Score: {log_reg.score(X_lasso_train_scaled, y_train)}')
# print(f'Testing Score: {log_reg.score(X_lasso_test_scaled, y_test)}')

In [11]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 500, num = 5)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(0, 100, num = 5)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [0, 25, 50, 75, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [100, 200, 300, 400, 500]}


In [12]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
clf = RandomForestClassifier()
# Random search of parameters, using 2 fold cross validation, 
# search across 50 different combinations, and use all available cores
clf_random = RandomizedSearchCV(estimator = clf, param_distributions = random_grid, n_iter = 50, cv = 2, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
clf_random.fit(X_train_scaled, y_train)

# n_iter is the number of interations we try
#cv is the number of cross validation folds we will try, to reduce chances of overfitting.

Fitting 2 folds for each of 50 candidates, totalling 100 fits


In [None]:
clf_random.best_params_

In [None]:
clf_best = RandomForestClassifier(n_estimators = 200, min_samples_split = 2, min_samples_leaf = 1, max_depth=None, bootstrap=False).fit(X_train_scaled, y_train)
print(f'Training Score: {clf_best.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf_best.score(X_test_scaled, y_test)}')

In [None]:
feature_importances = clf_best.feature_importances_

features = sorted(zip(X.columns, clf_best.feature_importances_), key = lambda x: x[1])
cols = [f[0] for f in features]
width = [f[1] for f in features]

fig, ax = plt.subplots()
fig.set_size_inches(10,20)
plt.margins(y=0.001)

ax.barh(y=cols, width=width)

plt.show()

In [None]:
sel = SelectFromModel(clf_best)
sel.fit(X_train_scaled, y_train)

X_selected_train, X_selected_test, y_train, y_test = train_test_split(sel.transform(X), y, random_state=1)
scaler = StandardScaler().fit(X_selected_train)
X_selected_train_scaled = scaler.transform(X_selected_train)
X_selected_test_scaled = scaler.transform(X_selected_test)

In [None]:
logistic = LogisticRegression()
logistic.fit(X_selected_train_scaled, y_train)
print(f'Training Score: {logistic.score(X_selected_train_scaled, y_train)}')
print(f'Testing Score: {logistic.score(X_selected_test_scaled, y_test)}')

In [None]:
df_columns = df2.columns.tolist()
df_columns.remove('target')
low_value_columns = []
for column in df_columns: 
    if df2[column].sum() <= 500:
        low_value_columns.append(column)
print(low_value_columns)

In [None]:
df_value = df2.drop(columns=low_value_columns)

In [None]:
# Splitting out the target to get X and Y data
X = df_value.drop('target', axis=1)
y = df_value['target']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# One hot encoding our target variable for machine learning.
num_classes = 6
y_train = to_categorical(y_train, num_classes)
y_test = to_categorical(y_test, num_classes)

In [None]:
clf_best = RandomForestClassifier(n_estimators = 200, min_samples_split = 2, min_samples_leaf = 1, max_depth=None, bootstrap=False).fit(X_train_scaled, y_train)
print(f'Training Score: {clf_best.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf_best.score(X_test_scaled, y_test)}')

In [None]:
import tensorflow as tf
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
nn_model = tf.keras.models.Sequential()

# First hidden layer
nn_model.add(tf.keras.layers.Dense(units=230, activation="relu", input_dim=92)) 

# Second hidden layer
nn_model.add(tf.keras.layers.Dense(units=62, activation="relu")) 

# Output layer
nn_model.add(tf.keras.layers.Dense(units=6, activation="softmax")) 

# Check the structure of the model
nn_model.summary() 

In [None]:
# Compile the model
nn_model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"]) 

In [None]:
# Train the model
fit_model = nn_model.fit(X_train_scaled, y_train, epochs=50) 

In [None]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")