# ESRB rating machine learning

In [69]:
# Install the libraries this notebook uses, each pinned to an exact version.
# %pip (rather than !pip) targets the environment of the running kernel.
%pip install numpy==1.24.1
%pip install pandas==1.5.3
%pip install seaborn==0.12.2
%pip install scikit-learn==1.2.0
%pip install plotly==5.12.0
%pip install matplotlib==3.6.3

Note: you may need to restart the kernel to use updated packages.


Note: you may need to restart the kernel to use updated packages.


In [None]:
# import general libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Dataset

In [None]:
# Load the ESRB rating data; the path is relative to the working directory
csv_path = 'Video_games_esrb_rating.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows of the raw data
df.head(n=5)

## Preprocessing

In [None]:
# One column name is misspelled in the source data; map it to the intended name
column_fixes = {"strong_janguage": "strong_language"}
df = df.rename(columns=column_fixes)

In [None]:
# List every column with its dtype and non-null count;
# comparing the non-null counts with the row count shows whether values are missing
df.info()

In [None]:
# Drop the title and console columns because they are not needed for the analysis.
# The result is reassigned instead of using inplace=True, and errors='ignore' skips
# columns that are already gone, so re-running this cell no longer raises KeyError.
df = df.drop(columns=['title', 'console'], errors='ignore')

In [None]:
# Rating names listed in the order of their numeric encoding used below
# (E=0, ET=1, T=2, M=3), so that ratings[i] names class i.
# The confusion matrices later in the notebook order their rows/columns by the
# sorted encoded label; df['esrb_rating'].unique() returns labels in order of first
# appearance in the data, which is not guaranteed to match and would mislabel the
# heatmap axes.
ratings = np.array(['E', 'ET', 'T', 'M'], dtype=object)
ratings

In [None]:
# Preview the first five rows again after the column clean-up
df.head(n=5)

In [None]:
# Encode the ESRB ratings as integers; some of the models need numeric targets
rating_codes = {'E': 0, 'ET': 1, 'T': 2, 'M': 3}
df['esrb_rating'] = df['esrb_rating'].replace(rating_codes)

In [None]:
# Separate the target column from the feature columns
target_column = 'esrb_rating'
X = df.drop(columns=[target_column])
y = df[target_column]

## Splitting dataset

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; SEED makes the split reproducible
# and is reused by the models below
SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

## Decision tree (Anwar Ammour)

In [None]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default hyperparameters.
# random_state is fixed to SEED (as the random forest already does): the tree
# permutes the features at each split, so without a seed the fitted tree, and
# every score derived from it, can differ between runs.
classifier = DecisionTreeClassifier(random_state=SEED)
classifier.fit(X_train, y_train)

In [None]:
# Predict the encoded rating of every test row with the fitted decision tree
y_pred_tree = classifier.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Evaluate the decision tree: raw confusion matrix first, then per-class metrics
tree_cm = confusion_matrix(y_test, y_pred_tree)
tree_report = classification_report(y_test, y_pred_tree)
print(tree_cm)
print(tree_report)

In [None]:
# Pair every input feature with the importance the fitted decision tree gave it
features_df = pd.DataFrame({
    'features': classifier.feature_names_in_,
    'importances': classifier.feature_importances_,
})

# Most important feature first
features_df_sorted = features_df.sort_values('importances', ascending=False)

# Horizontal bar chart without spines, axis labels or x ticks;
# every bar is annotated with its value instead
g = sns.barplot(x='importances', y='features',
                data=features_df_sorted, palette="rocket")
sns.despine(left=True, bottom=True)
g.set_title('Feature importances')
g.set(xlabel=None, ylabel=None, xticks=[])
for bars in g.containers:
    g.bar_label(bars, padding=2)

## Random Forest (Remco de Wilde)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 20 trees, each at most 11 levels deep, seeded for reproducibility
rfc_params = dict(n_estimators=20, max_depth=11, random_state=SEED)
rfc = RandomForestClassifier(**rfc_params)

In [None]:
# Fit the forest on the training set and predict the test labels in one chain
# (fit returns the fitted estimator itself)
y_pred_rt = rfc.fit(X_train, y_train).predict(X_test)

In [None]:
features = X.columns.values  # array with the name of each feature column

# Optional, disabled: draws every individual tree of the random forest.
# Uncomment to inspect the trees.
# for estimator in rfc.estimators_:
#     print(estimator)
#     plt.figure(figsize=(12, 6))
#     tree.plot_tree(estimator,
#                    feature_names=features,
#                    class_names=ratings,
#                    fontsize=8,
#                    filled=True,
#                    rounded=True)
#     plt.show()

In [None]:
# Confusion matrix of the random forest, drawn as an annotated heatmap
cm = confusion_matrix(y_test, y_pred_rt)
rf_heatmap = sns.heatmap(cm, annot=True, fmt='d', cmap='Greens')
rf_heatmap.set_title('ESRB Rating')

# Per-class precision / recall / F1
rf_report = classification_report(y_test, y_pred_rt)
print(rf_report)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
# Error metrics computed on the integer-encoded ratings
rf_mae = mean_absolute_error(y_test, y_pred_rt)
rf_mse = mean_squared_error(y_test, y_pred_rt)
print('Mean Absolute Error:', rf_mae)
print('Mean Squared Error:', rf_mse)
print('Root Mean Squared Error:', np.sqrt(rf_mse))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, multilabel_confusion_matrix

# One 2x2 one-vs-rest matrix per class, laid out as [[TN, FP], [FN, TP]]
mcm = multilabel_confusion_matrix(y_test, y_pred_rt)

# The loop variable must not be called `confusion_matrix`: that would rebind the
# imported sklearn function to a numpy array and break every later (or re-run)
# cell that calls confusion_matrix(...).
for class_matrix in mcm:
    # Row/column 0 is the negative case and row/column 1 the positive case,
    # so the tick labels have to be ordered False, True.
    disp = ConfusionMatrixDisplay(class_matrix, display_labels=['F', 'T'])
    disp.plot(include_values=True, cmap="Greens", ax=None, xticks_rotation="vertical")
    plt.show()

In [None]:
# Pair every input feature with the importance the fitted random forest gave it
features_df = pd.DataFrame({
    'features': rfc.feature_names_in_,
    'importances': rfc.feature_importances_,
})

# Most important feature first
features_df_sorted = features_df.sort_values('importances', ascending=False)

# Horizontal bar chart without spines, axis labels or x ticks;
# every bar is annotated with its value instead
g = sns.barplot(x='importances', y='features',
                data=features_df_sorted, palette="rocket")
sns.despine(left=True, bottom=True)
g.set_title('Feature importances')
g.set(xlabel=None, ylabel=None, xticks=[])
for bars in g.containers:
    g.bar_label(bars, padding=20)
    # Shrink the vertical margin around the bars
    g.margins(y=0.005)

## k-NN (Thom Hooijer)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Scaler with default settings; it is fitted on the training features in the next cell
scaler = StandardScaler()

In [None]:
# Learn the scaling parameters from the training set only,
# then apply that same transformation to both the train and the test features
X_train_knn = scaler.fit(X_train).transform(X_train)
X_test_knn = scaler.transform(X_test)

In [None]:
# k-NN classifier with default settings, trained on the scaled training features
knnc = KNeighborsClassifier()
knnc.fit(X_train_knn, y_train)

In [None]:
# Predict the encoded rating of every row in the scaled test set
y_pred_knn = knnc.predict(X_test_knn)

In [None]:
from sklearn.metrics import confusion_matrix

# Raw confusion matrix of the default k-NN model
confusion_matrix(y_test.to_numpy(), y_pred_knn)

In [None]:
# Label the confusion matrix with the rating names for easier interpretation
# NOTE(review): assumes `ratings` is ordered like the numeric encoding (0..3) - confirm
classes_names = ratings
knn_matrix = confusion_matrix(y_test, y_pred_knn)
cm = pd.DataFrame(knn_matrix, index=classes_names, columns=classes_names)

knn_heatmap = sns.heatmap(cm, annot=True, fmt='d', cmap='Greens')
knn_heatmap.set_title('ESRB Rating')
print(classification_report(y_test, y_pred_knn))

In [None]:
from sklearn.metrics import f1_score
# Weighted F1 score on the test set for every k from 1 up to and including 39
# NOTE(review): this search uses the unscaled X_train / X_test, unlike the default
# k-NN model above, and picks k on the test set - confirm both are intended
f1s = []

for k in range(1, 40):
    candidate = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    candidate_pred = candidate.predict(X_test)
    # average='weighted' combines the per-class F1 scores of the 4 classes,
    # weighted by how often each class occurs
    f1s.append(f1_score(y_test, candidate_pred, average='weighted'))

In [None]:
# Line chart of the weighted F1 score against the k value that produced it
k_values = range(1, 40)
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(k_values, f1s, color='blue', marker='.',
        markerfacecolor='blue', markersize=10)
ax.set_title('F1 Score K Value')
ax.set_xlabel('K Value')
ax.set_ylabel('F1 Score')

In [None]:
# Put the F1 scores in a single-column DataFrame (row i holds the score for k = i + 1)
t = pd.DataFrame(f1s)

In [None]:
# Row label of the highest F1 score (the first one if several are tied)
best_row = t.idxmax().iloc[0]
# Rows are numbered from 0 while k started at 1, so shift by one to get k
optimal_n_index_value = int(best_row) + 1

In [None]:
# Train a k-NN classifier with the k value found above and evaluate it on the test set
optimal_knn_classifier = KNeighborsClassifier(n_neighbors=optimal_n_index_value)
optimal_knn_pred = optimal_knn_classifier.fit(X_train, y_train).predict(X_test)
optimal_knn_report = classification_report(y_test, optimal_knn_pred)
print(optimal_knn_report)

## Support Vector Machine (Remco de Wilde)

In [None]:
from sklearn.svm import SVC

In [None]:
# Linear-kernel SVM. probability=True enables predict_proba, which the soft-voting
# ensemble further down needs.
# random_state is fixed to SEED because the probability estimates are produced
# from internally shuffled data; without a seed they can differ between runs.
svclassifier = SVC(kernel='linear', probability=True, random_state=SEED)
# Train the model
svclassifier.fit(X_train, y_train)

In [None]:
# Predict the encoded rating of every test row with the fitted SVM
y_pred_svm = svclassifier.predict(X_test)

In [None]:
# Confusion matrix of the SVM as an annotated heatmap with rating names on both axes
# NOTE(review): assumes `ratings` is ordered like the numeric encoding (0..3) - confirm
cm = confusion_matrix(y_test, y_pred_svm)
svm_heatmap = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Greens',
    xticklabels=ratings,
    yticklabels=ratings,
)
svm_heatmap.set_title('ESRB Rating')

In [None]:
# Per-class precision / recall / F1 of the SVM
svm_report = classification_report(y_test, y_pred_svm)
print(svm_report)

## Ensemble Voting Classifier (Remco de Wilde)

In [None]:
from sklearn.ensemble import VotingClassifier

# Combine the four models above; soft voting uses their predicted class probabilities
voting_members = [
    ('RF', rfc),
    ('DT', classifier),
    ('SVC', svclassifier),
    ('KNN', optimal_knn_classifier),
]
eclf = VotingClassifier(estimators=voting_members, voting='soft')

In [None]:
# Train the voting classifier on the training split only.
# Fitting on the full X, y would let the ensemble see the test rows it is evaluated
# on below, which inflates its scores and makes them incomparable with the
# individual models (all of which are trained on X_train, y_train).
eclf = eclf.fit(X_train, y_train)

In [None]:
# Predict the encoded rating of every test row with the voting ensemble
y_pred_eclf = eclf.predict(X_test)

In [None]:
# Confusion matrix of the voting ensemble on the test set
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_eclf)

In [None]:
# Annotated heatmap of the ensemble's confusion matrix with rating names on both axes
# NOTE(review): assumes `ratings` is ordered like the numeric encoding (0..3) - confirm
Labels = ratings
eclf_heatmap = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Greens',
    xticklabels=Labels,
    yticklabels=Labels,
)
eclf_heatmap.set_title('ESRB Rating')

In [None]:
# Per-class precision / recall / F1 of the voting ensemble
eclf_report = classification_report(y_test, y_pred_eclf)
print(eclf_report)

## Conclusion

In [None]:
# TODO: write the conclusion of the model comparison here.

## Testing hyperparameters

### Testing `max_depth` for the Random Forest

In [None]:
# Train one random forest per max_depth from 1 up to and including 19 and
# record its error metrics on the test set
data = []
metric_names = ['depth', 'mae', 'mse', 'rmse']

for depth in range(1, 20):
    rfc2 = RandomForestClassifier(n_estimators=20,
                                  max_depth=depth,
                                  random_state=SEED)
    rfc2.fit(X_train, y_train)

    # Predict the test set labels and score them
    y_pred = rfc2.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    # One row per depth; RMSE is derived from the MSE
    serie = pd.Series([depth, mae, mse, np.sqrt(mse)], index=metric_names)
    data.append(serie)

In [None]:
# Collect the per-depth results (one Series per depth) into a DataFrame
# and show its columns, dtypes and non-null counts
df_depth = pd.DataFrame(data)
df_depth.info()

In [None]:
# Grouped bar chart of the three error metrics per depth.
# Setting the backend switches pandas plotting to plotly for the rest of the session.
pd.options.plotting.backend = "plotly"
error_columns = ['mae', 'mse', 'rmse']
axis_labels = {'value': 'Error', 'depth': 'Depth'}
df_depth.plot.bar(x='depth', y=error_columns, title='Random Forest Classifier',
                  barmode='group', labels=axis_labels)

In [None]:
# The lowest value of each column (this shows the minimum itself, not the depth at which it occurs):
df_depth.min()

# Conclusion (as recorded by the author):
# A depth of 17 gives the lowest error "rate" on every measured metric.
# NOTE(review): this was selected on the test set, and the main random forest above uses max_depth=11 - confirm

### Testing `n_estimators` for the Random Forest

In [None]:
# Train one random forest per tree count from 1 up to and including 99
# (max_depth fixed at 17) and record its error metrics on the test set
data = []
n_estimators = np.arange(1, 100)
metric_names = ['n_estimators', 'mae', 'mse', 'rmse']

for n_trees in n_estimators:
    rfc3 = RandomForestClassifier(n_estimators=n_trees,
                                  max_depth=17,
                                  random_state=SEED)
    rfc3.fit(X_train, y_train)

    # Predict the test set labels and score them
    y_pred = rfc3.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    # One row per tree count; RMSE is derived from the MSE
    serie = pd.Series([n_trees, mae, mse, np.sqrt(mse)], index=metric_names)
    data.append(serie)

In [None]:
# Collect the per-tree-count results (one Series each) into a DataFrame
# and show its columns, dtypes and non-null counts
df_n_estim = pd.DataFrame(data)
df_n_estim.info()

In [None]:
# Line chart of the three error metrics against the row index.
# Drawn with whichever pandas plotting backend is currently active.
error_columns = ['mae', 'mse', 'rmse']
df_n_estim.plot.line(y=error_columns)

In [None]:
# The lowest value of each column (the minimum itself, not the row where it occurs)
df_n_estim.min()

In [None]:
# Row label at which each column reaches its minimum. Rows are numbered from 0
# while n_estimators starts at 1, so row r corresponds to n_estimators = r + 1.
df_n_estim.idxmin()
# Conclusion (as recorded by the author): the best n_estimators value is 37
# NOTE(review): this was selected on the test set, and the main random forest above uses n_estimators=20 - confirm