In [None]:
%pip install numpy==1.24.1
%pip install pandas==1.5.3
%pip install seaborn==0.12.2
%pip install scikit-learn==1.2.0
%pip install plotly==5.12.0
%pip install matplotlib==3.6.3

# Random Forest

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import tree
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, classification_report, confusion_matrix, multilabel_confusion_matrix, ConfusionMatrixDisplay

In [None]:
# Route pandas' built-in .plot() calls through plotly instead of the default backend.
pd.set_option("plotting.backend", "plotly")

## CSV dataset inladen

In [None]:
# Read the ESRB ratings CSV (located one directory above the notebook) and preview it.
csv_path = '../Video_games_esrb_rating.csv'
df = pd.read_csv(filepath_or_buffer=csv_path)
df.head(n=5)

In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

## Preprocessing

In [None]:
# Remove the 'title' column before modelling
# (presumably a free-text game name rather than a feature — confirm against the CSV).
# Reassigning the result instead of using inplace=True, together with
# errors='ignore', keeps this cell idempotent: re-running it no longer raises
# KeyError because 'title' is already gone.
df = df.drop(columns=['title'], errors='ignore')

# Describe the dataset. This is the cell's last expression on purpose:
# Jupyter only renders the value of the final expression, so calling it
# before the drop silently discarded the summary table.
df.describe()

In [None]:
# Collect the distinct target labels (in order of first appearance) and display them.
ratings = pd.unique(df['esrb_rating'])
ratings

In [None]:
# Preview the first five rows of the frame at this point in the preprocessing.
df.head()

In [None]:
# Encode the rating labels as integers, one exact-match replacement at a time,
# in the order E -> 0, ET -> 1, T -> 2, M -> 3.
encoded_rating = df['esrb_rating']
for rating_label, rating_code in (('E', 0), ('ET', 1), ('T', 2), ('M', 3)):
    encoded_rating = encoded_rating.replace(rating_label, rating_code)
df['esrb_rating'] = encoded_rating

In [None]:
# Separate the frame into the feature matrix X (every column except the target)
# and the target vector y (the encoded rating column).
target_column = 'esrb_rating'
X = df.drop(columns=[target_column])
y = df[target_column]

## Split dataset in training en test

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split reproducible.
SEED = 42
split_parts = train_test_split(X, y, test_size=0.2, random_state=SEED)
X_train, X_test, y_train, y_test = split_parts

## RandomForestClassifier

In [None]:
# Configure the random forest: 20 trees, each at most 11 levels deep,
# seeded with SEED so repeated runs build the same forest.
forest_params = {
    'n_estimators': 20,
    'max_depth': 11,
    'random_state': SEED,
}
rfc = RandomForestClassifier(**forest_params)


In [None]:
# Fit the forest on the training split, then predict labels for the held-out test rows.
# fit() returns the estimator itself, so the two steps can be chained.
y_pred = rfc.fit(X_train, y_train).predict(X_test)

## Visualiseer alle random decision trees

In [None]:

# Plot every individual decision tree in the fitted forest.

# Column names as a plain list: plot_tree documents feature_names as a list of
# strings, and stricter parameter validation rejects ndarray/Index inputs.
features = X.columns.tolist()

# Display names for the encoded classes 0..3, in ascending order as plot_tree expects.
# ESRB's category between Everyone and Teen is "Everyone 10+", not "Early Teen";
# the dataset's 'ET' code is assumed to denote it — confirm against the data source.
classes = ['Everyone', 'Everyone 10+', 'Teen', 'Mature']

for estimator in rfc.estimators_:
    print(estimator)
    # Explicit figure/axes so the tree is drawn on a known target and can be released.
    fig, ax = plt.subplots(figsize=(50, 30))
    tree.plot_tree(estimator,
                   feature_names=features,
                   class_names=classes,
                   fontsize=8,
                   filled=True,
                   rounded=True,
                   ax=ax)
    plt.show()
    # Free the (very large) figure so the loop does not pile up open figures in memory.
    plt.close(fig)

## Bereken modelnauwkeurigheid

In [None]:
# Multi-class confusion matrix (rows: true labels, columns: predicted labels),
# drawn as an annotated heatmap of integer counts.
cm = confusion_matrix(y_test, y_pred)
heatmap_ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Greens')
heatmap_ax.set_title('ESRB Rating')

# Per-class precision, recall and F1 plus overall accuracy.
report_text = classification_report(y_test, y_pred)
print(report_text)

In [None]:
# Regression-style error metrics computed on the integer-encoded labels;
# they measure how far, in encoded steps, the predictions are from the truth.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', rmse)


### Confusion matrix voor elk van de 4 classes binnen het model

In [None]:
# One-vs-rest 2x2 confusion matrix for every class.
# Each matrix is laid out as [[TN, FP], [FN, TP]]: index 0 is the negative
# ("not this class") outcome and index 1 the positive one.
mcm = multilabel_confusion_matrix(y_test, y_pred)

# Sorted unique labels found in y_test and y_pred — the same label order
# multilabel_confusion_matrix uses when no explicit `labels` are passed.
class_labels = np.union1d(y_test, y_pred)

# The loop variable must not be called `confusion_matrix`: that would overwrite
# the function imported from sklearn.metrics and break any later call to it.
for class_label, class_cm in zip(class_labels, mcm):
    # Tick labels follow the matrix indices, so False (index 0) comes before True (index 1).
    disp = ConfusionMatrixDisplay(class_cm, display_labels=['F', 'T'])
    disp.plot(include_values=True, cmap="Greens", ax=None, xticks_rotation="vertical")
    # Title each plot so the per-class figures can be told apart.
    disp.ax_.set_title(f'Class {class_label} vs. rest')
    plt.show()

## Lijst van features gesorteerd op belangrijkheid

In [None]:
# Pair every feature name seen during fit with its importance score from the forest.
features_df = pd.DataFrame({
    'features': rfc.feature_names_in_,
    'importances': rfc.feature_importances_,
})

# Most important feature first.
features_df_sorted = features_df.sort_values(by='importances', ascending=False)

# Horizontal bar chart, stripped of spines, axis labels and x ticks;
# the numeric value is written next to each bar instead.
g = sns.barplot(x='importances', y='features',
                data=features_df_sorted, palette="rocket")
sns.despine(left=True, bottom=True)
g.set_title('Feature importances')
g.set(xlabel=None, ylabel=None, xticks=[])
for bars in g.containers:
    g.bar_label(bars, padding=20)
    g.margins(y=0.005)