In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [2]:
# Route DataFrame.plot / Series.plot through plotly for the rest of the notebook.
pd.set_option("plotting.backend", "plotly")

In [3]:
# Read the ESRB ratings CSV (the path is resolved against the current working
# directory) and preview the first rows.
csv_path = '../Video_games_esrb_rating.csv'
df = pd.read_csv(csv_path)
df.head()


Unnamed: 0,title,console,alcohol_reference,animated_blood,blood,blood_and_gore,cartoon_violence,crude_humor,drug_reference,fantasy_violence,...,sexual_content,sexual_themes,simulated_gambling,strong_janguage,strong_sexual_content,suggestive_themes,use_of_alcohol,use_of_drugs_and_alcohol,violence,esrb_rating
0,Monster Jam Steel Titans 2,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,E
1,Subnautica: Below Zero,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ET
2,NIER REPLICANT VER.1.22474487139…,1,0,0,1,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,M
3,Jamestown+,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,ET
4,Neptunia Virtual Stars,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,T


In [4]:
# Remove the game title so it does not end up among the model's input columns.
# Rebinding the result (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it once 'title' is already gone no
# longer raises a KeyError.
df = df.drop(columns=['title'], errors='ignore')

In [5]:
# Convert the esrb_rating column to numeric
# Integer code assigned to each rating label (same codes as before).
RATING_CODES = {'E': 0, 'ET': 1, 'T': 2, 'M': 3}

ratings = df['esrb_rating'].unique()

# Fail loudly if the column holds a value the mapping does not cover; otherwise
# it would silently stay as-is and only surface later during fitting/scoring.
# Already-encoded values are accepted so the cell can be re-run safely.
unexpected = set(ratings) - set(RATING_CODES) - set(RATING_CODES.values())
if unexpected:
    raise ValueError(
        f"Unmapped esrb_rating values: {sorted(unexpected, key=str)}")

# One dict-based replace instead of four chained passes; the explicit cast
# guarantees an integer dtype rather than relying on what replace() infers.
df['esrb_rating'] = df['esrb_rating'].replace(RATING_CODES).astype(int)

In [6]:
# Feature matrix: every column except the rating; target vector: the rating itself.
X = df.drop(columns=['esrb_rating'])
y = df['esrb_rating']

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# (and the forests trained below with the same seed) repeatable.
SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

In [8]:
# Instantiate a random forest classifier
# Test different depths of the tree
#
# For every max_depth in DEPTHS a forest is fitted on the training split and
# scored on the test split. MAE / MSE / RMSE are computed directly on the
# values in y_test and y_pred, so they measure how far apart the predicted and
# true label values are rather than plain accuracy.
N_ESTIMATORS = 10
DEPTHS = range(1, 20)

data = []

for depth in DEPTHS:
    rfc = RandomForestClassifier(n_estimators=N_ESTIMATORS,
                                 max_depth=depth,
                                 random_state=SEED)

    rfc.fit(X_train, y_train)
    # Predict the test set labels
    y_pred = rfc.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    # Compute MSE once and derive RMSE from it instead of scoring twice.
    mse = mean_squared_error(y_test, y_pred)

    data.append(pd.Series([depth, mae, mse, np.sqrt(mse)],
                          index=['depth', 'mae', 'mse', 'rmse']))

In [9]:
# Stack the per-depth metric rows collected in `data` into a single table and
# print its schema.
# NOTE(review): this rebinds `df`, so the dataset loaded earlier is no longer
# reachable under that name once this cell has run.
df = pd.DataFrame(data=data)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   depth   19 non-null     float64
 1   mae     19 non-null     float64
 2   mse     19 non-null     float64
 3   rmse    19 non-null     float64
dtypes: float64(4)
memory usage: 736.0 bytes


In [10]:
# Grouped bar chart: one group per tree depth, one bar per error metric.
metric_columns = ['mae', 'mse', 'rmse']
axis_labels = {'value': 'Error', 'depth': 'Depth'}
df.plot.bar(
    x='depth',
    y=metric_columns,
    title='Random Forest Classifier',
    barmode='group',
    labels=axis_labels,
)