In [18]:
from data import Database
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score

In [19]:
# Load the dataset through the project-local Database helper.
# NOTE(review): presumably returns a pandas DataFrame, given the column
# selection and .describe() calls applied to it later — confirm.
df = Database().dataframe()

In [20]:
# Separate the model inputs from the classification target.
feature_columns = ['Level', 'Health', 'Energy', 'Sanity']
target_column = 'Rarity'

X = df[feature_columns]
y = df[target_column]

In [21]:
# Summary statistics for the feature columns, to check their ranges and spread.
X.describe()

Unnamed: 0,Level,Health,Energy,Sanity
count,1020.0,1020.0,1020.0,1020.0
mean,7.956863,40.260196,40.308108,40.267461
std,4.481573,34.754796,34.862985,34.811319
min,1.0,1.12,1.09,1.61
25%,4.0,16.09,16.195,16.0275
50%,7.0,28.46,28.965,28.63
75%,11.0,54.0775,54.225,53.8675
max,20.0,187.26,190.75,186.36


In [22]:
# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# split reproducible between runs.
# NOTE(review): the split is not stratified on y — if the Rarity classes are
# imbalanced, consider passing stratify=y (and re-running the cells below).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [23]:
# Base random forest: fixed seed, trained on all available cores.
model_rf = RandomForestClassifier(random_state=42, n_jobs=-1)

# Candidate hyperparameter values (6 * 5 * 4 = 120 possible combinations).
parameters_rf = {
    'n_estimators': [25, 50, 75, 100, 150, 200],
    'max_depth': [10, 15, 30, 50, 80],
    'max_samples': [0.5, 0.75, 0.8, 0.9],
}

# Try 5 randomly sampled combinations, scoring each with 5-fold
# cross-validation on the training split only.
search_rf = RandomizedSearchCV(
    estimator=model_rf,
    param_distributions=parameters_rf,
    n_iter=5,
    cv=5,
    random_state=42,
)
search_rf.fit(X_train, y_train)

# Keep the winning estimator for evaluation and report its settings.
model_rf_ = search_rf.best_estimator_
print(search_rf.best_params_)

{'n_estimators': 75, 'max_samples': 0.9, 'max_depth': 15}


In [24]:
# Accuracy of the tuned random forest on the held-out test split.
rf_test_predictions = model_rf_.predict(X_test)
accuracy_score(y_test, rf_test_predictions)

0.9607843137254902

In [25]:
# Base gradient-boosting classifier with a fixed seed.
model_gb = GradientBoostingClassifier(random_state=42)

# Candidate hyperparameter values (6 * 5 = 30 possible combinations).
# NOTE(review): the max_depth candidates are the same as the random-forest
# grid; boosting usually relies on much shallower trees, so smaller depths
# (and learning_rate) may be worth adding to the search — confirm by re-running.
parameters_gb = {
    'n_estimators': [25, 50, 75, 100, 150, 200],
    'max_depth': [10, 15, 30, 50, 80],
}

# Try 5 randomly sampled combinations, scoring each with 5-fold
# cross-validation on the training split only.
search_gb = RandomizedSearchCV(
    estimator=model_gb,
    param_distributions=parameters_gb,
    n_iter=5,
    cv=5,
    random_state=42,
)
search_gb.fit(X_train, y_train)

# Keep the winning estimator for evaluation and report its settings.
model_gb_ = search_gb.best_estimator_
print(search_gb.best_params_)

{'n_estimators': 75, 'max_depth': 15}


In [26]:
# Accuracy of the tuned gradient-boosting model on the held-out test split.
gb_test_predictions = model_gb_.predict(X_test)
accuracy_score(y_test, gb_test_predictions)

0.9117647058823529

In [27]:
# Base ridge classifier with a fixed seed.
model_r = RidgeClassifier(random_state=42)

# Candidate hyperparameter values (9 * 5 = 45 possible combinations).
# NOTE(review): scikit-learn documents max_iter as applying to the iterative
# solvers; with the default solver on dense input it may be ignored, which
# would make this effectively a search over alpha only — confirm.
# NOTE(review): the features are passed unscaled even though their ranges
# differ (Level tops out at 20, the others near 190); ridge regularization is
# scale-sensitive, so standardizing first may be worth trying.
parameters_r = {
    'alpha': [1, 5, 10, 25, 50, 75, 100, 150, 200],
    'max_iter': [10, 15, 30, 50, 80],
}

# Try 5 randomly sampled combinations, scoring each with 5-fold
# cross-validation on the training split only.
search_r = RandomizedSearchCV(
    estimator=model_r,
    param_distributions=parameters_r,
    n_iter=5,
    cv=5,
    random_state=42,
)
search_r.fit(X_train, y_train)

# Keep the winning estimator for evaluation and report its settings.
model_r_ = search_r.best_estimator_
print(search_r.best_params_)

{'max_iter': 10, 'alpha': 75}


In [28]:
# Accuracy of the tuned ridge classifier on the held-out test split.
ridge_test_predictions = model_r_.predict(X_test)
accuracy_score(y_test, ridge_test_predictions)

0.4852941176470588

    The model with the best accuracy after tuning was the Random Forest Classifier, which scored about 96% on the held-out test set (versus about 91% for gradient boosting and about 49% for the ridge classifier). A random forest is an ensemble of decision trees built with bagging. A single decision tree is a comparatively weak model: it arrives at a prediction by passing a sample through a series of splits (decision nodes), each of which tests a feature, until the sample reaches a leaf node that holds the prediction. In an ensemble of decision trees, a collection of trees is trained and their predictions are combined to better fit the data. Bagging is an ensemble training method in which each weak learner is trained on a different random sample of the dataset, so the learners can be trained in parallel. I tuned my model using a randomized parameter search, which samples a fixed number of combinations from the given parameter values (five here) and scores each one with cross-validation. I searched over 'n_estimators' (the number of weak learners used), 'max_depth' (the maximum depth each tree is allowed to grow to), and 'max_samples' (the fraction of the training samples drawn to train each tree). Of the combinations tried, the most accurate Random Forest model had these parameters: 'n_estimators' of 75, 'max_samples' of 0.9 (90%), and 'max_depth' of 15.