In [6]:
# Import the data

import pandas as pd

df = pd.read_csv('priest_popular_archetype_decks.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35867 entries, 0 to 35866
Columns: 1577 entries, deck_archetype to Murloc Holmes
dtypes: int64(1576), object(1)
memory usage: 431.5+ MB


In [7]:
df.head()

Unnamed: 0,deck_archetype,Circle of Healing,Flash Heal,Northshire Cleric,Power Word: Shield,Embrace the Shadow,Mind Blast,Shadow Word: Death,Shadow Word: Pain,Auchenai Soulpriest,...,Coilfang Constrictor,Snapdragon,Neptulon the Tidehunter,Ozumat,Prince Renathal,Ethereal Augmerchant,Replicat-o-tron,Cathedral of Atonement,Dispossessed Soul,Murloc Holmes
0,Control Priest,2,2,2,2,2,2,1,1,2,...,0,0,0,0,0,0,0,0,0,0
1,Dragon Priest,0,0,2,2,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Control Priest,2,0,2,2,0,0,2,0,2,...,0,0,0,0,0,0,0,0,0,0
3,Dragon Priest,0,0,2,2,0,0,2,1,0,...,0,0,0,0,0,0,0,0,0,0
4,C'Thun Priest,0,0,2,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Split the data for training

from sklearn.model_selection import train_test_split

X = df.drop(['deck_archetype'], axis=1) # Features (cards vectorized)
y = df['deck_archetype'] # Target (deck archetype to predict)

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [41]:
# Train the model

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [3000], # Higher the number, higher the accuracy
    'max_depth': [50], # Deeper trees can capture complex patterns but risk overfitting
    'min_samples_split': [10], # Minimum samples needed to split a node
    'min_samples_leaf': [2], # Minimum samples required at each leaf
    'max_features': ['sqrt'] # Number of features to consider for each split
}

rfc = RandomForestClassifier(random_state=42) # The answer of the universe
grid_search = GridSearchCV(rfc, param_grid, cv=3, scoring='precision', n_jobs=-1)
grid_search.fit(x_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_
print("Best Model Parameters:", grid_search.best_params_)




Best Model Parameters: {'max_depth': 50, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 3000}


In [43]:
from sklearn.metrics import classification_report, accuracy_score

y_pred = best_model.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.7283245051575132
                   precision    recall  f1-score   support

       Big Priest       0.71      0.79      0.74       242
    C'Thun Priest       0.88      0.95      0.91       303
     Combo Priest       0.65      0.41      0.50       441
   Control Priest       0.68      0.70      0.69      1583
    Dragon Priest       0.80      0.88      0.84      1154
Highlander Priest       0.72      0.75      0.73       292
    N'Zoth Priest       0.68      0.77      0.72       812
       OTK Priest       0.68      0.53      0.59       563
     Quest Priest       0.75      0.80      0.77       288
      Reno Priest       0.74      0.79      0.76       547
 Resurrect Priest       0.68      0.43      0.53       353
   Silence Priest       0.74      0.75      0.75       348
  Spiteful Priest       0.87      0.88      0.88       248

         accuracy                           0.73      7174
        macro avg       0.74      0.72      0.72      7174
     weighted avg       0

In [44]:
# Export the model

import joblib
joblib.dump(best_model, 'best_random_forest_model.pkl')

['best_random_forest_model.pkl']