In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the sample CSV into `df`. The path is relative, so the file must sit in
# the notebook's working directory.
df = pd.read_csv('cr_rf_sample_100k.csv')

# Win-condition cards this analysis focuses on. IDs are kept as strings so
# they compare equal to the pieces obtained by splitting `cards_str` on '|'.
top_wincons = [
    '26000058',  # Wall Breakers
    '26000029',  # Lava Hound
    '26000024',  # Royal Giant
    '26000021',  # Hog Rider
    '26000055',  # Mega Knight
    '28000004',  # Goblin Barrel
]

# Broader list of win-condition card IDs; it contains every ID in
# `top_wincons`. Order is the same as originally listed.
all_wincons = [
    '26000056', '27000002', '26000024', '26000067',
    '26000021', '26000036', '26000003', '26000059',
    '26000028', '26000058', '28000004', '26000006',
    '27000008', '26000060', '26000009', '26000085',
    '26000032', '26000051', '28000010', '26000029',
    '26000055', '26000015', '26000016', '26000020',
]

# Show the first five rows as a quick check that the file loaded as expected.
display(df.head())

Unnamed: 0,label,arena_id,avg_start_trophies,player_start_trophies,player_trophy_change,player_crowns,elixir_avg,troop_cnt,structure_cnt,spell_cnt,common_cnt,rare_cnt,epic_cnt,legendary_cnt,trophy_bin,cards_str
0,1,54000049,5516.0,5517.0,29.0,3.0,3.75,6,0,2,2,0,2,4,bin_7,26000041|26000022|26000012|28000004|26000055|2...
1,1,54000049,5471.5,5470.0,30.0,2.0,3.5,7,0,1,4,2,2,0,bin_6,26000019|26000000|26000013|28000001|26000011|2...
2,1,54000050,5395.5,5390.0,30.0,3.0,4.25,6,0,2,0,0,6,2,bin_6,26000009|28000015|26000035|26000023|26000015|2...
3,1,54000050,5348.0,5365.0,27.0,3.0,4.125,6,1,1,3,3,0,2,bin_6,26000029|26000057|26000039|28000001|26000080|2...
4,1,54000050,5240.5,5228.0,32.0,3.0,3.25,7,0,1,4,0,1,3,bin_6,26000049|26000037|26000055|26000032|26000056|2...


Quick EDA / Preparing the Data Set for the Model

In [24]:
# Basic overview of the raw frame: its size, its column names, and the
# distribution of the players' starting trophies.
num_rows, num_columns = df.shape
column_names = df.columns.tolist()
col = df['player_start_trophies']

# Each print call ends with an empty item so a blank line separates sections.
print(f"Number of rows: {num_rows}", f"Number of columns: {num_columns}", "", sep="\n")
print("Column names:", column_names, "", sep="\n")
print(col.describe())

Number of rows: 99706
Number of columns: 16

Column names:
['label', 'arena_id', 'avg_start_trophies', 'player_start_trophies', 'player_trophy_change', 'player_crowns', 'elixir_avg', 'troop_cnt', 'structure_cnt', 'spell_cnt', 'common_cnt', 'rare_cnt', 'epic_cnt', 'legendary_cnt', 'trophy_bin', 'cards_str']

count    99706.000000
mean      4659.892243
std        392.101112
min       3499.000000
25%       4388.000000
50%       4653.000000
75%       4912.000000
max       5786.000000
Name: player_start_trophies, dtype: float64


In [26]:
# Drop the columns that are not used further on. What remains is the label,
# the player's starting trophies, elixir_avg, legendary_cnt, trophy_bin and
# the deck string.
columns_to_drop = [
    'arena_id', 'avg_start_trophies', 'player_trophy_change', 'player_crowns',
    'troop_cnt', 'structure_cnt', 'spell_cnt',
    'common_cnt', 'rare_cnt', 'epic_cnt',
]
df_cleaned = df.drop(columns_to_drop, axis='columns')

# Narrow the focus to trophy bin 4 (4486-4938 trophies). The explicit .copy()
# makes df_filtered an independent frame, so later column assignments on it do
# not touch df_cleaned.
df_filtered = df_cleaned.loc[df_cleaned['trophy_bin'].eq('bin_4')].copy()
df_filtered

Unnamed: 0,label,player_start_trophies,elixir_avg,legendary_cnt,trophy_bin,cards_str
9,1,4803.0,3.375000,3,bin_4,26000055|26000049|28000001|26000041|26000032|2...
11,1,4871.0,3.250000,3,bin_4,26000055|28000008|26000037|26000032|26000049|2...
12,1,4864.0,4.125000,1,bin_4,26000009|28000015|26000063|26000043|28000012|2...
13,1,4868.0,4.125000,2,bin_4,26000003|28000010|28000008|26000018|26000049|2...
19,1,4861.0,3.625000,1,bin_4,26000036|26000012|26000040|26000004|28000008|2...
...,...,...,...,...,...,...
99694,0,4807.0,4.625000,1,bin_4,26000018|26000007|26000015|26000062|28000000|2...
99697,0,4838.0,4.250000,0,bin_4,26000021|26000017|26000022|28000002|27000010|2...
99698,0,4910.0,3.500000,1,bin_4,26000021|26000049|28000002|26000012|26000026|2...
99700,0,4883.0,4.500000,1,bin_4,26000024|26000017|26000022|28000008|28000011|2...


In [28]:
# Sanity check on the filtered frame: row count and the trophy range covered.
trophies = df_filtered['player_start_trophies']
min_trophy, max_trophy = trophies.min(), trophies.max()
num_rows = df_filtered.shape[0]

print(
    f"Total rows: {num_rows}",
    f"Minimum trophies: {min_trophy}",
    f"Maximum trophies: {max_trophy}",
    sep="\n",
)

Total rows: 26954
Minimum trophies: 4486.0
Maximum trophies: 4938.0


In [30]:
# Turn the pipe-separated deck string into a list of card-ID strings.
# This adds the `cards_list` column to df_filtered itself.
df_filtered['cards_list'] = df_filtered['cards_str'].map(lambda deck: deck.split('|'))

# Keep only decks that contain at least one of the focus win conditions.
has_top_wincon = df_filtered['cards_list'].map(
    lambda deck: not set(top_wincons).isdisjoint(deck)
)
df_filtered_wincons = df_filtered[has_top_wincon]

# The raw string is redundant once the list column exists.
df2 = df_filtered_wincons.drop(columns='cards_str')
df2

Unnamed: 0,label,player_start_trophies,elixir_avg,legendary_cnt,trophy_bin,cards_list
9,1,4803.0,3.375000,3,bin_4,"[26000055, 26000049, 28000001, 26000041, 26000..."
11,1,4871.0,3.250000,3,bin_4,"[26000055, 28000008, 26000037, 26000032, 26000..."
21,1,4696.0,3.750000,3,bin_4,"[26000046, 26000055, 28000008, 26000042, 28000..."
25,1,4771.0,4.500000,0,bin_4,"[26000024, 26000022, 27000010, 27000003, 28000..."
28,1,4914.0,3.125000,2,bin_4,"[26000021, 27000003, 28000004, 26000010, 26000..."
...,...,...,...,...,...,...
99670,0,4917.0,4.125000,2,bin_4,"[26000011, 26000062, 26000021, 26000042, 28000..."
99697,0,4838.0,4.250000,0,bin_4,"[26000021, 26000017, 26000022, 28000002, 27000..."
99698,0,4910.0,3.500000,1,bin_4,"[26000021, 26000049, 28000002, 26000012, 26000..."
99700,0,4883.0,4.500000,1,bin_4,"[26000024, 26000017, 26000022, 28000008, 28000..."


In [31]:
#turning card list into binary matrix for random forest
from sklearn.preprocessing import MultiLabelBinarizer

# One 0/1 column per card ID found in df2's decks; rows keep df2's index so
# the matrix can later be aligned with df2's other columns.
mlb = MultiLabelBinarizer()
card_indicator = mlb.fit_transform(df2['cards_list'])
feature_matrix_cards = pd.DataFrame(
    card_indicator,
    index=df2.index,
    columns=mlb.classes_,
)

num_rows, num_columns = feature_matrix_cards.shape

print(f"Number of rows: {num_rows}", f"Number of columns: {num_columns}", "", sep="\n")

Number of rows: 15652
Number of columns: 102



In [34]:
# Preview the 0/1 card-indicator matrix (one column per card ID, indexed like df2).
feature_matrix_cards

Unnamed: 0,26000000,26000001,26000002,26000003,26000004,26000005,26000006,26000007,26000008,26000009,...,28000009,28000010,28000011,28000012,28000013,28000014,28000015,28000016,28000017,28000018
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99670,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
99697,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99698,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99700,0,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0


In [36]:
# A row whose indicators sum to zero would be a deck with no cards at all;
# count how many such rows exist.
empty_decks = feature_matrix_cards.sum(axis='columns').eq(0)
empty_decks.sum()

0

Starting to Build the Model

In [39]:
# Append two deck-level numeric features to the card indicators, giving the
# model more than card presence to work with. Both frames share df2's index,
# so the column-wise concat lines rows up one-to-one.
numeric_cols = ['elixir_avg', 'legendary_cnt']
numeric_features = df2.loc[:, numeric_cols]

feature_blocks = [feature_matrix_cards, numeric_features]
feature_matrix = pd.concat(feature_blocks, axis='columns')

# Target: the win/loss label of each deck.
wins = df2.loc[:, 'label']

feature_matrix

Unnamed: 0,26000000,26000001,26000002,26000003,26000004,26000005,26000006,26000007,26000008,26000009,...,28000011,28000012,28000013,28000014,28000015,28000016,28000017,28000018,elixir_avg,legendary_cnt
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3.375000,3
11,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3.250000,3
21,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3.750000,3
25,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4.500000,0
28,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,3.125000,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99670,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,4.125000,2
99697,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,4.250000,0
99698,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3.500000,1
99700,0,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,4.500000,1


In [41]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# X = features, y = win/loss label. 20% of the rows are held out for testing,
# the remaining 80% are used for training. The fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    feature_matrix,
    wins,
    test_size=0.2,
    random_state=42,
)

# 100 trees, each capped at depth 12; n_jobs=-1 uses all CPU cores.
# fit() returns the estimator itself, so construction and training are chained.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=12,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# Predict on the held-out rows and report overall accuracy plus the
# per-class precision / recall / F1 breakdown.
y_pred = rf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.5046311082721175
              precision    recall  f1-score   support

           0       0.51      0.36      0.42      1576
           1       0.50      0.65      0.56      1555

    accuracy                           0.50      3131
   macro avg       0.51      0.51      0.49      3131
weighted avg       0.51      0.50      0.49      3131



In [42]:
# Rank the features by the fitted forest's importance scores.
importances = rf.feature_importances_

# Pair each score with its column name. The default integer index is kept, so
# each row's index is the feature's position in feature_matrix.
feature_importance_df = pd.DataFrame(
    {'feature': feature_matrix.columns, 'importance': importances}
)
feature_importance_df = feature_importance_df.sort_values('importance', ascending=False)

# Show the ten highest-ranked features and save the full ranking to disk.
print(feature_importance_df.head(10))
feature_importance_df.to_csv('feature_importance.csv', sep=',', index=False)

           feature  importance
102     elixir_avg    0.083444
103  legendary_cnt    0.044308
11        26000011    0.018518
17        26000017    0.018356
21        26000021    0.016930
91        28000008    0.016694
55        26000055    0.015751
83        28000000    0.015516
49        26000049    0.015498
12        26000012    0.015455


In [55]:
# NOTE(review): in this listing `df3` is first assigned in a later cell (the
# "#for SHAP" cell). On a fresh kernel run top-to-bottom this line raises
# NameError unless that cell has already been executed.
df3

Unnamed: 0,label,player_start_trophies,elixir_avg,legendary_cnt,trophy_bin,cards_list
9,1,4803.0,3.375000,3,bin_4,"[26000055, 26000049, 28000001, 26000041, 26000..."
11,1,4871.0,3.250000,3,bin_4,"[26000055, 28000008, 26000037, 26000032, 26000..."
12,1,4864.0,4.125000,1,bin_4,"[26000009, 28000015, 26000063, 26000043, 28000..."
13,1,4868.0,4.125000,2,bin_4,"[26000003, 28000010, 28000008, 26000018, 26000..."
19,1,4861.0,3.625000,1,bin_4,"[26000036, 26000012, 26000040, 26000004, 28000..."
...,...,...,...,...,...,...
99694,0,4807.0,4.625000,1,bin_4,"[26000018, 26000007, 26000015, 26000062, 28000..."
99697,0,4838.0,4.250000,0,bin_4,"[26000021, 26000017, 26000022, 28000002, 27000..."
99698,0,4910.0,3.500000,1,bin_4,"[26000021, 26000049, 28000002, 26000012, 26000..."
99700,0,4883.0,4.500000,1,bin_4,"[26000024, 26000017, 26000022, 28000008, 28000..."


In [58]:
# For SHAP: rebuild the card-indicator matrix over every bin_4 deck, not only
# the decks that contain a top win condition.
df_filtered['cards_list'] = df_filtered['cards_str'].map(lambda deck: deck.split('|'))
df3 = df_filtered.drop(columns='cards_str')

# A fresh binarizer is fitted here; it rebinds `mlb`.
mlb = MultiLabelBinarizer()
all_card_indicator = mlb.fit_transform(df3['cards_list'])
feature_matrix_all_cards = pd.DataFrame(
    all_card_indicator,
    index=df3.index,
    columns=mlb.classes_,
)

num_rows, num_columns = feature_matrix_all_cards.shape

print(f"Number of rows: {num_rows}", f"Number of columns: {num_columns}", "", sep="\n")

Number of rows: 26954
Number of columns: 102



In [88]:
# Pairwise card "synergy": the correlation between the 0/1 presence columns of
# two cards, computed over winning decks only.
X_wincon = feature_matrix_all_cards  # indicator matrix over all cards in df3's decks
y = df3['label']  # your target

# NOTE(review): rf_small is fitted here but not used by the rest of this cell;
# it is kept in case later cells rely on it.
rf_small = RandomForestClassifier(n_estimators=50, max_depth=5, random_state=42)
rf_small.fit(X_wincon, y)

X_corr = X_wincon[y==1]  # optional: only winning decks

# Work on an explicit copy of the correlation values. With pandas
# Copy-on-Write enabled, `.values` can hand back a read-only view, and the
# in-place np.fill_diagonal below would then raise
# "ValueError: assignment destination is read-only".
synergy_matrix = X_corr.corr().to_numpy().copy()
np.fill_diagonal(synergy_matrix, 0)  # zero self-correlation

cards = X_corr.columns

# Every unordered pair (i < j) exactly once, i.e. the strict upper triangle.
# np.triu_indices walks it row by row, the same order as a nested
# `for i ... for j in range(i+1, ...)` loop, so row numbers in synergy_df are
# unchanged.
first_idx, second_idx = np.triu_indices(len(cards), k=1)
synergy_list = list(zip(
    cards[first_idx],
    cards[second_idx],
    synergy_matrix[first_idx, second_idx],
))

synergy_df = pd.DataFrame(synergy_list, columns=["Card1", "Card2", "SynergyScore"])
# Highest correlation first. NaN scores (e.g. from a card column that is
# constant among the winning decks) are sorted to the end by default.
synergy_df = synergy_df.sort_values(by="SynergyScore", ascending=False)

print(synergy_df.head(20))

         Card1     Card2  SynergyScore
911   26000009  26000048      0.423890
3167  26000038  27000000      0.394544
992   26000010  26000038      0.388959
984   26000010  26000030      0.348808
1025  26000010  27000000      0.341452
2635  26000030  27000000      0.312029
1905  26000020  28000013      0.311937
2602  26000030  26000038      0.301202
2361  26000026  28000004      0.276986
2658  26000030  28000011      0.273607
1033  26000010  27000008      0.260207
4719  27000001  27000005      0.234230
3514  26000044  26000061      0.232219
4852  27000006  27000008      0.228608
4412  26000063  26000068      0.225434
1379  26000014  27000000      0.217847
1048  26000010  28000011      0.216567
4785  27000003  28000004      0.208838
3015  26000036  26000046      0.200415
669   26000006  28000002      0.197901


In [92]:
# Keep only the pairs in which at least one of the two cards is a focus win
# condition, ordered from strongest to weakest score.
involves_top_wincon = synergy_df[['Card1', 'Card2']].isin(top_wincons).any(axis=1)
synergy_wincon = synergy_df[involves_top_wincon].sort_values('SynergyScore', ascending=False)

synergy_wincon

Unnamed: 0,Card1,Card2,SynergyScore
2361,26000026,28000004,0.276986
4785,27000003,28000004,0.208838
3366,26000041,28000004,0.192765
2761,26000032,26000058,0.188864
2573,26000029,27000009,0.188839
...,...,...,...
5016,28000002,28000004,-0.107431
351,26000003,26000055,-0.108285
1945,26000021,26000056,-0.110375
605,26000006,26000021,-0.127208
