In [6]:
# first let's get the data import/cleanup out of the way
import pandas as pd
import json
import os

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics, svm
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [19]:
# Feature columns, in the order each row is produced by _weapon_feature_row.
columns = [
    "is_advanced",
    "is_melee",
    "average_damage",
    "hands_to_use",
    "bulk",
    "cost",
    "number_of_traits",
]

# Weapon groups counted as ranged; every other group is treated as melee.
RANGED_GROUPS = ('bomb', 'bow', 'crossbow', 'dart', 'firearm', 'sling')

# Rows with exactly this cost are dropped from `df` as an outlier (`df2` keeps them).
OUTLIER_COST = 90

directory = "equipment"


def _weapon_feature_row(item):
    """Convert one parsed equipment JSON document into a feature row.

    Parameters:
        item: the dict loaded from a single equipment JSON file.

    Returns:
        A list of values ordered like `columns`, or None when the item should
        be ignored: it is not of type 'weapon', its lower-cased, dash-joined
        name differs from system.baseItem (i.e. it is not a base item), or it
        carries the 'magical' trait.

    Raises:
        KeyError / TypeError (or similar) when an expected key is missing or
        has an unexpected shape; the caller reports the file and re-raises.
    """
    name = item['name']
    if item['type'] != 'weapon':
        return None
    if name.lower().replace(' ', '-') != item['system']['baseItem']:
        return None
    if 'magical' in item['system']['traits']['value']:
        return None
    # okay we weeded out what we don't need, everything else is the base items

    system = item['system']  # every remaining field lives under this key

    is_advanced = 1 if system['category'] == 'advanced' else 0
    is_melee = 0 if system['group'] in RANGED_GROUPS else 1

    # average_damage: number of dice times the mean of one die, (size + 1) / 2.
    # The die is stored as a string such as 'd8'; a weapon without a die has
    # no rolled damage, so it contributes 0 rather than 0.5 per "die".
    die = system['damage']['die']
    if die:
        die_size = int(die[1:])
        average_damage = system['damage']['dice'] * (die_size / 2 + 0.5)
    else:
        average_damage = 0.0

    hands_to_use = 2 if 'two-hand' in system['usage']['value'] else 1

    # A bulk of 0.1 (presumably the data's encoding of "light" - confirm) counts as 0.
    bulk = system['bulk']['value']
    if bulk == 0.1:
        bulk = 0

    # cost expressed in gp: pp are worth 10, sp 0.1 and cp 0.01
    price = system['price']['value']
    cost = price.get('pp', 0) * 10 + price.get('gp', 0) + price.get('sp', 0) * 0.1 + price.get('cp', 0) * 0.01

    number_of_traits = len(system['traits']['value'])

    return [is_advanced, is_melee, average_damage, hands_to_use, bulk, cost, number_of_traits]


rows = []

# Sorted so the row order (and therefore the seeded train/test split) does not
# depend on the arbitrary order in which the OS lists the directory.
for filename in sorted(os.listdir(directory)):
    # os.path.join keeps the path valid on every OS, not only Windows.
    path = os.path.join(directory, filename)
    try:
        # The encoding is given explicitly so that valid UTF-8 JSON is not
        # skipped just because the platform default encoding differs.
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except UnicodeDecodeError:
        # Not UTF-8 text at all: skip it, but say so instead of dropping it silently.
        print(f"skipping {filename}: not valid UTF-8 text")
        continue

    try:
        row = _weapon_feature_row(data)
    except Exception:
        # Report the file that failed (always defined here), then re-raise
        # with the original traceback intact.
        print(filename)
        raise

    if row is not None:
        rows.append(row)

# Build the frame once from all rows instead of growing it one row at a time.
df2 = pd.DataFrame(rows, columns=columns, dtype=float)  # every base weapon, outlier included
df = df2[df2['cost'] != OUTLIER_COST]                    # same data without the cost outlier
df

Unnamed: 0,is_advanced,is_melee,average_damage,hands_to_use,bulk,cost,number_of_traits
0,0.0,1.0,5.5,2.0,2.0,1.0,3.0
1,0.0,0.0,2.5,1.0,0.0,4.0,2.0
2,1.0,1.0,3.5,1.0,1.0,5.0,4.0
3,0.0,0.0,4.5,2.0,1.0,25.0,1.0
4,1.0,1.0,4.5,1.0,1.0,2.0,2.0
...,...,...,...,...,...,...,...
261,0.0,1.0,2.5,1.0,0.0,2.0,6.0
262,1.0,0.0,3.5,2.0,2.0,8.0,4.0
263,0.0,0.0,2.5,1.0,1.0,3.0,3.0
264,0.0,1.0,3.5,1.0,0.0,1.0,4.0


In [26]:
# Predictors: every feature column except the target.
feature_columns = [
    "is_melee",
    "average_damage",
    "hands_to_use",
    "bulk",
    "cost",
    "number_of_traits",
]
X = df[feature_columns]

# Target: the is_advanced flag (1 for advanced weapons, 0 otherwise).
y = df['is_advanced']

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [None]:
# df2 has one of the big outlier values I otherwise removed, this is a test of how resilient SVM is to outliers.
# The experiment uses its own variable names and its own pipeline, so running the notebook
# top to bottom does not overwrite the clean split (X_train, X_test, ...) or `model`
# that the other cells rely on.
feature_columns_outlier = [
    "is_melee",
    "average_damage",
    "hands_to_use",
    "bulk",
    "cost",
    "number_of_traits",
]
X_outlier = df2[feature_columns_outlier]
y_outlier = df2['is_advanced']

# Same split settings as the clean data, so the two runs are comparable.
X_train_outlier, X_test_outlier, y_train_outlier, y_test_outlier = train_test_split(
    X_outlier, y_outlier, test_size = 0.3, random_state = 101
)

# Same pipeline as the main model; random_state fixes the shuffling used for the
# probability estimates so predict_proba is repeatable.
model_outlier = make_pipeline(StandardScaler(), svm.SVC(probability=True, random_state=101))
model_outlier.fit(X_train_outlier, y_train_outlier)

# Report the two metrics that are compared against the clean-data model.
acc_outlier = accuracy_score(y_test_outlier, model_outlier.predict(X_test_outlier))
auc_outlier = roc_auc_score(y_test_outlier.values, model_outlier.predict_proba(X_test_outlier)[:, 1])
print("Outlier-inclusive accuracy: {:.2f}%".format(acc_outlier * 100))
print("Outlier-inclusive ROC AUC: {:.4f}".format(auc_outlier))

In [27]:
# let's train up the model
# Features are standardised before the SVC. probability=True enables predict_proba,
# which the ROC AUC cell uses; random_state fixes the shuffling behind those
# probability estimates so the reported score is the same on every run.
model = make_pipeline(StandardScaler(), svm.SVC(probability=True, random_state=101))
model.fit(X_train, y_train)

In [28]:
# Score the held-out split: per-class precision/recall first, then overall accuracy.
predictions = model.predict(X_test)
acc = accuracy_score(y_test, predictions)

report = classification_report(y_test, predictions)
print(report)

print("\nModel overall accuracy: {:.2f}%\n".format(100 * acc))

              precision    recall  f1-score   support

         0.0       0.84      1.00      0.91        66
         1.0       1.00      0.07      0.13        14

    accuracy                           0.84        80
   macro avg       0.92      0.54      0.52        80
weighted avg       0.86      0.84      0.77        80


Model overall accuracy: 83.75%



In [None]:
# At first I was confused, because in my first test
# this model had the same accuracy,
# but I quickly saw that this was a coincidence, as it predicts different values for some outcomes.

# However, this still clearly points to my poor choice of dataset, as I hit a wall with machine learning models.

# Wanting to try outlier values, I added the outlier back and the accuracy barely dropped (less than a 5% change, which is negligible at this sample size).

# ROC AUC computed from column 1 of predict_proba (the is_advanced == 1 class for 0/1 labels).
positive_class_proba = model.predict_proba(X_test)[:, 1]
roc_auc_score(y_test.values, positive_class_proba)
# same difference on the roc auc score

np.float64(0.8262987012987013)

final thoughts

I personally did not see much difference between logistic regression and SVC.

This mostly comes down to my poor choice of data (I'm just not good at data analytics; it's not my field).