In [1]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
import numpy as np

In [2]:
# Load the jsonl data
def load_jsonl(input_path):
    with open(input_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        data = [json.loads(line.strip()) for line in lines]
    return data


def hellinger_distance(p, q):
    return np.sqrt(np.sum((np.sqrt(p) - np.sqrt(q)) ** 2)) / np.sqrt(2)


def labels_to_probs(labels, preds):
    label_dict = {'handling': 0, 'strength': 0, 'deception': 0, 'performance': 0, 'arcana': 0, 'athletics': 0, 'nature': 0, 'insight': 0, 'survival': 0, 'persuasion': 0, 'history': 0,
                  'perception': 0, 'charisma': 0, 'intimidation': 0, 'acrobatics': 0, 'intelligence': 0, 'dexterity': 0, 'investigation': 0, 'stealth': 0, 'wisdom': 0, 'hand': 0, 'religion': 0, 'medicine': 0}

    pred_dict = {'handling': 0, 'strength': 0, 'deception': 0, 'performance': 0, 'arcana': 0, 'athletics': 0, 'nature': 0, 'insight': 0, 'survival': 0, 'persuasion': 0, 'history': 0,
                  'perception': 0, 'charisma': 0, 'intimidation': 0, 'acrobatics': 0, 'intelligence': 0, 'dexterity': 0, 'investigation': 0, 'stealth': 0, 'wisdom': 0, 'hand': 0, 'religion': 0, 'medicine': 0}

    for label in labels:
        label_dict[label] += 1

    for pred in preds:
        pred_dict[pred] += 1

    label_probs = [label_dict[label] /
                   len(labels) for label in sorted(label_dict.keys())]
    pred_probs = [pred_dict[pred] / len(preds)
                  for pred in sorted(pred_dict.keys())]

    return label_probs, pred_probs


train_data = load_jsonl('data/train.jsonl')
val_data = load_jsonl('data/test.jsonl')
test_data = load_jsonl('test_cleaned.jsonl')

train_texts = [item['input'] for item in train_data]
train_labels = [item['target'] for item in train_data]
val_texts = [item['input'] for item in val_data]
val_labels = [item['target'] for item in val_data]

test_texts = [item['input'] for item in test_data]
test_intents = [item['intent'] for item in test_data]
test_labels = [item['skill'][0] for item in test_data]
test_target = [item['target'] for item in test_data]


In [3]:
# Encode labels from strings to integers
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_labels)
val_labels_encoded = label_encoder.transform(val_labels)
test_labels_encoded = label_encoder.transform(test_labels)

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000)  # Adjust the max_features and other parameters as needed

# Fit the vectorizer on training data and transform training and validation texts
train_features = vectorizer.fit_transform(train_texts)
val_features = vectorizer.transform(val_texts)

# Logistic Regression

In [4]:
# Train a Logistic Regression classifier
lr_clf = LogisticRegression(max_iter=10000)  # Adjust the max_iter and other parameters as needed
lr_clf.fit(train_features, train_labels_encoded)

# Predict on validation set
val_predictions_encoded = lr_clf.predict(val_features)

test_predictions_encoded = lr_clf.predict(vectorizer.transform(test_texts))
test_predictions_encoded_intent = lr_clf.predict(vectorizer.transform(test_intents))
test_predictions_encoded_with_intent = lr_clf.predict(vectorizer.transform([f'{text} {intent}' for text, intent in zip(test_texts, test_intents)]))
test_target_pred_encoded = lr_clf.predict(vectorizer.transform(test_target))
test_target_with_intent_encoded = lr_clf.predict(vectorizer.transform([f'{text} {intent}' for text, intent in zip(test_target, test_intents)]))

# Decode the predictions back to original labels
val_predictions = label_encoder.inverse_transform(val_predictions_encoded)

test_predictions = label_encoder.inverse_transform(test_predictions_encoded)
test_predictions_intent = label_encoder.inverse_transform(test_predictions_encoded_intent)
test_predictions_with_intent = label_encoder.inverse_transform(test_predictions_encoded_with_intent)
test_target_pred = label_encoder.inverse_transform(test_target_pred_encoded)
test_target_with_intent = label_encoder.inverse_transform(test_target_with_intent_encoded)

total = 0
correct = 0
for i in range(len(val_labels)):
    if val_labels[i] == val_predictions[i]:
        correct += 1
    total += 1
# Print accuracy percentage rounded to 2 decimal points
print("Logistic regression accuracy: {:.2f}%".format(correct / total * 100))
label_prob, pred_prob = labels_to_probs(val_labels, val_predictions)
print("Hellinger distance: {:.2f}".format(hellinger_distance(label_prob, pred_prob)))

total = 0
correct = 0
for i in range(len(test_labels)):
    if test_labels[i] == test_predictions[i]:
        correct += 1
    total += 1
# Print accuracy percentage rounded to 2 decimal points
print("Logistic regression accuracy (test text only): {:.2f}%".format(correct / total * 100))
label_prob, pred_prob = labels_to_probs(test_labels, test_predictions)
print("Hellinger distance: {:.2f}".format(hellinger_distance(label_prob, pred_prob)))

total = 0
correct = 0
for i in range(len(test_labels)):
    if test_labels[i] == test_predictions_intent[i]:
        correct += 1
    total += 1
# Print accuracy percentage rounded to 2 decimal points
print("Logistic regression accuracy (test intent only): {:.2f}%".format(correct / total * 100))
label_prob, pred_prob = labels_to_probs(test_labels, test_predictions_intent)
print("Hellinger distance: {:.2f}".format(hellinger_distance(label_prob, pred_prob)))

total = 0
correct = 0
for i in range(len(test_labels)):
    if test_labels[i] == test_predictions_with_intent[i]:
        correct += 1
    total += 1
# Print accuracy percentage rounded to 2 decimal points
print("Logistic regression accuracy (text + intent): {:.2f}%".format(correct / total * 100))
label_prob, pred_prob = labels_to_probs(test_labels, test_predictions_with_intent)
print("Hellinger distance: {:.2f}".format(hellinger_distance(label_prob, pred_prob)))

total = 0
correct = 0
for i in range(len(test_labels)):
    if test_labels[i] == test_target_pred[i]:
        correct += 1
    total += 1
# Print accuracy percentage rounded to 2 decimal points
print("Logistic regression accuracy (target): {:.2f}%".format(correct / total * 100))
label_prob, pred_prob = labels_to_probs(test_labels, test_target_pred)
print("Hellinger distance: {:.2f}".format(hellinger_distance(label_prob, pred_prob)))

total = 0
correct = 0
for i in range(len(test_labels)):
    if test_labels[i] == test_target_with_intent[i]:
        correct += 1
    total += 1
# Print accuracy percentage rounded to 2 decimal points
print("Logistic regression accuracy (target + intent): {:.2f}%".format(correct / total * 100))
label_prob, pred_prob = labels_to_probs(test_labels, test_target_with_intent)
print("Hellinger distance: {:.2f}".format(hellinger_distance(label_prob, pred_prob)))

Logistic regression accuracy: 46.77%
Hellinger distance: 0.38
Logistic regression accuracy (test text only): 23.73%
Hellinger distance: 0.57
Logistic regression accuracy (test intent only): 35.12%
Hellinger distance: 0.49
Logistic regression accuracy (text + intent): 25.14%
Hellinger distance: 0.58
Logistic regression accuracy (target): 30.73%
Hellinger distance: 0.47
Logistic regression accuracy (target + intent): 34.01%
Hellinger distance: 0.47


# Naive Bayes

In [5]:
# Train a Multinomial Naive Bayes classifier
nb_clf = MultinomialNB()
nb_clf.fit(train_features, train_labels_encoded)

# Predict on validation set
val_predictions_encoded = nb_clf.predict(val_features)

test_predictions_encoded = nb_clf.predict(vectorizer.transform(test_texts))
test_predictions_encoded_intent = nb_clf.predict(vectorizer.transform(test_intents))
test_predictions_encoded_with_intent = nb_clf.predict(vectorizer.transform([f'{text} {intent}' for text, intent in zip(test_texts, test_intents)]))
test_target_pred_encoded = nb_clf.predict(vectorizer.transform(test_target))
test_target_with_intent_encoded = nb_clf.predict(vectorizer.transform([f'{text} {intent}' for text, intent in zip(test_target, test_intents)]))

# Decode the predictions back to original labels
val_predictions = label_encoder.inverse_transform(val_predictions_encoded)

test_predictions = label_encoder.inverse_transform(test_predictions_encoded)
test_predictions_intent = label_encoder.inverse_transform(test_predictions_encoded_intent)
test_predictions_with_intent = label_encoder.inverse_transform(test_predictions_encoded_with_intent)
test_target_pred = label_encoder.inverse_transform(test_target_pred_encoded)
test_target_with_intent = label_encoder.inverse_transform(test_target_with_intent_encoded)

total = 0
correct = 0
for i in range(len(val_labels)):
    if val_labels[i] == val_predictions[i]:
        correct += 1
    total += 1
print("Naive Bayes accuracy: {:.2f}%".format(correct / total * 100))
label_prob, pred_prob = labels_to_probs(val_labels, val_labels)
print("Hellinger distance: {:.2f}".format(hellinger_distance(label_prob, pred_prob)))

total = 0
correct = 0
for i in range(len(test_labels)):
    if test_labels[i] == test_predictions[i]:
        correct += 1
    total += 1
# Print accuracy percentage rounded to 2 decimal points
print("Naive Bayes accuracy (test text only): {:.2f}%".format(correct / total * 100))
label_prob, pred_prob = labels_to_probs(test_labels, test_predictions)
print("Hellinger distance: {:.2f}".format(hellinger_distance(label_prob, pred_prob)))

total = 0
correct = 0
for i in range(len(test_labels)):
    if test_labels[i] == test_predictions_intent[i]:
        correct += 1
    total += 1
# Print accuracy percentage rounded to 2 decimal points
print("Naive Bayes accuracy (test intent only): {:.2f}%".format(correct / total * 100))
label_prob, pred_prob = labels_to_probs(test_labels, test_predictions_intent)
print("Hellinger distance: {:.2f}".format(hellinger_distance(label_prob, pred_prob)))

total = 0
correct = 0
for i in range(len(test_labels)):
    if test_labels[i] == test_predictions_with_intent[i]:
        correct += 1
    total += 1
# Print accuracy percentage rounded to 2 decimal points
print("Naive Bayes accuracy (text + intent): {:.2f}%".format(correct / total * 100))
label_prob, pred_prob = labels_to_probs(test_labels, test_predictions_with_intent)
print("Hellinger distance: {:.2f}".format(hellinger_distance(label_prob, pred_prob)))

total = 0
correct = 0
for i in range(len(test_labels)):
    if test_labels[i] == test_target_pred[i]:
        correct += 1
    total += 1
# Print accuracy percentage rounded to 2 decimal points
print("Naive Bayes accuracy (target): {:.2f}%".format(correct / total * 100))
label_prob, pred_prob = labels_to_probs(test_labels, test_target_pred)
print("Hellinger distance: {:.2f}".format(hellinger_distance(label_prob, pred_prob)))

total = 0
correct = 0
for i in range(len(test_labels)):
    if test_labels[i] == test_target_with_intent[i]:
        correct += 1
    total += 1
# Print accuracy percentage rounded to 2 decimal points
print("Naive Bayes accuracy (target + intent): {:.2f}%".format(correct / total * 100))
label_prob, pred_prob = labels_to_probs(test_labels, test_target_with_intent)
print("Hellinger distance: {:.2f}".format(hellinger_distance(label_prob, pred_prob)))

Naive Bayes accuracy: 29.29%
Hellinger distance: 0.00
Naive Bayes accuracy (test text only): 23.33%
Hellinger distance: 0.71
Naive Bayes accuracy (test intent only): 23.28%
Hellinger distance: 0.72
Naive Bayes accuracy (text + intent): 23.33%
Hellinger distance: 0.72
Naive Bayes accuracy (target): 23.37%
Hellinger distance: 0.71
Naive Bayes accuracy (target + intent): 23.28%
Hellinger distance: 0.72


# SVM

In [6]:
# Train a Linear SVM classifier
svm_clf = LinearSVC(max_iter=10000, random_state=42)  # You can adjust the max_iter and other parameters as needed
svm_clf.fit(train_features, train_labels_encoded)

calibrated_svc = CalibratedClassifierCV(svm_clf, cv=5)
calibrated_svc.fit(train_features, train_labels_encoded)

# Predict on validation set
val_predictions_encoded = svm_clf.predict(val_features)
calibrated_svc_predictions_encoded = calibrated_svc.predict(val_features)

test_predictions_encoded = svm_clf.predict(vectorizer.transform(test_texts))
test_predictions_encoded_intent = svm_clf.predict(vectorizer.transform(test_intents))
test_predictions_encoded_with_intent = svm_clf.predict(vectorizer.transform([f'{text} {intent}' for text, intent in zip(test_texts, test_intents)]))
test_target_pred_encoded = svm_clf.predict(vectorizer.transform(test_target))
test_target_with_intent_encoded = svm_clf.predict(vectorizer.transform([f'{text} {intent}' for text, intent in zip(test_target, test_intents)]))

# Decode the predictions back to original labels
val_predictions = label_encoder.inverse_transform(val_predictions_encoded)
calibrated_svc_predictions = label_encoder.inverse_transform(calibrated_svc_predictions_encoded)

test_predictions = label_encoder.inverse_transform(test_predictions_encoded)
test_predictions_intent = label_encoder.inverse_transform(test_predictions_encoded_intent)
test_predictions_with_intent = label_encoder.inverse_transform(test_predictions_encoded_with_intent)
test_target_pred = label_encoder.inverse_transform(test_target_pred_encoded)
test_target_with_intent = label_encoder.inverse_transform(test_target_with_intent_encoded)

total = 0
correct = 0
cali_total = 0
cali_correct = 0

for i in range(len(val_labels)):
    if val_labels[i] == val_predictions[i]:
        correct += 1
    if val_labels[i] == calibrated_svc_predictions[i]:
        cali_correct += 1
    total += 1
    cali_total += 1

print("SVM accuracy: {:.2f}%".format(correct / total * 100))
print("Calibrated SVM accuracy: {:.2f}%".format(cali_correct / cali_total * 100))
label_prob, pred_prob = labels_to_probs(val_labels, val_predictions)
print("Hellinger distance: {:.2f}".format(hellinger_distance(label_prob, pred_prob)))

total = 0
correct = 0
for i in range(len(test_labels)):
    if test_labels[i] == test_predictions[i]:
        correct += 1
    total += 1
# Print accuracy percentage rounded to 2 decimal points
print("SVM accuracy (test text only): {:.2f}%".format(correct / total * 100))
label_prob, pred_prob = labels_to_probs(test_labels, test_predictions)
print("Hellinger distance: {:.2f}".format(hellinger_distance(label_prob, pred_prob)))

total = 0
correct = 0
for i in range(len(test_labels)):
    if test_labels[i] == test_predictions_intent[i]:
        correct += 1
    total += 1
# Print accuracy percentage rounded to 2 decimal points
print("SVM accuracy (test intent only): {:.2f}%".format(correct / total * 100))
label_prob, pred_prob = labels_to_probs(test_labels, test_predictions_intent)
print("Hellinger distance: {:.2f}".format(hellinger_distance(label_prob, pred_prob)))

total = 0
correct = 0
for i in range(len(test_labels)):
    if test_labels[i] == test_predictions_with_intent[i]:
        correct += 1
    total += 1
# Print accuracy percentage rounded to 2 decimal points
print("SVM accuracy (text + intent): {:.2f}%".format(correct / total * 100))
label_prob, pred_prob = labels_to_probs(test_labels, test_predictions_with_intent)
print("Hellinger distance: {:.2f}".format(hellinger_distance(label_prob, pred_prob)))

total = 0
correct = 0
for i in range(len(test_labels)):
    if test_labels[i] == test_target_pred[i]:
        correct += 1
    total += 1
# Print accuracy percentage rounded to 2 decimal points
print("SVM accuracy (target): {:.2f}%".format(correct / total * 100))
label_prob, pred_prob = labels_to_probs(test_labels, test_target_pred)
print("Hellinger distance: {:.2f}".format(hellinger_distance(label_prob, pred_prob)))

total = 0
correct = 0
for i in range(len(test_labels)):
    if test_labels[i] == test_target_with_intent[i]:
        correct += 1
    total += 1
# Print accuracy percentage rounded to 2 decimal points
print("SVM accuracy (target + intent): {:.2f}%".format(correct / total * 100))
label_prob, pred_prob = labels_to_probs(test_labels, test_target_with_intent)
print("Hellinger distance: {:.2f}".format(hellinger_distance(label_prob, pred_prob)))

SVM accuracy: 85.25%
Calibrated SVM accuracy: 84.95%
Hellinger distance: 0.06
SVM accuracy (test text only): 22.22%
Hellinger distance: 0.29
SVM accuracy (test intent only): 42.93%
Hellinger distance: 0.26
SVM accuracy (text + intent): 25.68%
Hellinger distance: 0.29
SVM accuracy (target): 29.40%
Hellinger distance: 0.24
SVM accuracy (target + intent): 38.27%
Hellinger distance: 0.24


# Random Forest

In [7]:
# Train a Random Forest classifier
rf_clf = RandomForestClassifier(n_estimators=300, random_state=42)  # Adjust the n_estimators and other parameters as needed
rf_clf.fit(train_features, train_labels_encoded)

# Predict on validation set
val_predictions_encoded = rf_clf.predict(val_features)

test_predictions_encoded = rf_clf.predict(vectorizer.transform(test_texts))
test_predictions_encoded_intent = rf_clf.predict(vectorizer.transform(test_intents))
test_predictions_encoded_with_intent = rf_clf.predict(vectorizer.transform([f'{text} {intent}' for text, intent in zip(test_texts, test_intents)]))
test_target_pred_encoded = rf_clf.predict(vectorizer.transform(test_target))
test_target_with_intent_encoded = rf_clf.predict(vectorizer.transform([f'{text} {intent}' for text, intent in zip(test_target, test_intents)]))

# Decode the predictions back to original labels
val_predictions = label_encoder.inverse_transform(val_predictions_encoded)

test_predictions = label_encoder.inverse_transform(test_predictions_encoded)
test_predictions_intent = label_encoder.inverse_transform(test_predictions_encoded_intent)
test_predictions_with_intent = label_encoder.inverse_transform(test_predictions_encoded_with_intent)
test_target_pred = label_encoder.inverse_transform(test_target_pred_encoded)
test_target_with_intent = label_encoder.inverse_transform(test_target_with_intent_encoded)

total = 0
correct = 0
for i in range(len(val_labels)):
    if val_labels[i] == val_predictions[i]:
        correct += 1
    total += 1
print("Random Forest accuracy: {:.2f}%".format(correct / total * 100))
label_prob, pred_prob = labels_to_probs(val_labels, val_predictions)
print("Hellinger distance: {:.2f}".format(hellinger_distance(label_prob, pred_prob)))

total = 0
correct = 0
for i in range(len(test_labels)):
    if test_labels[i] == test_predictions[i]:
        correct += 1
    total += 1
# Print accuracy percentage rounded to 2 decimal points
print("RF accuracy (test text only): {:.2f}%".format(correct / total * 100))
label_prob, pred_prob = labels_to_probs(test_labels, test_predictions)
print("Hellinger distance: {:.2f}".format(hellinger_distance(label_prob, pred_prob)))

total = 0
correct = 0
for i in range(len(test_labels)):
    if test_labels[i] == test_predictions_intent[i]:
        correct += 1
    total += 1
# Print accuracy percentage rounded to 2 decimal points
print("RF accuracy (test intent only): {:.2f}%".format(correct / total * 100))
label_prob, pred_prob = labels_to_probs(test_labels, test_predictions_intent)
print("Hellinger distance: {:.2f}".format(hellinger_distance(label_prob, pred_prob)))

total = 0
correct = 0
for i in range(len(test_labels)):
    if test_labels[i] == test_predictions_with_intent[i]:
        correct += 1
    total += 1
# Print accuracy percentage rounded to 2 decimal points
print("RF accuracy (text + intent): {:.2f}%".format(correct / total * 100))
label_prob, pred_prob = labels_to_probs(test_labels, test_predictions_with_intent)
print("Hellinger distance: {:.2f}".format(hellinger_distance(label_prob, pred_prob)))

total = 0
correct = 0
for i in range(len(test_labels)):
    if test_labels[i] == test_target_pred[i]:
        correct += 1
    total += 1
# Print accuracy percentage rounded to 2 decimal points
print("RF accuracy (target): {:.2f}%".format(correct / total * 100))
label_prob, pred_prob = labels_to_probs(test_labels, test_target_pred)
print("Hellinger distance: {:.2f}".format(hellinger_distance(label_prob, pred_prob)))

total = 0
correct = 0
for i in range(len(test_labels)):
    if test_labels[i] == test_target_with_intent[i]:
        correct += 1
    total += 1
# Print accuracy percentage rounded to 2 decimal points
print("RF accuracy (target + intent): {:.2f}%".format(correct / total * 100))
label_prob, pred_prob = labels_to_probs(test_labels, test_target_with_intent)
print("Hellinger distance: {:.2f}".format(hellinger_distance(label_prob, pred_prob)))

Random Forest accuracy: 80.20%
Hellinger distance: 0.12
RF accuracy (test text only): 23.68%
Hellinger distance: 0.59
RF accuracy (test intent only): 41.02%
Hellinger distance: 0.44
RF accuracy (text + intent): 25.54%
Hellinger distance: 0.60
RF accuracy (target): 31.66%
Hellinger distance: 0.47
RF accuracy (target + intent): 37.69%
Hellinger distance: 0.45


In [8]:
def predict_label(sentence, vectorizer, classifier, label_encoder):
    # Transform the sentence using the trained TF-IDF vectorizer
    features = vectorizer.transform([sentence])
    
    # Predict the class using the trained classifier
    prediction_encoded = classifier.predict(features)
    
    # Decode the prediction to the original label
    prediction = label_encoder.inverse_transform(prediction_encoded)
    
    return prediction[0]

# returns a dictionary of the probabilities of each label
def predict_probability(sentence, vectorizer, classifier, label_encoder):
    # Transform the sentence using the trained TF-IDF vectorizer
    features = vectorizer.transform([sentence])
    
    # Predict the class probabilities using the trained classifier
    probabilities = classifier.predict_proba(features)
    
    # Convert the probabilities to a dictionary keyed by the label names
    label_probabilities = dict(zip(label_encoder.classes_, probabilities[0]))
    
    return label_probabilities


In [9]:
# Example usage:
sentence = "Also, everyone roll perception and stealth"
predicted_label = predict_label(sentence, vectorizer, svm_clf, label_encoder)
print(f"The predicted label for the sentence is: {predicted_label}")

prob_dist = predict_probability(sentence, vectorizer, calibrated_svc, label_encoder)
print("The probabilities of each label are:")
for label, prob in prob_dist.items():
    print(f"{label}: {prob:.2f}")

The predicted label for the sentence is: stealth
The probabilities of each label are:
acrobatics: 0.00
arcana: 0.00
athletics: 0.00
charisma: 0.00
deception: 0.00
dexterity: 0.00
hand: 0.00
handling: 0.00
history: 0.00
insight: 0.00
intelligence: 0.00
intimidation: 0.00
investigation: 0.00
medicine: 0.00
nature: 0.00
perception: 0.47
performance: 0.00
persuasion: 0.00
religion: 0.00
stealth: 0.52
strength: 0.00
survival: 0.00
wisdom: 0.00
