In [15]:
import json

from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC

In [16]:
# Load the jsonl data
def load_jsonl(input_path):
    """Read a JSON Lines file and return its records as a list.

    Each non-blank line of the file is parsed with ``json.loads``. Lines that
    are empty or contain only whitespace (for example a trailing newline at
    the end of the file) are skipped instead of raising a decode error.

    Args:
        input_path: Path of the UTF-8 encoded ``.jsonl`` file to read.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        # Iterate the file lazily rather than buffering every line first.
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Blank line: nothing to parse.
                continue
            data.append(json.loads(stripped))
    return data

# Each JSON object is read through its 'input' key (the text) and its
# 'target' key (the label).

# Training split
train_data = load_jsonl('data/train.jsonl')
train_texts = [record['input'] for record in train_data]
train_labels = [record['target'] for record in train_data]

# Validation split
val_data = load_jsonl('data/valid.jsonl')
val_texts = [record['input'] for record in val_data]
val_labels = [record['target'] for record in val_data]

In [17]:
# TF-IDF features: the vectorizer is fitted on the training texts only and
# then applied unchanged to the validation texts.
vectorizer = TfidfVectorizer(max_features=5000)  # Adjust the max_features and other parameters as needed
train_features = vectorizer.fit_transform(train_texts)
val_features = vectorizer.transform(val_texts)

# Integer targets: the encoder is fitted on the training labels and reused
# to encode the validation labels.
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_labels)
val_labels_encoded = label_encoder.transform(val_labels)

# Logistic Regression

In [18]:
# Train a Logistic Regression classifier on the TF-IDF training features
lr_clf = LogisticRegression(max_iter=10000)  # Adjust the max_iter and other parameters as needed
lr_clf.fit(train_features, train_labels_encoded)

# Predict on validation set
val_predictions_encoded = lr_clf.predict(val_features)

# Decode the predictions back to original labels
val_predictions = label_encoder.inverse_transform(val_predictions_encoded)

# Fraction of validation examples whose predicted class equals the true class.
# Comparing the encoded labels is equivalent to comparing the decoded strings.
lr_accuracy = accuracy_score(val_labels_encoded, val_predictions_encoded)

# Print accuracy percentage rounded to 2 decimal points
print("Logistic regression accuracy: {:.2f}%".format(lr_accuracy * 100))

Logistic regression accuracy: 45.86%


# Naive Bayes

In [19]:
# Train a Multinomial Naive Bayes classifier on the TF-IDF training features
nb_clf = MultinomialNB()
nb_clf.fit(train_features, train_labels_encoded)

# Predict on validation set
val_predictions_encoded = nb_clf.predict(val_features)

# Decode the predictions back to original labels
val_predictions = label_encoder.inverse_transform(val_predictions_encoded)

# Fraction of validation examples whose predicted class equals the true class.
# Comparing the encoded labels is equivalent to comparing the decoded strings.
nb_accuracy = accuracy_score(val_labels_encoded, val_predictions_encoded)

# Print accuracy percentage rounded to 2 decimal points
print("Naive Bayes accuracy: {:.2f}%".format(nb_accuracy * 100))

Naive Bayes accuracy: 28.89%


# SVM

In [20]:
# Train a Linear SVM classifier
svm_clf = LinearSVC(max_iter=10000, random_state=42)  # You can adjust the max_iter and other parameters as needed
svm_clf.fit(train_features, train_labels_encoded)

# Fit a calibrated wrapper around the SVM (5-fold) on the same training data;
# it is the model later used when class probabilities are requested.
calibrated_svc = CalibratedClassifierCV(svm_clf, cv=5)
calibrated_svc.fit(train_features, train_labels_encoded)

# Predict on validation set with both the plain and the calibrated model
val_predictions_encoded = svm_clf.predict(val_features)
calibrated_svc_predictions_encoded = calibrated_svc.predict(val_features)

# Decode the predictions back to original labels
val_predictions = label_encoder.inverse_transform(val_predictions_encoded)
calibrated_svc_predictions = label_encoder.inverse_transform(calibrated_svc_predictions_encoded)

# Fraction of validation examples whose predicted class equals the true class.
# Comparing the encoded labels is equivalent to comparing the decoded strings.
svm_accuracy = accuracy_score(val_labels_encoded, val_predictions_encoded)
calibrated_svm_accuracy = accuracy_score(val_labels_encoded, calibrated_svc_predictions_encoded)

# Print accuracy percentages rounded to 2 decimal points
print("SVM accuracy: {:.2f}%".format(svm_accuracy * 100))
print("Calibrated SVM accuracy: {:.2f}%".format(calibrated_svm_accuracy * 100))

SVM accuracy: 84.65%
Calibrated SVM accuracy: 83.64%


# Random Forest

In [21]:
# Train a Random Forest classifier on the TF-IDF training features
rf_clf = RandomForestClassifier(n_estimators=300, random_state=42)  # Adjust the n_estimators and other parameters as needed
rf_clf.fit(train_features, train_labels_encoded)

# Predict on validation set
val_predictions_encoded = rf_clf.predict(val_features)

# Decode the predictions back to original labels
val_predictions = label_encoder.inverse_transform(val_predictions_encoded)

# Fraction of validation examples whose predicted class equals the true class.
# Comparing the encoded labels is equivalent to comparing the decoded strings.
rf_accuracy = accuracy_score(val_labels_encoded, val_predictions_encoded)

# Print accuracy percentage rounded to 2 decimal points
print("Random Forest accuracy: {:.2f}%".format(rf_accuracy * 100))

Random Forest accuracy: 78.99%


In [22]:
def predict_label(sentence, vectorizer, classifier, label_encoder):
    """Return the decoded label predicted for a single sentence.

    Args:
        sentence: The text to classify.
        vectorizer: Fitted object whose ``transform`` turns a list of texts
            into features.
        classifier: Fitted object whose ``predict`` returns encoded labels.
        label_encoder: Object whose ``inverse_transform`` maps encoded labels
            back to the original labels.

    Returns:
        The first (and only) decoded prediction.
    """
    # The vectorizer works on a batch, so wrap the sentence in a one-item list.
    encoded = classifier.predict(vectorizer.transform([sentence]))
    decoded = label_encoder.inverse_transform(encoded)
    return decoded[0]

# returns a dictionary of the probabilities of each label
def predict_probability(sentence, vectorizer, classifier, label_encoder):
    """Return the predicted probability of each label for a single sentence.

    Args:
        sentence: The text to classify.
        vectorizer: Fitted object whose ``transform`` turns a list of texts
            into features.
        classifier: Fitted object exposing ``predict_proba``; it is expected
            to have been trained on labels encoded by ``label_encoder``.
        label_encoder: Object providing ``classes_`` and ``inverse_transform``
            to map encoded labels back to label names.

    Returns:
        A dict mapping each label name to its predicted probability.
    """
    # Transform the sentence using the trained TF-IDF vectorizer
    features = vectorizer.transform([sentence])

    # Predict the class probabilities using the trained classifier
    probabilities = classifier.predict_proba(features)

    # The probability columns follow the classifier's own ``classes_`` order.
    # Decode those rather than assuming they match the encoder's full class
    # list; the two differ if the classifier saw only a subset of the classes.
    encoded_classes = getattr(classifier, "classes_", None)
    if encoded_classes is None:
        # No class list on the classifier: fall back to the encoder's order.
        label_names = label_encoder.classes_
    else:
        label_names = label_encoder.inverse_transform(encoded_classes)

    # Convert the probabilities to a dictionary keyed by the label names
    label_probabilities = dict(zip(label_names, probabilities[0]))

    return label_probabilities


In [24]:
# Example usage: classify one sentence and show the per-label probabilities.
sentence = "Also, everyone roll perception and stealth"

# The hard label comes from the plain SVM; the probabilities come from the
# calibrated wrapper, which is the model queried through predict_proba here.
predicted_label = predict_label(sentence, vectorizer, svm_clf, label_encoder)
prob_dist = predict_probability(sentence, vectorizer, calibrated_svc, label_encoder)

print(f"The predicted label for the sentence is: {predicted_label}")

print("The probabilities of each label are:")
for label, prob in prob_dist.items():
    print(f"{label}: {prob:.2f}")

The predicted label for the sentence is: stealth
The probabilities of each label are:
acrobatics: 0.00
arcana: 0.00
athletics: 0.00
charisma: 0.00
deception: 0.00
dexterity: 0.00
hand: 0.00
handling: 0.00
history: 0.00
insight: 0.00
intelligence: 0.00
intimidation: 0.00
investigation: 0.00
medicine: 0.00
nature: 0.00
perception: 0.47
performance: 0.00
persuasion: 0.00
religion: 0.00
stealth: 0.52
strength: 0.00
survival: 0.00
wisdom: 0.00
