# 670 Final Project: D&D Spell School Classification
* Group members:
    * Yufeng Song (yfsong)
    * Lan Xu (lanxu)


In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

from sentence_transformers import SentenceTransformer

df = pd.read_csv('data/spells.csv')



In [2]:
df[["name", "desc", "higher_level", "material"]].isna().sum()

name              0
desc              0
higher_level    229
material        135
dtype: int64

In [3]:
text_cols = ["name", "desc", "higher_level", "material"]
for col in text_cols:
    df[col] = df[col].fillna("")

df["full_text"] = (
    df["name"] + ". " +
    df["desc"] + " " +
    df["higher_level"] + " " +
    df["material"]
)

In [4]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df["school_index"])

In [5]:
model = SentenceTransformer("all-MiniLM-L6-v2")
X_text_emb = model.encode(df["full_text"], show_progress_bar=True)

Batches:   0%|          | 0/10 [00:00<?, ?it/s]

In [6]:
X = X_text_emb

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

## Baselines

In [7]:
models = {
    # linear models
    "Logistic Regression": LogisticRegression(max_iter=3000),
    "Linear SVM": SVC(kernel="linear"),

    # kernel SVM family
    "RBF SVM": SVC(kernel="rbf", gamma="scale", C=1.0),
    "Polynomial SVM (degree=3)": SVC(kernel="poly", degree=3, C=1.0),

    # Trees (non-bagging and bagging variants)
    "Decision Tree": DecisionTreeClassifier(max_depth=None),
    "Extra Trees": ExtraTreesClassifier(n_estimators=300),
    "Gradient Boosting": GradientBoostingClassifier(),

    # Neighbors
    "KNN (k=5)": KNeighborsClassifier(n_neighbors=5),

    # Naive Bayes
    "Gaussian NB": GaussianNB(),

    # XGBoost baseline
    "XGBoost": XGBClassifier(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.9,
        colsample_bytree=0.9,
        eval_metric="mlogloss"
    )
}

for name, clf in models.items():
    print("=" * 50)
    print(f"Training Model: {name}")
    
    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    
    macro_precision = precision_score(y_test, preds, average="macro")
    macro_recall = recall_score(y_test, preds, average="macro")
    macro_f1 = f1_score(y_test, preds, average="macro")

    print(f"Macro Precision = {macro_precision:.4f}")
    print(f"Macro Recall    = {macro_recall:.4f}")
    print(f"Macro F1-score  = {macro_f1:.4f}")
    
    print("\nPer-class performance:")
    print(classification_report(y_test, preds, target_names=label_encoder.classes_))


Training Model: Logistic Regression
Macro Precision = 0.7000
Macro Recall    = 0.5198
Macro F1-score  = 0.5621

Per-class performance:
               precision    recall  f1-score   support

   abjuration       0.60      0.38      0.46         8
  conjuration       0.62      0.50      0.56        10
   divination       1.00      0.33      0.50         6
  enchantment       0.67      0.67      0.67         6
    evocation       0.33      0.58      0.42        12
     illusion       1.00      0.80      0.89         5
   necromancy       1.00      0.40      0.57         5
transmutation       0.38      0.50      0.43        12

     accuracy                           0.52        64
    macro avg       0.70      0.52      0.56        64
 weighted avg       0.62      0.52      0.53        64

Training Model: Linear SVM
Macro Precision = 0.6916
Macro Recall    = 0.5958
Macro F1-score  = 0.6108

Per-class performance:
               precision    recall  f1-score   support

   abjuration       

  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous
  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous
  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous
  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
  ret = a @ b
  ret = a @ b
  ret = a @ b


Macro Precision = 0.7356
Macro Recall    = 0.5583
Macro F1-score  = 0.5971

Per-class performance:
               precision    recall  f1-score   support

   abjuration       0.57      0.50      0.53         8
  conjuration       0.67      0.60      0.63        10
   divination       1.00      0.33      0.50         6
  enchantment       0.75      0.50      0.60         6
    evocation       0.43      0.83      0.57        12
     illusion       1.00      0.80      0.89         5
   necromancy       1.00      0.40      0.57         5
transmutation       0.46      0.50      0.48        12

     accuracy                           0.58        64
    macro avg       0.74      0.56      0.60        64
 weighted avg       0.66      0.58      0.58        64

Training Model: Gradient Boosting
Macro Precision = 0.3821
Macro Recall    = 0.3688
Macro F1-score  = 0.3675

Per-class performance:
               precision    recall  f1-score   support

   abjuration       0.33      0.25      0.29     

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Macro Precision = 0.4270
Macro Recall    = 0.4292
Macro F1-score  = 0.4255

Per-class performance:
               precision    recall  f1-score   support

   abjuration       0.25      0.25      0.25         8
  conjuration       0.31      0.40      0.35        10
   divination       0.00      0.00      0.00         6
  enchantment       0.80      0.67      0.73         6
    evocation       0.31      0.42      0.36        12
     illusion       0.60      0.60      0.60         5
   necromancy       0.60      0.60      0.60         5
transmutation       0.55      0.50      0.52        12

     accuracy                           0.42        64
    macro avg       0.43      0.43      0.43        64
 weighted avg       0.41      0.42      0.41        64



### Interpretation
#### Summary Table (Macro F1)
| Model | Macro F1 |
|--------|-----------|
| **RBF SVM** | **0.6458** |
| **KNN (k=5)** | **0.6168** |
| **Linear SVM** | **0.6108** |
| Logistic Regression | 0.5621 |
| Extra Trees | 0.5717 |
| Gaussian NB | 0.5730 |
| Polynomial SVM (deg=3) | 0.5357 |
| Gradient Boosting | 0.3702 |
| XGBoost | 0.4255 |
| Decision Tree | 0.3178 |

#### Key Insights
1. **Kernel SVMs outperform all other baselines**, with RBF SVM achieving the highest macro F1.
2. **KNN is surprisingly strong**, suggesting that spells cluster cleanly in embedding space.
3. **Linear SVM > Logistic Regression**, confirming mild linear separability.
4. **Tree-based models consistently underperform**; dense embeddings are not well suited for axis-aligned splits.
5. Certain schools, especially **Divination and Transmutation**, remain challenging due to semantic overlap.

#### What This Means
- The embedding space captures semantic structure, but *not all schools are equally separable*.
- Nonlinear decision boundaries (RBF SVM) help significantly.
- More advanced models must focus on better representing nuanced spell semantics.

#### Next Steps
1. **Tune the RBF SVM**  
   - Grid search `C` and `gamma`  
   - Expect significant improvement.

2. **Upgrade the embedding model**  
   - Try `all-mpnet-base-v2` or `bge-large-en-v1.5`
   - Strongly boosts class separation.

3. **Add structured features**  
   Combine embeddings with:
   - spell level  
   - casting time category  
   - duration category  

4. **Try a simple neural classifier**  
   - MLP on top of embeddings (1–2 layers)

5. **Analyze confusion matrix**  
   - Understand which schools consistently overlap (likely: Evocation/Transmutation/Conjuration).


In [8]:
### 1. Tune the RBF SVM
from sklearn.model_selection import GridSearchCV
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 0.1, 1, 'scale', 'auto']
}

rbf_svm = SVC(kernel='rbf')

# Grid search with 5-fold CV
grid_search = GridSearchCV(
    estimator=rbf_svm,
    param_grid=param_grid,
    cv=5,
    scoring='f1_macro',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best CV macro F1:", grid_search.best_score_)

# Evaluate on test set
best_rbf_svm = grid_search.best_estimator_
preds = best_rbf_svm.predict(X_test)

macro_precision = precision_score(y_test, preds, average="macro")
macro_recall = recall_score(y_test, preds, average="macro")
macro_f1 = f1_score(y_test, preds, average="macro")

print(f"Test Macro Precision = {macro_precision:.4f}")
print(f"Test Macro Recall    = {macro_recall:.4f}")
print(f"Test Macro F1-score  = {macro_f1:.4f}")

print("\nPer-class performance:")
print(classification_report(y_test, preds, target_names=label_encoder.classes_))

Fitting 5 folds for each of 24 candidates, totalling 120 fits


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Best parameters: {'C': 10, 'gamma': 1}
Best CV macro F1: 0.6407292517069236
Test Macro Precision = 0.6771
Test Macro Recall    = 0.6219
Test Macro F1-score  = 0.6394

Per-class performance:
               precision    recall  f1-score   support

   abjuration       0.71      0.62      0.67         8
  conjuration       0.78      0.70      0.74        10
   divination       0.67      0.33      0.44         6
  enchantment       0.83      0.83      0.83         6
    evocation       0.50      0.58      0.54        12
     illusion       0.80      0.80      0.80         5
   necromancy       0.75      0.60      0.67         5
transmutation       0.38      0.50      0.43        12

     accuracy                           0.61        64
    macro avg       0.68      0.62      0.64        64
 weighted avg       0.64      0.61      0.61        64



In [None]:
### 2. Upgrade the embedding model —— all-mpnet-base-v2 & bge-large-en-v1.5

from sklearn.svm import SVC
from sklearn.metrics import f1_score, classification_report
import pandas as pd

embedding_models = {
    "all-mpnet-base-v2": 768,
    "BAAI/bge-large-en-v1.5": 1024
}

results = []

for model_name, expected_dim in embedding_models.items():
    print("\n" + "=" * 60)
    print(f"Loading & encoding with → {model_name}")
    print("=" * 60)

    model = SentenceTransformer(model_name)
    X_emb = model.encode(
        df["full_text"].tolist(),
        batch_size=32 if "bge-large" in model_name else 64,
        show_progress_bar=True,
        normalize_embeddings=True
    )

    print(f"Embedding shape: {X_emb.shape}")

    X_train_tmp, X_test_tmp, y_train_tmp, y_test_tmp = train_test_split(
        X_emb, y, test_size=0.2, random_state=42, stratify=y
    )

    svm = SVC(kernel='rbf', random_state=42)
    svm.fit(X_train_tmp, y_train_tmp)
    preds = svm.predict(X_test_tmp)

    macro_f1 = f1_score(y_test_tmp, preds, average='macro')

    results.append({
        "Model": model_name,
        "Dim": expected_dim,
        "Macro F1": macro_f1,
        "Accuracy": (preds == y_test_tmp).mean()
    })

    print(f"Macro F1 = {macro_f1:.4f}   |   Accuracy = {(preds == y_test_tmp).mean():.4f}")
    print("\nPer-class report:")
    print(classification_report(y_test_tmp, preds, target_names=label_encoder.classes_, digits=3))

print("SUMMARY: Embedding Model Comparison")
summary_df = pd.DataFrame(results)
summary_df = summary_df.sort_values("Macro F1", ascending=False).reset_index(drop=True)
summary_df.index += 1
print(summary_df.round(4).to_string(index=True))

# Save the current best embedding as X_upgraded (defaulting to the highest one from Macro F1).
best_model_name = summary_df.iloc[0]["Model"]
print(f"\nBest embedding model → {best_model_name}")

best_model = SentenceTransformer(best_model_name)
X_upgraded = best_model.encode(
    df["full_text"].tolist(),
    batch_size=32 if "bge-large" in best_model_name else 64,
    show_progress_bar=True,
    normalize_embeddings=True
)

X_train_up, X_test_up, y_train_up, y_test_up = train_test_split(
    X_upgraded, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nX_upgraded: {X_upgraded.shape[1]}")


Loading & encoding with → all-mpnet-base-v2


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
### 3. Add structured features

df['casting_time_cat'] = df['casting_time'].str.lower().str.extract(r'(\d+ \w+|\w+)')[0]
df['casting_time_cat'] = df['casting_time_cat'].fillna('unknown').astype('category')

df['duration_cat'] = df['duration'].str.lower()
df['duration_cat'] = df['duration_cat'].str.replace('concentration, up to ', 'concentration ')
df['duration_cat'] = df['duration_cat'].str.extract(r'(\w+ \w+|\w+)')[0]
df['duration_cat'] = df['duration_cat'].fillna('unknown').astype('category')

from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
struct_features = encoder.fit_transform(df[['casting_time_cat', 'duration_cat']])

scaler = StandardScaler()
level_scaled = scaler.fit_transform(df[['level']])

X_struct = np.hstack([level_scaled, struct_features])

X_combined = np.hstack([X_upgraded, X_struct])

X_train_comb, X_test_comb, y_train_comb, y_test_comb = train_test_split(
    X_combined, y, test_size=0.2, random_state=42, stratify=y
)

# Test the improvement on RBF SVM
rbf_svm_comb = SVC(kernel='rbf')
rbf_svm_comb.fit(X_train_comb, y_train_comb)
preds_comb = rbf_svm_comb.predict(X_test_comb)

macro_f1_comb = f1_score(y_test_comb, preds_comb, average="macro")
print(f"Combined Features Macro F1 (RBF SVM): {macro_f1_comb:.4f}")

print("\nPer-class performance:")
print(classification_report(y_test_comb, preds_comb, target_names=label_encoder.classes_))

In [None]:
### 4. Try a simple neural classifier
from sklearn.neural_network import MLPClassifier

# 定义 MLP：2 层隐藏，100 和 50 神经元
mlp = MLPClassifier(
    hidden_layer_sizes=(100, 50),
    activation='relu',
    solver='adam',
    max_iter=1000,
    random_state=42
)

mlp.fit(X_train_comb, y_train_comb)
preds_mlp = mlp.predict(X_test_comb)

macro_f1_mlp = f1_score(y_test_comb, preds_mlp, average="macro")
print(f"MLP Macro F1: {macro_f1_mlp:.4f}")

print("\nPer-class performance:")
print(classification_report(y_test_comb, preds_mlp, target_names=label_encoder.classes_))

In [None]:
### 5. Analyze confusion matrix

import seaborn as sns
import matplotlib.pyplot as plt
cm = confusion_matrix(y_test_comb, preds_mlp)

plt.figure(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_
)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

print("\nTop misclassifications:")
for i in range(len(label_encoder.classes_)):
    for j in range(len(label_encoder.classes_)):
        if i != j and cm[i, j] > 0:
            print(f"{label_encoder.classes_[i]} misclassified as {label_encoder.classes_[j]}: {cm[i, j]} times")

overlap_schools = ['evocation', 'transmutation', 'conjuration']
overlap_indices = [np.where(label_encoder.classes_ == school)[0][0] for school in overlap_schools]

print("\nOverlap among Evocation/Transmutation/Conjuration:")
sub_cm = cm[np.ix_(overlap_indices, overlap_indices)]
sns.heatmap(
    sub_cm,
    annot=True,
    fmt='d',
    cmap='Reds',
    xticklabels=overlap_schools,
    yticklabels=overlap_schools
)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Subset Confusion Matrix')
plt.show()