# 670 Final Project: D&D Spell School Classification
* Group members:
    * Yufeng Song (yfsong)
    * Lan Xu (lanxu)


In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

from sentence_transformers import SentenceTransformer

# Load the spell table; the path is relative to the notebook's working directory.
df = pd.read_csv('data/spells.csv')

In [2]:
# Count the missing entries in each free-text column.
df.loc[:, ["name", "desc", "higher_level", "material"]].isnull().sum()

name              0
desc              0
higher_level    229
material        135
dtype: int64

In [3]:
# Free-text columns that are merged into the single string to be embedded.
text_cols = ["name", "desc", "higher_level", "material"]

# Replace missing text with empty strings in one shot so that the
# concatenation below never meets a NaN.
df[text_cols] = df[text_cols].fillna("")

# Assemble "<name>. <desc> <higher_level> <material>" for every spell.
df["full_text"] = (
    df["name"]
    .str.cat(df["desc"], sep=". ")
    .str.cat(df[["higher_level", "material"]], sep=" ")
)

In [4]:
# Map each spell's school label to an integer class id. The fitted encoder
# is kept because its `classes_` are reused as the report labels.
label_encoder = LabelEncoder().fit(df["school_index"])
y = label_encoder.transform(df["school_index"])

In [5]:
# Embed every spell's combined text with a pretrained sentence-transformer.
model = SentenceTransformer("all-MiniLM-L6-v2")

# `encode` is documented to take a string or a list of strings, so hand it a
# plain list. Integer lookups on a Series are label-based, which would make
# the result depend on `df` still having its default 0..n-1 index.
X_text_emb = model.encode(df["full_text"].tolist(), show_progress_bar=True)



Batches:   0%|          | 0/10 [00:00<?, ?it/s]

In [6]:
# The feature matrix is the sentence embeddings alone.
X = X_text_emb

# Hold out 20% of the spells for evaluation. Stratifying on `y` keeps the
# class proportions alike in both partitions; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

## Baselines

In [7]:
# One seed shared by every stochastic estimator so that a fresh
# "Restart & Run All" yields the same scores on every run.
RANDOM_STATE = 42

# Baseline classifiers, all trained on the same embedding features.
models = {
    # Linear models
    "Logistic Regression": LogisticRegression(max_iter=3000),
    "Linear SVM": SVC(kernel="linear"),

    # Kernel SVM family
    "RBF SVM": SVC(kernel="rbf", gamma="scale", C=1.0),
    "Polynomial SVM (degree=3)": SVC(kernel="poly", degree=3, C=1.0),

    # Trees: a single tree, a randomized ensemble, and a boosted ensemble
    "Decision Tree": DecisionTreeClassifier(max_depth=None, random_state=RANDOM_STATE),
    "Extra Trees": ExtraTreesClassifier(n_estimators=300, random_state=RANDOM_STATE),
    "Gradient Boosting": GradientBoostingClassifier(random_state=RANDOM_STATE),

    # Neighbors
    "KNN (k=5)": KNeighborsClassifier(n_neighbors=5),

    # Naive Bayes
    "Gaussian NB": GaussianNB(),

    # XGBoost baseline (row/column subsampling is random, hence the seed)
    "XGBoost": XGBClassifier(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.9,
        colsample_bytree=0.9,
        eval_metric="mlogloss",
        random_state=RANDOM_STATE,
    ),
}

# Macro scores per model, collected so the summary table is built from the
# actual run instead of being copied by hand.
baseline_scores = []

for name, clf in models.items():
    print("=" * 50)
    print(f"Training Model: {name}")

    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)

    # zero_division=0 scores a never-predicted class as 0 -- the same value
    # the default produces -- without emitting UndefinedMetricWarning.
    macro_precision = precision_score(y_test, preds, average="macro", zero_division=0)
    macro_recall = recall_score(y_test, preds, average="macro", zero_division=0)
    macro_f1 = f1_score(y_test, preds, average="macro", zero_division=0)

    print(f"Macro Precision = {macro_precision:.4f}")
    print(f"Macro Recall    = {macro_recall:.4f}")
    print(f"Macro F1-score  = {macro_f1:.4f}")

    print("\nPer-class performance:")
    print(
        classification_report(
            y_test,
            preds,
            target_names=label_encoder.classes_,
            zero_division=0,
        )
    )

    baseline_scores.append(
        {
            "model": name,
            "macro_precision": macro_precision,
            "macro_recall": macro_recall,
            "macro_f1": macro_f1,
        }
    )

# Display every baseline ranked by macro F1 (best first).
pd.DataFrame(baseline_scores).set_index("model").sort_values("macro_f1", ascending=False)


Training Model: Logistic Regression
Macro Precision = 0.7000
Macro Recall    = 0.5198
Macro F1-score  = 0.5621

Per-class performance:
               precision    recall  f1-score   support

   abjuration       0.60      0.38      0.46         8
  conjuration       0.62      0.50      0.56        10
   divination       1.00      0.33      0.50         6
  enchantment       0.67      0.67      0.67         6
    evocation       0.33      0.58      0.42        12
     illusion       1.00      0.80      0.89         5
   necromancy       1.00      0.40      0.57         5
transmutation       0.38      0.50      0.43        12

     accuracy                           0.52        64
    macro avg       0.70      0.52      0.56        64
 weighted avg       0.62      0.52      0.53        64

Training Model: Linear SVM
Macro Precision = 0.6916
Macro Recall    = 0.5958
Macro F1-score  = 0.6108

Per-class performance:
               precision    recall  f1-score   support

   abjuration       

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Macro Precision = 0.4270
Macro Recall    = 0.4292
Macro F1-score  = 0.4255

Per-class performance:
               precision    recall  f1-score   support

   abjuration       0.25      0.25      0.25         8
  conjuration       0.31      0.40      0.35        10
   divination       0.00      0.00      0.00         6
  enchantment       0.80      0.67      0.73         6
    evocation       0.31      0.42      0.36        12
     illusion       0.60      0.60      0.60         5
   necromancy       0.60      0.60      0.60         5
transmutation       0.55      0.50      0.52        12

     accuracy                           0.42        64
    macro avg       0.43      0.43      0.43        64
 weighted avg       0.41      0.42      0.41        64



### Interpretation
#### Summary Table (Macro F1)
| Model | Macro F1 |
|--------|-----------|
| **RBF SVM** | **0.6458** |
| **KNN (k=5)** | **0.6168** |
| **Linear SVM** | **0.6108** |
| Logistic Regression | 0.5621 |
| Extra Trees | 0.5717 |
| Gaussian NB | 0.5730 |
| Polynomial SVM (deg=3) | 0.5357 |
| Gradient Boosting | 0.3702 |
| XGBoost | 0.4255 |
| Decision Tree | 0.3178 |

#### Key Insights
1. **The RBF SVM outperforms all other baselines**, achieving the highest macro F1 (0.6458); the polynomial-kernel SVM (0.5357), by contrast, trails both linear models.
2. **KNN is surprisingly strong**, suggesting that spells cluster cleanly in embedding space.
3. **Linear SVM > Logistic Regression**, confirming mild linear separability.
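4. **Most tree-based models underperform** (Decision Tree, Gradient Boosting, XGBoost); dense embeddings are not well suited to axis-aligned splits. Extra Trees (0.5717) is the exception, scoring on par with Logistic Regression.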
5. Certain schools, especially **Divination and Transmutation**, remain challenging due to semantic overlap.

#### What This Means
- The embedding space captures semantic structure, but *not all schools are equally separable*.
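- An RBF decision boundary helps (macro F1 0.6458 vs. 0.6108 for the linear SVM), but not every nonlinear kernel does: the polynomial SVM scores lower than the linear one.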
- More advanced models must focus on better representing nuanced spell semantics.

#### Next Steps
1. **Tune the RBF SVM**  
   - Grid search `C` and `gamma`  
   - A tuned model may improve on the default-parameter score; this is not yet tested.

2. **Upgrade the embedding model**  
   - Try `all-mpnet-base-v2` or `bge-large-en-v1.5`
   - Larger embedding models may improve class separation; this is not yet tested.

3. **Add structured features**  
   Combine embeddings with:
   - spell level  
   - casting time category  
   - duration category  

4. **Try a simple neural classifier**  
   - MLP on top of embeddings (1–2 layers)

5. **Analyze confusion matrix**  
   - Understand which schools consistently overlap (likely: Evocation/Transmutation/Conjuration).
