In [None]:
import json
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

In [None]:
# Read syntax.json as UTF-8 text and parse the JSON payload into `data`.
with open('syntax.json', 'r', encoding='utf-8') as f:
    data = json.loads(f.read())

In [None]:
# Partition the loaded items by their 'source' field. Items whose source is
# neither 'llm' nor 'cfg' end up in neither list.
llm_data = [entry for entry in data if entry['source'] == 'llm']
cfg_data = [entry for entry in data if entry['source'] == 'cfg']

In [None]:
def visualize_vectors(data, e_vectors=True, perplexity=15, random_state=42):
    """Embed one vector per item in 2-D with t-SNE and scatter-plot it by category.

    Parameters
    ----------
    data : iterable of mapping
        Each item must provide ``item['best_vectors']`` (an indexable holding
        at least two vectors) and ``item['category']`` (used as the legend
        label; categories must be hashable and mutually sortable).
    e_vectors : bool, default True
        When true, ``best_vectors[0]`` is plotted and the figure is titled
        "t-SNE of e-token vectors"; otherwise ``best_vectors[1]`` is plotted
        and the figure is titled "t-SNE of m-token vectors".
    perplexity : float, default 15
        Requested t-SNE perplexity. scikit-learn requires it to be strictly
        smaller than the number of samples, so it is lowered to
        ``n_samples - 1`` when the input is too small for the requested value.
    random_state : int, default 42
        Seed forwarded to ``TSNE`` so repeated runs give the same layout.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.

    Raises
    ------
    ValueError
        If ``data`` yields fewer than two items (nothing meaningful to embed).
    KeyError
        If an item lacks ``'best_vectors'`` or ``'category'``.
    """
    vector_index = 0 if e_vectors else 1

    vectors = []
    labels = []
    for item in data:
        vectors.append(item['best_vectors'][vector_index])
        labels.append(item['category'])

    n_samples = len(vectors)
    if n_samples < 2:
        # Fail early with a readable message instead of an array-shape error
        # raised from inside scikit-learn.
        raise ValueError(
            f"visualize_vectors needs at least 2 items to run t-SNE, got {n_samples}"
        )

    X_sample = np.array(vectors)

    # TSNE rejects perplexity >= n_samples; clamp so small subsets still plot.
    effective_perplexity = min(perplexity, n_samples - 1)
    X_tsne = TSNE(
        n_components=2,
        perplexity=effective_perplexity,
        metric='cosine',
        random_state=random_state,
    ).fit_transform(X_sample)

    # Collect the row indices of every category in a single pass.
    indices_by_label = {}
    for row, label in enumerate(labels):
        indices_by_label.setdefault(label, []).append(row)

    fig, ax = plt.subplots(figsize=(10, 8))
    # Sorted order keeps legend entries (and the colour cycle) stable across runs.
    for label in sorted(indices_by_label):
        rows = indices_by_label[label]
        ax.scatter(X_tsne[rows, 0], X_tsne[rows, 1], label=label, s=40, edgecolors='k')

    title = 't-SNE of e-token vectors' if e_vectors else 't-SNE of m-token vectors'
    ax.set_title(title)
    ax.legend()
    plt.show()

In [None]:
# e-token vectors of the items whose source is 'llm'.
visualize_vectors(llm_data, e_vectors=True)

In [None]:
# m-token vectors of the items whose source is 'llm'.
visualize_vectors(llm_data, e_vectors=False)

In [None]:
# e-token vectors of the items whose source is 'cfg'.
visualize_vectors(cfg_data, e_vectors=True)

In [None]:
# m-token vectors of the items whose source is 'cfg'.
visualize_vectors(cfg_data, e_vectors=False)

In [None]:
# e-token vectors of every loaded item, regardless of source.
visualize_vectors(data, e_vectors=True)

In [None]:
# m-token vectors of every loaded item, regardless of source.
visualize_vectors(data, e_vectors=False)