In [1]:
import spacy
import json
import pandas as pd
from collections import defaultdict


In [4]:

nlp = spacy.load("en_core_web_sm")


logical_connectors = {
    "because", "so", "therefore", "however", "thus", "but", "although",
    "though", "moreover", "meanwhile", "consequently", "nevertheless",
    "since", "as", "nonetheless", "then", "hence"
}
symbol_tokens = {"Step", ":", ".", ",", "(", ")", "[", "]", "{", "}", "<", ">", "!", "?"}
symbol_tokens.update({str(i) for i in range(10)})


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:

with open("/content/drive/MyDrive/Cluster-proj/output/DeepSeek-7B-sample71-80_with_all_steps.json", "r") as f:
    data = json.load(f)

In [10]:

def analyze_token_clusters(step_data):
    sentence = " ".join([t["token"] for t in step_data])
    # print(sentence)
    doc = nlp(sentence)

    cluster_map = defaultdict(list)
    original_tokens = [t["token"] for t in step_data]
    original_probs = [t["prob"] for t in step_data]

    token_idx = 0
    for token in doc:
        if token_idx >= len(original_tokens):
            break
        if token.text != original_tokens[token_idx]:
            token_idx += 1
            continue
        prob = original_probs[token_idx]
        token_lower = token.text.lower()
        if token.text in symbol_tokens:
            cluster = "Symbol"
        elif token_lower in logical_connectors:
            cluster = "LogicalConnector"
        elif token.ent_type_:
            cluster = f"Entity:{token.ent_type_}"
        elif token.dep_:
            cluster = f"Syntactic:{token.dep_}"
        else:
            cluster = "Other"
        cluster_map[cluster].append({"token": token.text, "prob": prob})
        token_idx += 1

    cluster_confidence = {
        cluster: {
            "total_prob": sum(t["prob"] for t in tokens),
            "avg_prob": sum(t["prob"] for t in tokens) / len(tokens),
            "tokens": tokens,
            "top_token": max(tokens, key=lambda x: x["prob"])
        }
        for cluster, tokens in cluster_map.items()
    }
    return cluster_confidence



In [15]:
def analyze_all_steps_cluster(data, example_id, mode="sampling_step_token_probs"):
    all_step_results = []
    for step_id, step_data in data[example_id][mode].items():
        cluster_result = analyze_token_clusters(step_data)
        for cluster, info in cluster_result.items():
            all_step_results.append({
                "Step": step_id,
                "Cluster": cluster,
                "Total_Prob": info["total_prob"],
                "Avg_Prob": info["avg_prob"],
                "Top_Token": info["top_token"]["token"],
                "Top_Prob": info["top_token"]["prob"],
                "Token_Count": len(info["tokens"])
            })

    return pd.DataFrame(all_step_results)


In [20]:
data.keys()

dict_keys(['5abf037a5542993fe9a41dbe', '5ab3ed12554299753aec59f3', '5abcfc3b554299114383a1ad', '5a74f2ff5542993748c89748', '5ac52b495542994611c8b3de', '5a72b1c25542992359bc3172', '5a72a00d5542991f9a20c53c', '5a8fa73e5542992414482b22', '5a8dfbeb5542995085b3736e', '5ab39701554299233954ff5e'])

In [25]:
example_id = "5a74f2ff5542993748c89748"
df = analyze_all_steps_cluster(data, example_id, mode="sampling_step_token_probs")
df_sorted = df.sort_values(by=["Step", "Total_Prob"], ascending=[True, False])


: Identify the relevant information about George Washington University Hospital . It is located in Washington , D .C ., as stated in its context .
: Look at Med Star Washington Hospital Center 's context , which also mentions that it is located in Washington , D .C .
: Confirm that both institutions are situated in the same city , Washington , D .C ., based on their respective contexts . Answer : Both the George Washington University Hospital and Med Star Washington Hospital Center are located in Washington , D .C . <｜end▁of▁sentence｜>


In [26]:
df_sorted

Unnamed: 0,Step,Cluster,Total_Prob,Avg_Prob,Top_Token,Top_Prob,Token_Count
6,1,Entity:ORG,3.823491,0.955873,Washington,1.0,4
0,1,Symbol,2.170114,0.723371,:,1.0,3
1,1,Syntactic:ROOT,1.722987,0.861493,located,0.945687,2
5,1,Syntactic:prep,1.704857,0.852429,in,1.0,2
2,1,Syntactic:det,1.0,1.0,the,1.0,1
4,1,Syntactic:dobj,1.0,1.0,information,1.0,1
8,1,Syntactic:auxpass,1.0,1.0,is,1.0,1
9,1,Entity:GPE,1.0,1.0,Washington,1.0,1
10,1,Syntactic:compound,1.0,1.0,D,1.0,1
11,1,Syntactic:punct,1.0,1.0,.C,1.0,1


In [27]:
df_sorted[df_sorted["Cluster"].str.startswith("Entity")]


Unnamed: 0,Step,Cluster,Total_Prob,Avg_Prob,Top_Token,Top_Prob,Token_Count
6,1,Entity:ORG,3.823491,0.955873,Washington,1.0,4
9,1,Entity:GPE,1.0,1.0,Washington,1.0,1
15,2,Entity:ORG,5.133513,0.855585,Star,1.0,6
24,2,Entity:GPE,1.0,1.0,Washington,1.0,1
37,3,Entity:GPE,0.872099,0.872099,Washington,0.872099,1
