# Qualitative topic evaluations
This notebook is used for formatting the data for qualitative evaluations. This includes: 
- Inspecting the topic words
- Inspecting representative documents
- Coming up with good "titles" for the topics

Notes on TweetEval: 
- 10 topics may be too many :((
- Difficult to classify? 

In [None]:
import pickle
import numpy as np
import pandas as pd
import re
from pprint import pprint
from typing import Dict, Tuple, List, Union
from bertopic import BERTopic
from pathlib import Path

In [None]:
def read_pickle(file_path):
    """Load and return the object stored in the pickle file at ``file_path``.

    The file is opened in binary mode and closed again before returning.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    from a trusted source.
    """
    with open(file_path, mode="rb") as pickle_file:
        payload = pickle.load(pickle_file)
    return payload

def get_model_name(model_path: Path) -> str:
    """Extract the leading ``<word>-<word>-<digits>`` part of a model file name.

    Only ``model_path.name`` (the final path component) is inspected, and the
    pattern must match at the very start of that name.

    Args:
        model_path: path whose file name starts with the model identifier.

    Returns:
        The matched prefix, e.g. ``"foo-bar-10"`` for ``"foo-bar-10_x.bin"``.

    Raises:
        ValueError: if the file name does not start with the expected pattern.
    """
    # Raw string: "\w" and "\d" are invalid escape sequences in a normal literal
    # and trigger a warning on recent Python versions.
    match = re.match(r"\w+-\w+-\d+", model_path.name)
    if match is None:
        raise ValueError(
            f"Cannot derive a model name from {model_path.name!r}: "
            "expected it to start with '<word>-<word>-<digits>'"
        )
    return match.group()

def create_topic_dict(raw_topic_dict: Dict[int, List[Tuple[str, float]]]) -> Dict[int, List[str]]:
    """Reduce a ``topic id -> [(word, ...), ...]`` mapping to ``topic id -> [word, ...]``.

    Args:
        raw_topic_dict: maps each topic id to a list of tuples whose first element
            is a topic word. Presumably the second element is the word's weight, as
            in BERTopic's ``get_topics`` output — TODO confirm.

    Returns:
        A new dict holding, for every topic id except ``-1``, the list of words in
        their original order. Topic ``-1`` is dropped (presumably the outlier topic
        — TODO confirm).
    """
    return {
        topic_id: [entry[0] for entry in entries]
        for topic_id, entries in raw_topic_dict.items()
        if topic_id != -1
    }

def latest_full_topics(dr: Path) -> Path:
    """Return the last ``*full_doc_topics_*.csv`` file in ``dr``, in sorted path order.

    Directory listings come back in arbitrary, OS-dependent order, so the matches
    are sorted before the last one is taken; otherwise "latest" would depend on the
    file system.

    NOTE(review): this assumes the file-name part after ``full_doc_topics_`` sorts
    chronologically (e.g. a date or timestamp) — confirm.

    Args:
        dr: directory to search (non-recursive).

    Returns:
        Path of the matching file that sorts last.

    Raises:
        FileNotFoundError: if no file in ``dr`` matches the pattern.
    """
    candidates = sorted(dr.glob("*full_doc_topics_*.csv"))
    if not candidates:
        raise FileNotFoundError(f"No '*full_doc_topics_*.csv' file found in {dr}")
    return candidates[-1]

    

In [None]:
# Directory the cells below read from: the full_doc_topics CSV, the topic-word
# pickle and the tweet-text CSV.
DATA_DIR = Path("../data")
# Relative path into ExplainlpTwitter/output; not referenced by the cells visible
# here. Presumably where the saved models live — TODO confirm.
MODEL_DIR = Path("../../ExplainlpTwitter/output")


In [None]:
# NOTE(review): model_path is an empty string — presumably a placeholder that must
# be set to a saved BERTopic model (e.g. a file under MODEL_DIR) before this cell
# can run. `model` is not used by the cells visible below.
model_path = ""
model = BERTopic.load(model_path)

In [None]:

# Topic words keyed by topic id. NOTE: pickle can execute arbitrary code on load,
# so this file must come from a trusted source (here: the local data directory).
topic_words = read_pickle(DATA_DIR / "tweeteval_topic_dict.pkl")

# Only the "text" column of the tweet file is needed.
clean_tweets = pd.read_csv(DATA_DIR / "tweeteval_text.csv", usecols=["text"])

# Read the latest topic-assignment CSV (first column as index) and attach the tweet
# text by row index. Built in one chain so `doc_topics` is assigned exactly once.
# merge defaults to an inner join: index values missing from either frame are dropped.
doc_topics = pd.read_csv(latest_full_topics(DATA_DIR), index_col=0).merge(
    clean_tweets, left_index=True, right_index=True
)

# Preview as the cell output (a bare expression in the middle of a cell shows nothing).
doc_topics.head()


## Looking at topic words

In [None]:
# Reduce the raw topic_words mapping to {topic id: [word, ...]} (topic -1 is
# skipped by create_topic_dict) and pretty-print it for inspection.
topic_dict = create_topic_dict(topic_words)
pprint(topic_dict)


In [None]:
# Number of rows in doc_topics for each value of the "topic" column.
doc_topics.groupby("topic").size()

In [None]:
N_EXAMPLES = 10   # maximum number of example tweets shown per topic
SAMPLE_SEED = 42  # fixed seed so the same examples are drawn on every run

# For every topic, print its words followed by a random sample of its tweets.
for top, words in topic_dict.items():
    print(f"evaluating topic {top}")
    # print, not pprint: pprint on a str shows its quoted repr and may split a long
    # string into several quoted chunks.
    print(f"{words = }")
    print(f"examples for topic {top}")
    topic_texts = doc_topics.loc[doc_topics["topic"] == top, "text"]
    # Cap the sample size: Series.sample raises ValueError when asked for more rows
    # than the topic has (sampling without replacement).
    n_examples = min(N_EXAMPLES, len(topic_texts))
    example_tweets = topic_texts.sample(n_examples, random_state=SAMPLE_SEED).tolist()
    print(example_tweets)
    print("")
