# Qualitatitive topic evaluations
This notebook is used for formatting the data for qualitative evaluations. This includes: 
- Inspecting the topic words
- Inspecting representative documents
- Coming up with good "titles" for the topics

In [16]:
import pickle
import numpy as np
import pandas as pd
import re
from pprint import pprint
from typing import Dict, Tuple, List, Union
from bertopic import BERTopic
from pathlib import Path

In [14]:
def read_pickle(file_path):
    with open(file_path, "rb") as f:
        return pickle.load(f)

def get_model_name(model_path: Path) -> str:
    return re.match("\w+-\w+-\d+", model_path.name).group()

def create_topic_dict(raw_topic_dict: Dict[int, Tuple[str, int]]) -> Dict[int, List[str]]:
    return {k: [tup[0] for tup in lst] for k, lst in raw_topic_dict.items() if k!=-1}
    

In [6]:
DATA_DIR = Path("../data")
MODEL_DIR = Path("../../ExplainlpTwitter/output")

model_path = next(MODEL_DIR.glob("*topic_model*"))
model = BERTopic.load(model_path)

True


In [8]:
doc_topics = pd.read_csv(MODEL_DIR / "doc_topics.csv")
doc_topics

Unnamed: 0,topic,prob,doc
0,-1,0.000000,wants to compete! i want hard competition! i w...
1,0,0.960256,It seems we are stuck on the ground in Amarill...
2,-1,0.000000,where the f are my pinking shears? rarararrrar...
3,-1,0.000000,0ff t0 tHE MEEtiN.. i HAtE WhEN PPl V0lUNtEER...
4,3,0.794290,@ reply me pls
...,...,...,...
99994,2,1.000000,@user Kind of...I am really annoyed I missed t...
99995,-1,0.000000,So I went to Bahama Bucks today and the cute s...
99996,-1,0.000000,Cold one moment. Sweating the other. Feeling s...
99997,-1,0.000000,The kids are too cute for my own good...


## Looking at topic words

In [17]:
topic_dict = create_topic_dict(model.get_topics())
pprint(topic_dict)


{0: ['dont',
     'http',
     'know',
     'just',
     'like',
     'lol',
     'http user',
     'im',
     'think',
     'twitter'],
 1: ['going',
     'day',
     'today',
     'got',
     'bed',
     'tomorrow',
     'just',
     'night',
     'time',
     'good'],
 2: ['im',
     'good',
     'day',
     'going',
     'sad',
     'today',
     'got',
     'like',
     'just',
     'work'],
 3: ['user thanks',
     'thanks',
     'miss',
     'thank',
     'follow',
     'http',
     'welcome',
     'hey',
     'http user',
     'know'],
 4: ['im',
     'lol',
     'good',
     'haha',
     'like',
     'just',
     'better',
     'day',
     'dont',
     'going'],
 5: ['love',
     'good',
     'fun',
     'day',
     'great',
     'im',
     'work',
     'nice',
     'time',
     'like'],
 6: ['work',
     'hate',
     'want',
     'im',
     'today',
     'dont',
     'sick',
     'day',
     'really',
     'school'],
 7: ['good',
     'good morning',
     'morning',
     'day

In [21]:
doc_topics.groupby("topic").count()

Unnamed: 0_level_0,prob,doc
topic,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,75824,75824
0,6600,6600
1,3372,3372
2,3166,3166
3,3020,3020
4,2541,2541
5,1407,1407
6,1308,1308
7,940,940
8,936,936


In [27]:
for top, words in topic_dict.items():
    print(f"evaluating topic {top}")
    pprint(f"{words = }")
    print(f"examples for topic {top}")
    example_tweets = doc_topics.loc[doc_topics["topic"] == top, "doc"].sample(4, random_state=42).tolist()
    print(example_tweets)
    print("")


evaluating topic 0
("words = ['dont', 'http', 'know', 'just', 'like', 'lol', 'http user', 'im', "
 "'think', 'twitter']")
examples for topic 0
["Y&amp;R Daniel Goddard (Cane) and Joshua Morrow (Nick) are on 'The Price is Right' for the Showcase part...if anyone is home ", '@user Never  its on my bebo ', '@user Check out my last Tweet, I know the feeling. ', "@user haven't seen your friend request yet "]

evaluating topic 1
("words = ['going', 'day', 'today', 'got', 'bed', 'tomorrow', 'just', 'night', "
 "'time', 'good']")
examples for topic 1
['going places in toothache agony ', 'A shelf in the fridge collapsed spilling salad dressing all over the inside. ', 'oh well, better slip now... gnytz everyone ', "had an AWESOME weekend with my family... now it's monday and we start ALL over "]

evaluating topic 2
("words = ['im', 'good', 'day', 'going', 'sad', 'today', 'got', 'like', "
 "'just', 'work']")
examples for topic 2
["@user  I wish I lived in Vegas, but not for the weather. It's be s