# Qualitatitive topic evaluations
This notebook is used for formatting the data for qualitative evaluations. This includes: 
- Inspecting the topic words
- Inspecting representative documents
- Coming up with good "titles" for the topics

In [1]:
import pickle
import numpy as np
import pandas as pd
import re
from pprint import pprint
from typing import Dict, Tuple, List, Union
from bertopic import BERTopic
from pathlib import Path

C:\Users\jhr\Anaconda3\envs\bertopic_explore\lib\site-packages\numpy\.libs\libopenblas.4SP5SUA7CBGXUEOC35YP2ASOICYYEQZZ.gfortran-win_amd64.dll
C:\Users\jhr\Anaconda3\envs\bertopic_explore\lib\site-packages\numpy\.libs\libopenblas.GK7GX5KEQ4F6UYO3P26ULGBQYHGQO7J4.gfortran-win_amd64.dll


In [2]:
def read_pickle(file_path):
    with open(file_path, "rb") as f:
        return pickle.load(f)

def get_model_name(model_path: Path) -> str:
    return re.match("\w+-\w+-\d+", model_path.name).group()

def create_topic_dict(raw_topic_dict: Dict[int, Tuple[str, int]]) -> Dict[int, List[str]]:
    return {k: [tup[0] for tup in lst] for k, lst in raw_topic_dict.items() if k!=-1}
    

In [3]:
DATA_DIR = Path("../data")
MODEL_DIR = Path("../../ExplainlpTwitter/output")


In [None]:
model_path = ""
model = BERTopic.load(model_path)

In [25]:

doc_topics = pd.read_csv(next(DATA_DIR.glob("*full_doc_topics_*.csv")), index_col=0)
doc_topics
topic_words = read_pickle(DATA_DIR / "topic_words.pkl" )
clean_tweets = pd.read_csv(MODEL_DIR / "clean_tweets.csv", usecols=["cleantext"])
doc_topics = doc_topics.merge(clean_tweets, left_index=True, right_index=True)


## Looking at topic words

In [7]:
topic_dict = create_topic_dict(topic_words)
pprint(topic_dict)


{0: ['user',
     'im',
     'just',
     'good',
     'day',
     'like',
     'http',
     'love',
     'dont',
     'today'],
 1: ['user',
     'im',
     'http',
     'just',
     'got',
     'music',
     'new',
     'going',
     'ill',
     'amp'],
 2: ['going',
     'http',
     'user',
     'just',
     'later',
     'house',
     'day',
     'lunch',
     'amp',
     'im'],
 3: ['user',
     'im',
     'haha',
     'just',
     'think',
     'time',
     'dont',
     'ha',
     'lol',
     'later'],
 4: ['user',
     'didnt',
     'did',
     'http',
     'think',
     'just',
     'know',
     'twitter',
     'like',
     'heerens'],
 5: ['user',
     'june',
     '2nd',
     'user buy',
     'tonight',
     'july user',
     'today',
     '35',
     'user friend',
     'downloaded'],
 6: ['user user', 'user', '', '', '', '', '', '', '', ''],
 7: ['user',
     'kozha',
     'kill',
     'think',
     'im',
     '1999',
     'ir',
     'days',
     'user cuz',
     'melt'],
 

In [13]:
doc_topics.groupby("topic").size()

topic
-1     410345
 0    1161695
 1       6137
 2       5765
 3       4367
 4       1348
 5       3596
 6       2761
 7       2549
 8        841
 9        596
dtype: int64

In [8]:
doc_topics.groupby("topic").count()

Unnamed: 0_level_0,prob
topic,Unnamed: 1_level_1
-1,0
0,0
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0


In [15]:

topic_dict.keys()

dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [27]:
for top, words in topic_dict.items():
    print(f"evaluating topic {top}")
    pprint(f"{words = }")
    print(f"examples for topic {top}")
    example_tweets = doc_topics.loc[doc_topics["topic"] == top, "cleantext"].sample(10, random_state=42).tolist()
    print(example_tweets)
    print("")


evaluating topic 0
("words = ['user', 'im', 'just', 'good', 'day', 'like', 'http', 'love', "
 "'dont', 'today']")
examples for topic 0
['http - fucked up eye ', '@user awesome! ', 'no more blonde hair ', "@user I saw ur pics on FB. Y'all are so cute!  We will ride bikes in Chi. ", "@user haha, dj's has too much classy stuff cheza! i wanna buy everything there ", 'arrived safely in manila. its nice to be hooome ', "@user YES. And that's why I think yall're awesome. ", 'The Colours of Southern Africa: the Wildlife Photography of Hannes Lochner http Enjoy and have a fab day  ;)', 'the rain is nice. ', "man my ankles so effed up. I took a nap and it stopped hurting im scared it'll start hurting again when I walk around "]

evaluating topic 1
("words = ['user', 'im', 'http', 'just', 'got', 'music', 'new', 'going', "
 "'ill', 'amp']")
examples for topic 1
['@user well the molecular twitter party is I believe open to everyone into deleuzian concepts to join in ', '@user well, we have a whole 