# Qualitative topic evaluations
This notebook is used for formatting the data for qualitative evaluations. This includes: 
- Inspecting the topic words
- Inspecting representative documents
- Coming up with good "titles" for the topics

Notes on TweetEval: 
- 10 topics may be too many :((
- Difficult to classify? 

In [None]:
import pickle
import numpy as np
import pandas as pd
import re
from pprint import pprint
from typing import Dict, Tuple, List, Union
from bertopic import BERTopic
from pathlib import Path

In [9]:
def read_pickle(file_path):
    """Load and return the object stored in the pickle file at ``file_path``.

    The file is opened in binary mode and closed again before returning.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    from a trusted source.
    """
    with open(file_path, "rb") as handle:
        loaded = pickle.load(handle)
    return loaded

def get_model_name(model_path: Path) -> str:
    """Extract the ``<word>-<word>-<digits>`` prefix from a model file name.

    Only the final path component (``model_path.name``) is inspected, and the
    pattern must match at the start of that name.

    Args:
        model_path: Path to a saved model file.

    Returns:
        The matched prefix of the file name.

    Raises:
        ValueError: If the file name does not start with the expected pattern.
    """
    # Raw string: "\w" and "\d" are invalid escape sequences in a plain literal.
    match = re.match(r"\w+-\w+-\d+", model_path.name)
    if match is None:
        raise ValueError(
            f"Model file name {model_path.name!r} does not start with "
            "'<word>-<word>-<digits>'"
        )
    return match.group()

def create_topic_dict(raw_topic_dict: Dict[int, Tuple[str, int]]) -> Dict[int, List[str]]:
    """Reduce a raw topic mapping to ``{topic_id: [word, ...]}``.

    Each value of ``raw_topic_dict`` is iterated and the first element of every
    item is kept, so in practice the values are sequences of tuples whose first
    entry is the word.
    NOTE(review): the annotated value type (a single tuple) does not match this
    usage; confirm the intended type.

    The entry with key ``-1`` is dropped (presumably BERTopic's outlier topic;
    verify against the model that produced the pickle).
    """
    words_by_topic = {}
    for topic_id, scored_words in raw_topic_dict.items():
        if topic_id == -1:
            continue
        words_by_topic[topic_id] = [entry[0] for entry in scored_words]
    return words_by_topic

def latest_full_topics(dr: Path) -> Path:
    """Return the most recent ``*full_doc_topics_*.csv`` file in ``dr``.

    "Most recent" means the greatest file name; this assumes the names carry a
    sortable timestamp suffix such as ``full_doc_topics_20211207_132414.csv``.

    Args:
        dr: Directory to search (non-recursively).

    Returns:
        Path of the matching file whose name sorts last.

    Raises:
        FileNotFoundError: If no file in ``dr`` matches the pattern.
    """
    candidates = list(dr.glob("*full_doc_topics_*.csv"))
    if not candidates:
        raise FileNotFoundError(f"No '*full_doc_topics_*.csv' file found in {dr}")
    # glob() yields entries in arbitrary, OS-dependent order, so pick the
    # greatest name explicitly instead of trusting the last element.
    return max(candidates, key=lambda path: path.name)

    

In [4]:
# Directory holding the topic CSVs, the topic-word pickle and the tweet texts
# read by the cells below (relative to the notebook's working directory).
DATA_DIR = Path("../data")
# Output directory of the sibling ExplainlpTwitter project.
# NOTE(review): not referenced in the visible cells; presumably where saved models live — confirm.
MODEL_DIR = Path("../../ExplainlpTwitter/output")


In [None]:
# Load a saved BERTopic model.
# NOTE(review): model_path is an empty placeholder, so this cell looks like it
# cannot run as written — point it at a saved model (presumably under MODEL_DIR) first.
model_path = ""
model = BERTopic.load(model_path)

[WindowsPath('../data/full_doc_topics_20211206_124932.csv'),
 WindowsPath('../data/full_doc_topics_20211207_132414.csv')]

In [11]:

# Load the topic file selected by latest_full_topics, using its first column as the index.
doc_topics = pd.read_csv(latest_full_topics(DATA_DIR), index_col=0)
# NOTE(review): bare expression in the middle of a cell — it displays nothing and can be removed.
doc_topics
# Raw topic -> words mapping; reduced to plain word lists by create_topic_dict in a later cell.
topic_words = read_pickle(DATA_DIR / "tweeteval_topic_dict.pkl" )
# Only the "text" column of the tweet file is read.
clean_tweets = pd.read_csv(DATA_DIR / "tweeteval_text.csv", usecols=["text"])
# Attach the tweet text to each row by index (merge defaults to an inner join,
# so rows whose index is missing on either side are dropped).
# assumes both frames share the same row index — TODO confirm
doc_topics = doc_topics.merge(clean_tweets, left_index=True, right_index=True)


## Looking at topic words

In [12]:
# Keep only the words for each topic (create_topic_dict drops topic -1) and pretty-print them.
topic_dict = create_topic_dict(topic_words)
pprint(topic_dict)


{0: ['user',
     'user user',
     'love',
     'just',
     'day',
     'new',
     'tomorrow',
     'like',
     'amp',
     'im'],
 1: ['user',
     'just',
     'tomorrow',
     'night',
     'california',
     'user user',
     'day',
     'sun',
     'user just',
     'today'],
 2: ['user',
     'night',
     'friday',
     'new',
     'tomorrow',
     'nov',
     'amp',
     'check',
     'art',
     'posted'],
 3: ['happy',
     'thanks',
     'day',
     'sunday',
     'user',
     'game',
     'family',
     'happy friday',
     'weekend',
     'good'],
 4: ['user',
     'user user',
     'welfare',
     'like',
     'semst',
     'law',
     'fidel',
     'people',
     'gun',
     'manitoba'],
 5: ['user',
     'user user',
     'know',
     'trump',
     'theyre',
     'just',
     'check',
     'antifa',
     'guy',
     'cannabiscup'],
 6: ['tomorrow',
     'user',
     'day',
     'good',
     'tonight',
     'year',
     'just',
     'game',
     'come',
     'season'

In [13]:
# Number of rows (documents) assigned to each value of the "topic" column.
doc_topics.groupby("topic").size()

topic
-1     67274
 0    130554
 1       643
 2       876
 3       597
 4       168
 5       238
 6       285
 7        79
 8        48
 9        23
dtype: int64

In [15]:
# For every topic, print its top words followed by a reproducible random
# sample of tweets assigned to it, to support manual labelling of the topic.
N_EXAMPLES = 10  # maximum number of example tweets shown per topic
for top, words in topic_dict.items():
    print(f"evaluating topic {top}")
    # Plain print: pprint() on an f-string shows the repr of a str (quoted and
    # split across lines) instead of the word list itself.
    print(f"{words = }")
    print(f"examples for topic {top}")
    topic_tweets = doc_topics.loc[doc_topics["topic"] == top, "text"]
    # Cap the sample size so topics with fewer than N_EXAMPLES tweets do not
    # raise ValueError; the fixed random_state keeps the sample stable across runs.
    example_tweets = topic_tweets.sample(min(N_EXAMPLES, len(topic_tweets)), random_state=42).tolist()
    print(example_tweets)
    print("")


evaluating topic 0
("words = ['user', 'user user', 'love', 'just', 'day', 'new', 'tomorrow', "
 "'like', 'amp', 'im']")
examples for topic 0
['Was last night real?!?! We ️ you Dave Barnes! @ Bijou Theatre', 'oregon makes me @ Hood River, Oregon', '"Emile Garuba joins Michaela and Jerrilyn in-studio on the #EntertainmentShow to review Kunle Afolayan\'s latest film, October 1."', 'So this is what we get in our suggestion box?! Really guys? @ Industrial Strength Gym', 'One of my favorite videos. Should we start a TV show? #sandiego #encinitas #realestate #luxury…', 'Work and fun #squaw #rental @ Squaw Valley, Lake Tahoe', 'Me and Tommy in a very serious moment vanceology photo-cred dwill_photos #cleveland #friends…', '"Israel. This voluntary offering a community would hold your coat, you to the ocean, enjoying the Church, may be more useful"', 'Premios Latin Grammy @ MGM Grand Garden Arena', 'Having a great time at the beach ! Happy Fourth of July ! @ South Lake Tahoe, California']

evalu