# Notebook for exploring BERTopic models
Once a model has been cleaned, use this notebook to explore its topics and documents, and to extract bigrams etc.

Load models

In [None]:
import os
import pickle
import re
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
from bertopic import BERTopic
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
#from wordcloud import WordCloud
#import matplotlib.pyplot as plt

In [None]:
# Root folder that holds one sub-folder per dataset (COP20, ..., FFF2021).
# Set the UKRI_TWEET_DATA_DIR environment variable to point at another copy of
# the data; when it is unset the original OneDrive location is used.
# os.path.join(..., "") guarantees a trailing path separator, which matters
# because the cells below build file names by plain string concatenation
# (loadPath + "COP20/...").
loadPath = os.path.join(
    os.environ.get(
        "UKRI_TWEET_DATA_DIR",
        "/Users/ipinni/Library/CloudStorage/OneDrive-UniversityofLeeds/UKRI_Tweet_Data/completed/",
    ),
    "",
)

In [None]:
COP20model = BERTopic.load(loadPath +"COP20/COP20_Bert_model", embedding_model = "all-mpnet-base-v2")

In [None]:
COP20model.get_topic_info()

In [None]:
with open(loadPath +'COP20/COP20topics.list', 'rb') as config_list_file:   
    COP20topics = pickle.load(config_list_file)

In [None]:
COP21model = BERTopic.load(loadPath +"COP21/COP21_Bert_model", embedding_model = "all-mpnet-base-v2")

In [None]:
with open(loadPath +'COP21/COP21topics.list', 'rb') as config_list_file:   
    COP21topics = pickle.load(config_list_file)

In [None]:
COP22model = BERTopic.load(loadPath +"COP22/COP22_Bert_model", embedding_model = "all-mpnet-base-v2")

In [None]:
with open(loadPath +'COP22/COP22topics.list', 'rb') as config_list_file:   
    COP22topics = pickle.load(config_list_file)

In [None]:
COP23model = BERTopic.load(loadPath +"COP23/COP23_Bert_model", embedding_model = "all-mpnet-base-v2")

In [None]:
with open(loadPath +'COP23/COP23topics.list', 'rb') as config_list_file:   
    COP23topics = pickle.load(config_list_file)

In [None]:
COP24model = BERTopic.load(loadPath +"COP24/COP24_Bert_model", embedding_model = "all-mpnet-base-v2")

In [None]:
with open(loadPath +'COP24/COP24topics.list', 'rb') as config_list_file:   
    COP24topics = pickle.load(config_list_file)

In [None]:
COP25model = BERTopic.load(loadPath +"COP25/COP25_Bert_model", embedding_model = "all-mpnet-base-v2")

In [None]:
with open(loadPath +'COP25/COP25topics.list', 'rb') as config_list_file:   
    COP25topics = pickle.load(config_list_file)

In [None]:
COP26model = BERTopic.load(loadPath +"COP26/COP26_Bert_model", embedding_model = "all-mpnet-base-v2")

In [None]:
with open(loadPath +'COP26/COP26topics.list', 'rb') as config_list_file:   
    COP26topics = pickle.load(config_list_file)

In [None]:
COP23model.get_topic_info()

In [None]:
# Dataset tag. NOTE(review): no cell visible in this notebook reads this global -
# the get_data(...) calls pass their tags as literals - TODO confirm it is still needed.
version = "FFF2018"

In [None]:
def get_data(version, base_path=None, embedding_model="all-mpnet-base-v2"):
    """Load the saved model, topics list and documents list for one dataset.

    Files are read from ``<base_path><version>/`` and must be named
    ``<version>_Bert_model``, ``<version>topics.list`` and
    ``<version>docs.list``.

    Parameters
    ----------
    version : str
        Dataset tag (e.g. ``"FFF2018"``); used both as the sub-folder name and
        as the file-name prefix.
    base_path : str, optional
        Root data folder, ending with a path separator because paths are built
        by string concatenation. Defaults to the notebook-level ``loadPath``.
    embedding_model : str, optional
        Embedding model handed to ``BERTopic.load``.

    Returns
    -------
    tuple
        ``(topics, docs, model)``, in that order.
    """
    if base_path is None:
        # Resolved at call time so a later change to loadPath is picked up.
        base_path = loadPath
    prefix = base_path + version + "/" + version

    model = BERTopic.load(prefix + "_Bert_model", embedding_model=embedding_model)

    # SECURITY: pickle.load can execute arbitrary code contained in the file;
    # only load .list files that were produced by a trusted pipeline.
    with open(prefix + "topics.list", 'rb') as topics_list_file:
        topics = pickle.load(topics_list_file)

    with open(prefix + "docs.list", 'rb') as docs_list_file:
        docs = pickle.load(docs_list_file)

    return topics, docs, model

#topics, docs, model = get_data()

In [None]:
# One get_data call per FFF dataset; each returns (topics, docs, model) in that order.
FFF2018topics, FFF2018docs, FFF2018model = get_data("FFF2018")
FFF2019topics, FFF2019docs, FFF2019model = get_data("FFF2019")
FFF2020topics, FFF2020docs, FFF2020model = get_data("FFF2020")
FFF2021topics, FFF2021docs, FFF2021model = get_data("FFF2021")

Visualise the probability distribution of a single document

In [None]:
# NOTE(review): neither `model` nor `probs` is assigned in any cell visible above
# (only the COPxx*/FFFxxxx* names are), so this would raise NameError on a fresh
# kernel unless they are defined elsewhere - TODO confirm which model/probabilities are meant.
# Plots the topic distribution of the document at position 20 of `probs`.
model.visualize_distribution(probs[20])

In [None]:
from collections import defaultdict

Add the retweets to the final topic counts

In [None]:
def get_full_counts(cleanTweets_file, model, topics):
    """Add each topic's retweets to its document count.

    Parameters
    ----------
    cleanTweets_file : str
        Path to the cleaned-tweets CSV. Its first column is used as the index
        and it must contain a ``retweet_count`` column. Rows must be in the
        same order as ``topics``.
    model : object
        Fitted topic model whose ``get_topic_info()`` returns a DataFrame with
        ``Topic``, ``Count`` and ``Name`` columns.
    topics : sequence
        Topic id assigned to each row of the CSV.

    Returns
    -------
    pandas.DataFrame
        One row per topic known to ``model``, sorted by ``Topic``, with columns
        ``Topic``, ``FullCount`` (``Count`` plus the summed retweets of the
        topic's tweets) and ``Name``.

    Raises
    ------
    ValueError
        If ``topics`` and the CSV do not have the same number of rows; pairing
        them positionally would then attribute retweets to the wrong topics.
    """
    # load clean tweets file
    cleanTweets = pd.read_csv(
        cleanTweets_file,
        header=0,
        index_col=0,
        dtype={'tweet_id': 'str', 'text': 'str', 'like_count': 'float', 'retweet_count': 'float'},
        lineterminator='\n',
    )
    if len(topics) != len(cleanTweets):
        raise ValueError(
            "topics has %d entries but %s has %d rows; they must match one-to-one"
            % (len(topics), cleanTweets_file, len(cleanTweets))
        )

    # A missing retweet_count means "no retweets" for that tweet only. Summing
    # the raw values would turn the whole topic total into NaN and lose the
    # valid retweets of every other tweet in the topic.
    retweets = cleanTweets['retweet_count'].fillna(0.0).to_numpy()

    # total retweets per topic
    retweet_totals = (
        pd.DataFrame({'Topic': list(topics), 'Retweets': retweets})
        .groupby('Topic')['Retweets']
        .sum()
    )

    # original document counts, one row per topic
    freqs = model.get_topic_info().sort_values('Topic')

    # Align on the topic id rather than on row position so that every count is
    # paired with the right name; topics without any tweet get 0 extra.
    extra = freqs['Topic'].map(retweet_totals).fillna(0.0)

    return pd.DataFrame({
        'Topic': freqs['Topic'].to_numpy(),
        'FullCount': freqs['Count'].to_numpy() + extra.to_numpy(),
        'Name': freqs['Name'].to_numpy(),
    })
  


In [None]:
COP20counts = get_full_counts(loadPath + "COP20/COP20CleanTweets.csv", COP20model, COP20topics)
COP20counts.to_csv(loadPath + "COP20/COP20Counts.csv")

In [None]:
COP21counts = get_full_counts(loadPath + "COP21/COP21CleanTweets.csv", COP21model, COP21topics)
COP21counts.to_csv(loadPath + "COP21/COP21Counts.csv")

In [None]:
COP22counts = get_full_counts(loadPath + "COP22/COP22CleanTweets.csv", COP22model, COP22topics)
COP22counts.to_csv(loadPath + "COP22/COP22Counts.csv")

In [None]:
COP22counts

In [None]:
COP23counts = get_full_counts(loadPath + "COP23/COP23CleanTweets.csv", COP23model, COP23topics)
COP23counts.to_csv(loadPath + "COP23/COP23Counts.csv")

In [None]:
COP24counts = get_full_counts(loadPath + "COP24/COP24CleanTweets.csv", COP24model, COP24topics)
COP24counts.to_csv(loadPath + "COP24/COP24Counts.csv")

In [None]:
COP25counts = get_full_counts(loadPath + "COP25/COP25CleanTweets.csv", COP25model, COP25topics)
COP25counts.to_csv(loadPath + "COP25/COP25Counts.csv")

In [None]:
COP26counts = get_full_counts(loadPath + "COP26/COP26CleanTweets.csv", COP26model, COP26topics)
COP26counts.to_csv(loadPath + "COP26/COP26Counts.csv")

In [None]:
FFF2018counts = get_full_counts(loadPath + "FFF2018/FFF2018CleanTweets.csv", FFF2018model, FFF2018topics)
FFF2018counts.to_csv(loadPath + "FFF2018/FFF2018Counts.csv")

In [None]:
FFF2019counts = get_full_counts(loadPath + "FFF2019/FFF2019CleanTweets.csv", FFF2019model, FFF2019topics)
FFF2019counts.to_csv(loadPath + "FFF2019/FFF2019Counts.csv")

In [None]:
FFF2020counts = get_full_counts(loadPath + "FFF2020/FFF2020CleanTweets.csv", FFF2020model, FFF2020topics)
FFF2020counts.to_csv(loadPath + "FFF2020/FFF2020Counts.csv")

In [None]:
FFF2021counts = get_full_counts(loadPath + "FFF2021/FFF2021CleanTweets.csv", FFF2021model, FFF2021topics)
FFF2021counts.to_csv(loadPath + "FFF2021/FFF2021Counts.csv")

In [None]:
# Build the topic visualisation for COP23 and keep the figure so it can be saved.
COP23fig = COP23model.visualize_topics()

In [None]:
# Write the COP23 figure to an HTML file in the COP23 data folder.
COP23fig.write_html(loadPath + "COP23/COP23Topics.html")

In [None]:
# Term-rank visualisation for the COP22 model.
COP22model.visualize_term_rank()

In [None]:
# Inspect topic 30 of the COP23 model.
COP23model.get_topic(30)

Similarity between documents and topics using Jensen–Shannon / KL divergence

In [None]:
from scipy.spatial.distance import jensenshannon
from numpy import asarray

Get topics similar to a search term

In [None]:
# NOTE(review): `model` is not assigned in any cell visible above (only the
# COPxx*/FFFxxxx* names are) - TODO confirm which model is meant.
# NOTE(review): a list of terms is passed as the search term; BERTopic's docs
# describe it as a single string - TODO confirm a list is supported.
model.find_topics(["youth", 'greta', 'threat', 'human rights'], top_n=5)

In [None]:
# Representative documents per topic, turned into a table with one row per dict key.
representative_docs = model.get_representative_docs()
data = pd.DataFrame.from_dict(representative_docs, orient='index')


In [None]:
# NOTE(review): newer BERTopic releases appear to expose this attribute as
# `topic_embeddings_` - TODO confirm against the installed version.
embs=model.topic_embeddings

In [None]:
# Length of the first topic embedding.
len(embs[0])

In [None]:
# Number of topic embeddings held by the model.
len(model.topic_embeddings)

In [None]:
# NOTE(review): `model2` is not assigned in any visible cell - TODO confirm its origin.
# NOTE(review): the name `umap` would shadow a `umap` module if one were imported.
umap=model2.umap_model

In [None]:
model2.umap_model

In [None]:
umap_data = umap.fit_transform(embs)
result = pd.DataFrame(umap_data, columns=['x', 'y'])
result['labels'] = clusterer.labels_

In [None]:
clusterer = model.hdbscan_model

In [None]:
# Condensed tree attribute of the clustering model.
tree=clusterer.condensed_tree_
# NOTE(review): _select_clusters is underscore-prefixed, i.e. private by
# convention, so it may change between library versions - TODO confirm there is
# no public alternative.
clusters = tree._select_clusters()

In [None]:
# Extra tokens to drop on top of scikit-learn's built-in English stop words.
# NOTE(review): with CountVectorizer's default lower-casing and token pattern
# the upper-case and punctuation entries presumably never match a token - TODO confirm.
my_stopwords = ["rt","RT", "&", "amp", "&amp", "http","https", "http://", "https://", "fav", "FAV"]
# stop_words has to be a list: scikit-learn's constructor-parameter validation
# (1.2+) rejects a set/frozenset. sorted() also makes the order deterministic.
new_stopwords = sorted(text.ENGLISH_STOP_WORDS.union(my_stopwords))
# min_df=10 keeps only terms that occur in at least 10 documents.
vectorizer = CountVectorizer(stop_words=new_stopwords, min_df=10)
# NOTE(review): `docs` is not assigned in any visible cell (only FFFxxxxdocs
# are) - TODO confirm which document list is meant.
count_matrix = vectorizer.fit_transform(docs)

In [None]:
vocab = vectorizer.get_feature_names()

In [None]:
count_matrix2 = vectorizer.fit_transform(words2)

In [None]:
len(vectorizer.get_feature_names())