The LDA training steps in this notebook follow a video guide by Srinivasan, S. (2020).<br>
Link: https://www.youtube.com/watch?v=25JOEnrz40c&list=PL0rtpP-8GFfR2orPIzBttl15_NfDhkujw&index=4&t=518s

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import regex as re

from time import time
from time import strftime
from time import gmtime

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition
from stop_words import get_stop_words
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

---
### Loading the data

In [3]:
# Dataset identifier; it matches the folder and file name used in the CSV path of the next cell.
filename = "covid_vaccine"

In [4]:
# Load the video dataset, discard the saved index column and keep only the
# id, title and transcript. Everything is cast to str, so a missing
# transcript ends up as the literal string 'nan'.
path = "../datasets/covid_vaccine/covid_vaccine.csv"
df = (
    pd.read_csv(path)
    .drop(columns="Unnamed: 0")
    .loc[:, ["video_id", "video_title", "video_transcript"]]
    .astype(str)
)
df.head()

Unnamed: 0,video_id,video_title,video_transcript
0,im3otpqYAiQ,Covid Vaccine Study Finds Links to Health Cond...,[CC may contain inaccuracies] In terms of how...
1,SkcAZfrYYXM,Two very rare COVID vaccine side effects detec...,okay we're going to finish with the guardian ...
2,7MAlEYqWUTk,Being Nice to Anti-Vaxxers,so you're against the covert vaccine oh here ...
3,uiwjAj0zfKQ,If You Get All 5 COVID Vaccines,and all right we're done now if you're feelin...
4,jPs4_MeuX7U,New Covid vaccine study links jab to heart and...,a latest covid-19 study is providing answers ...


---
### Cleaning the data

In [7]:
# Row labels of videos whose transcript is the literal string 'nan'
# (what a missing value becomes after the `.astype(str)` cast at load time).
is_missing_transcript = df["video_transcript"] == 'nan'
drop_indices = df.index[is_missing_transcript]
drop_indices

Index([  7,  15,  37,  43,  45,  48,  52,  56,  69,  77,  81,  83,  93,  95,
       104, 111, 117, 122, 134, 136, 137, 143],
      dtype='int64')

In [8]:
# Remove the rows that have no transcript. Rebinding `df` (instead of
# inplace=True) together with errors="ignore" keeps this cell safe to re-run:
# labels that were already removed are skipped rather than raising KeyError.
df = df.drop(index=drop_indices, errors="ignore")

In [9]:
# Ordered clean-up rules as (regex, replacement) pairs. The rules are applied
# one after another in list order, so earlier rules change what later ones see.
# Raw strings are used so that regex escapes such as \s, \w or \[ are not
# treated as (invalid) Python string escapes.
substitutions = [
    # [0] Replacing occurrences of \xa0 (non-breaking space) and \n with a space
    (r'(\xa0|\n)', ' '),
    # [1] Removing text enclosed in brackets
    (r'\[(\w|\s)+\]', ''),
    # [2] Replacing stray '000's with 'thousand'
    (r'(?<=\s)000(?=\s)', 'thousand'),
    # [3, 4] Mistranscriptions of the word 'COVID'
    (r'(?<=\s)(C|c)o(ve(r)?t|id)(?=\s)', 'COVID'),
    (r'(C|c)overed(?=\s(vacc|infe))', 'COVID'),
    # [5] Mistranscriptions of the word 'COVID-19'
    (r'(?<=\s)(C|c)(oveted|o9|o\s19)(?=\s)', 'COVID19'),
    # [6] Replacing '%' with the word 'percent'
    (r'(?<=\d)\%', ' percent'),
    # [7] Removing 'Speaker %d:' occurrences
    (r'Speaker\s\d\:', ''),
    # [8] Removing '[\xa0__\xa0]'
    # NOTE: when the rules run in list order, rule [0] has already turned every
    # \xa0 into a space and rule [1] then removes the resulting '[ __ ]', so this
    # rule no longer finds anything. It is kept so rule indices stay stable.
    (r'\[\xa0\_\_\xa0\]', ''),
    # [9] Removing >> occurrences
    (r'\>\>(\>+)?', ''),
    # [10] Removing 'Reporter:' occurrences
    (r'Reporter\:', ''),
    # [11] Removing weird +@ occurrences
    (r'\+\@', ''),
    # [12] Removing stray - occurrences
    (r'(?<=\s)\-(\-+)?(?=\s)', ''),
    # [13] Removing text within parentheses
    (r'\((\w|\s)+\)', ''),
    # [14] Combining stray instances of '19' with the word 'covid' if it exists next to it
    (r'(covid|COVID)(\s|-)?19', 'COVID19'),
]

# Parallel lists consumed by the cleaning loop; deriving both from one list of
# pairs guarantees that patterns[i] always lines up with replacements[i].
patterns = [pattern for pattern, _ in substitutions]
replacements = [replacement for _, replacement in substitutions]

In [10]:
# Accumulator for the cleaned texts, plus the raw transcripts to clean.
cleaned = []
transcripts = df["video_transcript"].to_list()
len(transcripts)

128

In [11]:
def apply_substitutions(text, rules):
    """Apply every (pattern, replacement) rule to `text`, in order.

    Parameters
    ----------
    text : str
        The transcript to clean.
    rules : iterable of (str, str)
        Regex pattern / replacement pairs; each rule operates on the output
        of the previous one.

    Returns
    -------
    str
        The transcript after all substitutions.
    """
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)
    return text


# zip() would silently truncate if the two lists drifted apart.
assert len(patterns) == len(replacements), "patterns/replacements are misaligned"
cleaning_rules = list(zip(patterns, replacements))

# Rebuild `cleaned` from scratch instead of appending to an existing list, so
# re-running this cell cannot leave duplicate entries behind.
cleaned = [apply_substitutions(transcript, cleaning_rules) for transcript in transcripts]

In [12]:
# Sanity check: there should be one cleaned text per input transcript.
len(cleaned)

128

In [13]:
# Assemble the cleaned corpus: ids and titles are copied from `df`, while the
# transcript column holds the cleaned texts. Going through plain lists gives
# the new frame a fresh 0..n-1 index.
frame_columns = {name: df[name].tolist() for name in ("video_id", "video_title")}
frame_columns['video_transcript'] = cleaned
transcripts_df = pd.DataFrame(frame_columns)
transcripts_df.head()

Unnamed: 0,video_id,video_title,video_transcript
0,im3otpqYAiQ,Covid Vaccine Study Finds Links to Health Cond...,In terms of how widespread the adverse event...
1,SkcAZfrYYXM,Two very rare COVID vaccine side effects detec...,okay we're going to finish with the guardian ...
2,7MAlEYqWUTk,Being Nice to Anti-Vaxxers,so you're against the COVID vaccine oh here w...
3,uiwjAj0zfKQ,If You Get All 5 COVID Vaccines,and all right we're done now if you're feelin...
4,jPs4_MeuX7U,New Covid vaccine study links jab to heart and...,a latest COVID19 study is providing answers t...


---
### Train test split

In [14]:
# Hold out 30% of the videos. A fixed random_state makes the shuffle, and
# therefore everything fitted on X_train, reproducible across kernel restarts.
X_train, X_test = train_test_split(transcripts_df, test_size=0.3, random_state=42)

In [15]:
# Report the (rows, columns) shape of each split.
train_shape, test_shape = X_train.shape, X_test.shape
print(f"Train set size: {train_shape}")
print(f"Test set size: {test_shape}")

Train set size: (89, 3)
Test set size: (39, 3)


---
### LDA

In [16]:
# English stop-word list and the training transcripts used to fit the vectorizer.
# Note that `transcripts` now refers to the training split only.
stop_words = get_stop_words('english')
transcripts = X_train['video_transcript'].to_list()

In [17]:
def tokenize_and_lemmatize(transcript):
    """Tokenize a transcript and return the lemmas of its longer tokens.

    Tokens produced by `word_tokenize` are kept only when they are longer
    than three characters; the survivors are lower-cased and passed through
    a `WordNetLemmatizer`.

    Parameters
    ----------
    transcript : str
        Text of a single transcript.

    Returns
    -------
    list of str
        Lemmatized, lower-cased tokens in their original order.
    """
    long_tokens = [raw.lower() for raw in word_tokenize(transcript) if len(raw) > 3]
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(tok) for tok in long_tokens]

In [18]:
# Bag-of-words vectorizer that uses the custom tokenizer/lemmatizer.
# - token_pattern=None: the default regex is unused once `tokenizer` is given;
#   passing None states that explicitly and avoids scikit-learn's warning.
# - max_df=0.85 drops terms that appear in more than 85% of the documents.
# - min_df=20 drops terms that appear in fewer than 20 documents.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=tokenize_and_lemmatize,
    token_pattern=None,
    stop_words=stop_words,
    max_df=0.85,
    min_df=20
)

In [19]:
# Learn the vocabulary on the training transcripts and build the sparse
# document-term count matrix, then view it as a dense, labelled frame.
matrix = vectorizer.fit_transform(transcripts)
dense_counts = matrix.toarray()
feature_names = vectorizer.get_feature_names_out()
matrix_df = pd.DataFrame(dense_counts, columns=feature_names)
matrix_df



Unnamed: 0,actually,already,also,another,around,back,believe,better,blood,body,...,want,week,well,whether,will,work,working,world,yeah,year
0,0,0,0,0,0,1,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1
1,1,0,5,2,0,0,0,2,4,1,...,1,3,2,0,0,2,0,0,0,2
2,1,0,2,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,3,0,3,3,1,3,0,0,0,0,...,1,0,0,1,0,5,1,2,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,4,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,6,5,7,6,10,4,2,1,1,3,...,7,6,6,3,5,7,3,6,3,5
85,5,2,5,1,1,1,0,1,0,3,...,0,2,4,1,4,1,2,0,0,6
86,2,1,2,1,0,4,0,1,0,0,...,1,2,4,1,3,1,0,1,0,1
87,0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,1


In [20]:
# Terms kept by the vectorizer, as an array so they can be looked up by column index.
vocabulary = np.array(vectorizer.get_feature_names_out())
vocabulary

array(['actually', 'already', 'also', 'another', 'around', 'back',
       'believe', 'better', 'blood', 'body', 'called', 'care', 'case',
       'cause', 'come', 'coming', 'country', 'course', 'covid', 'covid19',
       'data', 'day', 'different', 'doctor', 'doe', 'done', 'effect',
       'even', 'every', 'fact', 'feel', 'find', 'first', 'five', 'found',
       'getting', 'give', 'given', 'going', 'good', 'group', 'health',
       'heart', 'help', 'higher', 'important', 'infection', 'information',
       'just', 'keep', 'kind', 'know', 'last', 'life', 'like', 'little',
       'long', 'look', 'made', 'make', 'many', 'mean', 'medical', 'might',
       'million', 'month', 'mrna', 'much', 'need', 'never', 'news',
       'next', 'number', 'okay', 'pandemic', 'patient', 'people',
       'percent', 'point', 'problem', 'question', 'really', 'report',
       'research', 'right', 'risk', 'safe', 'said', 'say', 'saying',
       'second', 'seen', 'shot', 'show', 'side', 'small', 'someone',
       

In [21]:
# 10-topic LDA model. The fit is stochastic, so a fixed random_state is needed
# for the discovered topics to be reproducible between runs.
lda = decomposition.LatentDirichletAllocation(n_components=10, max_iter=50, random_state=42)

In [22]:
# Fit LDA on the count matrix; `doc_vectors` has one row per training document
# and one column per topic.
doc_vectors = lda.fit_transform(matrix)
# Fitted topic-by-term weights (one row per topic, one column per vocabulary term).
h1 = lda.components_

In [23]:
# Topic weights of the first three training documents.
doc_vectors[0:3]

array([[1.04172058e-01, 4.16796446e-03, 4.16775862e-03, 4.16689572e-03,
        8.33664640e-02, 5.11114395e-01, 4.16886233e-03, 4.16808168e-03,
        4.17087207e-03, 2.76336648e-01],
       [4.87961546e-04, 4.87972837e-04, 4.87919525e-04, 1.13521218e-01,
        4.87953393e-04, 4.87950257e-04, 4.87926588e-04, 2.84775087e-01,
        3.26457381e-02, 5.66130272e-01],
       [5.90743576e-01, 2.55075962e-01, 3.57282026e-03, 3.57260971e-03,
        3.57151631e-03, 3.57284318e-03, 1.29173386e-01, 3.57255806e-03,
        3.57236188e-03, 3.57236683e-03]])

In [24]:
# Term weights of the first topic.
h1[0]

array([12.25482326,  0.10001946, 15.01137852,  0.1000068 , 18.97997304,
       19.2593669 ,  0.10001169,  0.10000407,  0.10002454, 44.16704374,
        0.10000218,  0.10000787,  0.10000615,  0.10000457,  0.10001185,
        0.100007  ,  0.10000832,  0.10000729, 24.93964483, 23.04548946,
        0.10000481, 12.24875066,  0.10000456, 18.62942298,  0.10000563,
        0.10001149,  0.10000974,  0.10000538,  0.10001003,  0.10000488,
       26.58916879,  0.10000787, 27.41253437, 60.93855093,  0.10000636,
       20.21591019, 14.15231561,  0.10000956,  0.10000914,  0.10001287,
        0.10000361, 44.83568776,  0.1000075 , 42.13576707,  0.10000373,
        0.10000414,  0.10001001,  9.90572737, 65.89373249,  0.10004845,
       15.88604088, 22.36817876,  0.10000801, 10.60929644,  0.10000837,
        0.10000755,  0.10001032, 13.61248922,  0.1000042 , 21.68981437,
        0.10001482,  0.10000721, 15.16802528,  0.10000836,  0.10000237,
        0.10001079,  0.10000335,  0.10000618, 25.11296364,  0.10

In [25]:
# Code taken from the guide
num_words = 15


def top_words(t):
    """Return the `num_words` vocabulary terms with the largest weights in `t`.

    `t` is one topic's weight vector, aligned with `vocabulary`; the terms are
    returned from the heaviest weight downwards.
    """
    heaviest_first = np.argsort(t)[::-1]
    return [vocabulary[idx] for idx in heaviest_first[:num_words]]


# One list of top terms per topic, then one space-joined string per topic.
topic_words = [top_words(weights) for weights in h1]
topics = [' '.join(words) for words in topic_words]

In [26]:
# One space-joined string of top terms per LDA topic.
topics

['shot just will five right health body state help thing news number three first feel',
 'vaccinated data people risk group might percent just look vaccination time effect actually study side',
 'health question many risk care study report also covid19 doe say just mrna getting different',
 'like know think yeah getting just feel right time really doe going day kind said',
 'blood heart research small side seen made said last another pandemic month country also trying',
 'covid know people risk study like million data really thing right vaccination kind number okay',
 'people long think virus thing patient different well really going work just know like infection',
 'people know just like going will thing year want well think covid said time right',
 'mrna virus body system make shot also covid19 called around second important work information start',
 'people covid19 vaccination risk case cause will covid also doe video medical blood different made']

---
### Top2Vec

In [31]:
from top2vec import Top2Vec

In [32]:
# Full cleaned corpus (all videos, not only the training split).
transcripts_df

Unnamed: 0,video_id,video_title,video_transcript
0,im3otpqYAiQ,Covid Vaccine Study Finds Links to Health Cond...,In terms of how widespread the adverse event...
1,SkcAZfrYYXM,Two very rare COVID vaccine side effects detec...,okay we're going to finish with the guardian ...
2,7MAlEYqWUTk,Being Nice to Anti-Vaxxers,so you're against the COVID vaccine oh here w...
3,uiwjAj0zfKQ,If You Get All 5 COVID Vaccines,and all right we're done now if you're feelin...
4,jPs4_MeuX7U,New Covid vaccine study links jab to heart and...,a latest COVID19 study is providing answers t...
...,...,...,...
123,o2HLkFi4Qtw,5 things NOT TO DO after getting the COVID-19 ...,HAPPENING HAPPENING ACROSS HAPPENING ACROSS T...
124,o-yTrL5aszM,#shorts - COVID Vaccine &amp; Green Card appli...,so what's the current update on COVID19 vacci...
125,nhb1zIYXUP8,Nurse faints after getting COVID vaccine,meanwhile we're seeing different reactions to...
126,It7VNzhAqOs,Blood Clots after COVID Vaccine,"Hello, welcome to my channel Medicine with D..."


In [None]:
# pip install tensorflow tensorflow_hub
# Fit Top2Vec on every cleaned transcript in `transcripts_df` (the whole corpus,
# not just X_train).
t2v_model = Top2Vec(transcripts_df["video_transcript"].tolist())

---
### BERTopic

In [5]:
# NOTE(review): the recorded run of this cell failed with an ImportError raised
# from the `transformers` package while importing bertopic. This is presumably a
# version mismatch between the installed bertopic and transformers packages;
# confirm and pin compatible versions.
from bertopic import BERTopic
# BERTopic model with default settings; it is not fitted in the visible cells.
topic_model = BERTopic()

ImportError: cannot import name 'is_nltk_available' from 'transformers.utils.import_utils' (C:\Users\geloa\anaconda3\Lib\site-packages\transformers\utils\import_utils.py)