The guide followed for training the LDA model is a video by Srinivasan, S. (2020).<br>
Link: https://www.youtube.com/watch?v=25JOEnrz40c&list=PL0rtpP-8GFfR2orPIzBttl15_NfDhkujw&index=4&t=518s

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import regex as re

from time import time
from time import strftime
from time import gmtime

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition
from stop_words import get_stop_words
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

---
### Loading the data

In [3]:
# Name of the dataset; it matches the dataset folder used in the CSV path below.
# NOTE(review): not referenced by any other cell visible here — confirm it is still needed.
filename = "covid_vaccine"

In [4]:
# Read the scraped videos, discard the saved index column and keep only the
# id, title and transcript. Everything is cast to str, so a missing
# transcript becomes the literal string 'nan'.
path = "../datasets/covid_vaccine/videos.csv"
wanted_columns = ["video_id", "video_title", "video_transcript"]
df = pd.read_csv(path).drop(columns="Unnamed: 0")[wanted_columns].astype(str)
df.head()

Unnamed: 0,video_id,video_title,video_transcript
0,2IXl4qJGrRk,A man deliberately got 217 Covid shots. Here’s...,A German man has puzzled scientists after he ...
1,HtTalpY-J-M,COVID: German man vaccinated 217 times had no ...,a 62-year-old German man from magur claims he...
2,jPs4_MeuX7U,New Covid vaccine study links jab to heart and...,a latest covid-19 study is providing answers ...
3,WhiBpmH1mE4,MAN GETS 217 COVID VACCINES! 😱😱😱 THIS is What ...,a 62-year-old man who lives in Germany uh got...
4,LfmhYVCCGhc,Joe Rogan says tons of people &quot;died sudde...,Speaker 1: This is really painful to watch. O...


---
### Cleaning the data

In [5]:
# Indices of nan transcripts
drop_indices = df[["video_id", "video_transcript"]].loc[df["video_transcript"] == 'nan'].index
drop_indices

Index([ 14,  15,  18,  20,  24,  26,  40,  59,  64,  71,  72,  80,  81,  96,
        99, 101, 109, 111, 114, 124, 125, 138, 158, 159, 184, 190],
      dtype='int64')

In [6]:
# Remove the rows that have no transcript; this mutates df in place.
df.drop(drop_indices, inplace=True)

In [7]:
# Ordered transcript-cleaning rules as (regex pattern, replacement) pairs.
# The rules are meant to be applied one after another, so their order matters.
# Patterns are raw strings so that regex escapes such as \s or \[ are not
# treated as (invalid) Python string escapes.
cleaning_rules = [
    # [0] Replacing occurrences of \xa0 and \n with a space
    (r'(\xa0|\n)', ' '),

    # [1] Removing text enclosed in brackets
    (r'\[(\w|\s)+\]', ''),

    # [2] Replacing stray '000's with 'thousand'
    (r'(?<=\s)000(?=\s)', 'thousand'),

    # [3, 4] Mistranscriptions of the word 'COVID'
    (r'(?<=\s)(C|c)o(ve(r)?t|id)(?=\s)', 'COVID'),
    (r'(C|c)overed(?=\s(vacc|infe))', 'COVID'),

    # [5] Mistranscriptions of the word 'COVID-19'
    (r'(?<=\s)(C|c)(oveted|o9|o\s19)(?=\s)', 'COVID19'),

    # [6] Replacing '%' with the word 'percent'
    (r'(?<=\d)\%', ' percent'),

    # [7] Removing 'Speaker %d:' occurrences
    (r'Speaker\s\d\:', ''),

    # [8] Removing '[\xa0__\xa0]'
    # NOTE(review): when the rules run in order, rule [0] has already turned
    # every \xa0 into a space, so this pattern can no longer match; it is kept
    # to preserve the original rule list.
    (r'\[\xa0\_\_\xa0\]', ''),

    # [9] Removing >> occurrences
    (r'\>\>(\>+)?', ''),

    # [10] Removing 'Reporter:' occurrences
    (r'Reporter\:', ''),

    # [11] Removing weird +@ occurrences
    (r'\+\@', ''),

    # [12] Removing stray - occurrences
    (r'(?<=\s)\-(\-+)?(?=\s)', ''),

    # [13] Removing text within parentheses
    (r'\((\w|\s)+\)', ''),

    # [14] Combining stray instances of '19' with the word 'covid' if it exists next to it
    (r'(covid|COVID)(\s|-)?19', 'COVID19'),
]

# Parallel lists derived from the pairs above: patterns[i] is replaced by replacements[i].
patterns = [pattern for pattern, _ in cleaning_rules]
replacements = [replacement for _, replacement in cleaning_rules]

In [8]:
# Raw transcripts to clean, and the list that will hold their cleaned versions.
transcripts = df["video_transcript"].tolist()
cleaned = []
# Number of transcripts left after the rows without a transcript were dropped.
len(transcripts)

174

In [9]:
def clean_transcript(text):
    """Apply every cleaning rule to *text*, in order, and return the result.

    Rule i replaces matches of patterns[i] with replacements[i]; each rule
    works on the output of the previous one.
    """
    for pattern, replacement in zip(patterns, replacements):
        text = re.sub(pattern, replacement, text)
    return text


# Build the list from scratch instead of appending to an existing one, so that
# re-running this cell cannot leave duplicate entries in `cleaned`.
cleaned = [clean_transcript(transcript) for transcript in transcripts]

In [10]:
# Sanity check: should equal the number of input transcripts.
len(cleaned)

174

In [11]:
# New frame with a fresh index: ids and titles come from df, the transcript
# column is replaced by the cleaned text.
transcripts_df = pd.DataFrame(
    dict(
        video_id=df["video_id"].tolist(),
        video_title=df["video_title"].tolist(),
        video_transcript=cleaned,
    )
)
transcripts_df.head()

Unnamed: 0,video_id,video_title,video_transcript
0,2IXl4qJGrRk,A man deliberately got 217 Covid shots. Here’s...,A German man has puzzled scientists after he ...
1,HtTalpY-J-M,COVID: German man vaccinated 217 times had no ...,a 62-year-old German man from magur claims he...
2,jPs4_MeuX7U,New Covid vaccine study links jab to heart and...,a latest COVID19 study is providing answers t...
3,WhiBpmH1mE4,MAN GETS 217 COVID VACCINES! 😱😱😱 THIS is What ...,a 62-year-old man who lives in Germany uh got...
4,LfmhYVCCGhc,Joe Rogan says tons of people &quot;died sudde...,This is really painful to watch. On the bon...


In [12]:
# True if any cell of the cleaned frame is null.
transcripts_df.isnull().values.any()

False

In [13]:
# Cleaned transcripts as a plain list (rebinds `transcripts`), plus the
# English stop-word list provided by the stop_words package.
transcripts = transcripts_df['video_transcript'].tolist()
stop_words = get_stop_words('english')

In [14]:
def tokenize_and_lemmatize(transcript):
    """Split *transcript* into tokens and return their lemmas as a list.

    Tokens of three characters or fewer are discarded; the remaining tokens
    are lower-cased before being passed to the WordNet lemmatizer.
    """
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(word.lower())
        for word in word_tokenize(transcript)
        if len(word) > 3
    ]

In [17]:
# Word-count vectorizer that tokenizes with tokenize_and_lemmatize and drops
# the English stop words.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=tokenize_and_lemmatize,
    # A custom tokenizer replaces the default regex tokenization. Setting
    # token_pattern to None states that explicitly and avoids scikit-learn's
    # "token_pattern will not be used" warning.
    token_pattern=None,
    stop_words=stop_words,
)

---
### LDA

In [None]:
# Fit the vectorizer on the transcripts and build the document-term count matrix.
matrix = vectorizer.fit_transform(transcripts)
# Dense copy of the matrix with one column per vocabulary term, for inspection.
matrix_df = pd.DataFrame(data=matrix.toarray(), columns = vectorizer.get_feature_names_out())
matrix_df

In [None]:
# Vocabulary terms as an array, in the same order as the matrix columns.
vocabulary = np.array(vectorizer.get_feature_names_out())
vocabulary

In [None]:
# LDA model with 10 topics and at most 50 iterations. The seed is fixed so
# that re-running the notebook yields the same topics.
lda = decomposition.LatentDirichletAllocation(
    n_components=10,
    max_iter=50,
    random_state=42,
)

In [None]:
# Fit LDA on the count matrix. doc_vectors is the per-document output of
# fit_transform; h1 is the fitted components_ array, which the cells below
# read one row per topic, with one weight per vocabulary term.
doc_vectors = lda.fit_transform(matrix)
h1 = lda.components_

In [None]:
# Peek at the first three rows of doc_vectors.
doc_vectors[0:3]

In [None]:
# First row of h1.
h1[0]

In [None]:
# Code taken from the guide
num_words = 15


def top_words(t):
    """Return the `num_words` highest-weighted vocabulary terms of row *t*, best first."""
    ranked = np.argsort(t)[::-1]
    return [vocabulary[i] for i in ranked[:num_words]]


# One list of top terms per row of h1, then each list joined into one string.
topic_words = [top_words(row) for row in h1]
topics = [' '.join(words) for words in topic_words]

In [None]:
# Show the space-joined top words of each LDA topic.
topics

---
### BERTopic

In [20]:
from bertopic import BERTopic
# Alternative kept for reference: load a saved model instead of fitting a new one.
# topic_model = BERTopic.load("MaartenGr/BERTopic_Wikipedia")
# Fit BERTopic on the cleaned transcripts, reusing the CountVectorizer defined above.
# NOTE(review): this rebinds `topics`, which the LDA section also assigns.
topic_model = BERTopic(vectorizer_model=vectorizer)
topics, probs = topic_model.fit_transform(transcripts)

In [25]:
# Summary table of the fitted topics.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,70,-1_vaccine_know_people_dont,"[vaccine, know, people, dont, just, shot, that...",[ well a warm welcome to this talk and a parti...
1,0,48,0_vaccine_people_covid_know,"[vaccine, people, covid, know, just, like, tha...",[ We’re entering the third year of the pandemi...
2,1,34,1_vaccine_people_will_like,"[vaccine, people, will, like, virus, work, fir...",[ we're going to bring you right up to date wi...
3,2,22,2_vaccine_heart_patient_protein,"[vaccine, heart, patient, protein, spike, myoc...",[ hey guys welcome to another video yesterday ...


In [23]:
# Topic id assigned to each of the first ten transcripts.
topics[0:10]

[-1, -1, 2, -1, 0, -1, -1, 2, 2, 0]

In [24]:
# Matching probs values for the first ten transcripts.
probs[0:10]

array([0.        , 0.        , 1.        , 0.        , 1.        ,
       0.        , 0.        , 1.        , 1.        , 0.87745557])

---