Guide for training this model was a video by Srinivasan, S. (2020)<br>
Link: https://www.youtube.com/watch?v=25JOEnrz40c&list=PL0rtpP-8GFfR2orPIzBttl15_NfDhkujw&index=4&t=518s

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import regex as re

from time import time
from time import strftime
from time import gmtime

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition
from stop_words import get_stop_words
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

In [3]:
filename = "covid_vaccine"

In [4]:
path = "../datasets/covid_vaccine/covid_vaccine.csv"
df = pd.read_csv(path).drop("Unnamed: 0", axis=1)[["video_id", "video_title", "video_transcript"]].astype(str)
df.head()

Unnamed: 0,video_id,video_title,video_transcript
0,im3otpqYAiQ,Covid Vaccine Study Finds Links to Health Cond...,[CC may contain inaccuracies] In terms of how...
1,SkcAZfrYYXM,Two very rare COVID vaccine side effects detec...,okay we're going to finish with the guardian ...
2,7MAlEYqWUTk,Being Nice to Anti-Vaxxers,so you're against the covert vaccine oh here ...
3,uiwjAj0zfKQ,If You Get All 5 COVID Vaccines,and all right we're done now if you're feelin...
4,jPs4_MeuX7U,New Covid vaccine study links jab to heart and...,a latest covid-19 study is providing answers ...


In [5]:
patterns = []
replacements = []

# [0] Removing occurances of \xa0 and \n
patterns.append('(\\xa0|\\n)')
replacements.append(' ')

# [1] Removing text enclosed in brackets
patterns.append('\[(\w|\s)+\]')
replacements.append('')

# [2] Replacing stray '000's to 'thousand'
patterns.append('(?<=\s)000(?=\s)')
replacements.append('thousand')

# [3, 4] Mistranscriptions of the word 'COVID'
patterns.append('(?<=\s)(C|c)o(ve(r)?t|id)(?=\s)')
patterns.append('(C|c)overed(?=\s(vacc|infe))')
replacements.append('COVID')
replacements.append('COVID')

# [5] Mistranscriptions of the word 'COVID-19'
patterns.append('(?<=\s)(C|c)(oveted|o9|o\s19)(?=\s)')
replacements.append('COVID19')

# [6] Replacing '%' with the word 'percent'
patterns.append('(?<=\d)\%')
replacements.append(' percent')

# [7] Removing 'Speaker %d:' occurances
patterns.append('Speaker\s\d\:')
replacements.append('')

# [8] Removing '[\xa0__\xa0]'
patterns.append('\[\\xa0\_\_\\xa0\]')
replacements.append('')

# [9] Removing >> occurances
patterns.append('\>\>(\>+)?')
replacements.append('')

# [10] Removing 'Reporter:' occurances
patterns.append('Reporter\:')
replacements.append('')

# [11] Removing weird +@ occurances
patterns.append('\+\@')
replacements.append('')

# [12] Removing stray - occurances
patterns.append('(?<=\s)\-(\-+)?(?=\s)')
replacements.append('')

# [13] Removing text within parentheses
patterns.append('\((\w|\s)+\)')
replacements.append('')

# [14] Combining stray instances of '19' with the word 'covid' if it exists next to it
patterns.append('(covid|COVID)(\s|-)?19')
replacements.append('COVID19')

In [6]:
transcripts = df["video_transcript"].tolist()
cleaned = []
len(transcripts)

150

In [7]:
for transcript in transcripts:
    result = re.sub(patterns[0], replacements[0], transcript)
    
    for i in range(1, len(patterns)):
        result = re.sub(patterns[i], replacements[i], result)
    
    cleaned.append(result)

In [8]:
len(cleaned)

150

In [9]:
transcripts_df = pd.DataFrame(
    {
        'video_id': df["video_id"].tolist(),
        'video_title': df["video_title"].tolist(),
        'video_transcript': cleaned
    }
)
transcripts_df.head()

Unnamed: 0,video_id,video_title,video_transcript
0,im3otpqYAiQ,Covid Vaccine Study Finds Links to Health Cond...,In terms of how widespread the adverse event...
1,SkcAZfrYYXM,Two very rare COVID vaccine side effects detec...,okay we're going to finish with the guardian ...
2,7MAlEYqWUTk,Being Nice to Anti-Vaxxers,so you're against the COVID vaccine oh here w...
3,uiwjAj0zfKQ,If You Get All 5 COVID Vaccines,and all right we're done now if you're feelin...
4,jPs4_MeuX7U,New Covid vaccine study links jab to heart and...,a latest COVID19 study is providing answers t...


In [10]:
X_train, X_test = train_test_split(transcripts_df, test_size=0.3)

In [11]:
print(f"Train set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

Train set size: (105, 3)
Test set size: (45, 3)


In [12]:
transcripts = X_train['video_transcript'].tolist()
stop_words = get_stop_words('english')

In [13]:
def tokenize_and_lemmatize(transcript):
    tokens = [word.lower() for word in word_tokenize(transcript) if len(word) > 3]
    wnl = WordNetLemmatizer()
    lemmas = []
    for token in tokens:
        lemmas.append(wnl.lemmatize(token))
    
    return lemmas

In [14]:
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=tokenize_and_lemmatize,
    stop_words=stop_words,
    max_df=0.85,
    min_df=20
)

In [15]:
matrix = vectorizer.fit_transform(transcripts)
matrix_df = pd.DataFrame(data=matrix.toarray(), columns = vectorizer.get_feature_names_out())
matrix_df



Unnamed: 0,able,actually,already,also,always,american,another,anything,around,available,...,well,whether,whole,will,without,work,working,world,yeah,year
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,2,1,0,0,1,0,0,...,3,3,0,0,1,0,0,3,0,1
2,0,0,0,0,0,3,3,3,0,0,...,3,3,6,3,0,3,3,3,0,6
3,0,1,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,1
4,0,3,0,3,0,2,2,2,0,0,...,3,1,2,4,0,2,1,2,11,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,5,0,0
101,0,0,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
102,2,0,3,7,0,0,0,0,0,3,...,3,1,0,9,3,0,0,0,1,4
103,0,0,0,0,0,0,0,0,2,0,...,0,0,0,1,0,0,0,0,0,4


In [16]:
vocabulary = np.array(vectorizer.get_feature_names_out())
vocabulary

array(['able', 'actually', 'already', 'also', 'always', 'american',
       'another', 'anything', 'around', 'available', 'back', 'believe',
       'better', 'blood', 'body', 'called', 'came', 'care', 'case',
       'cause', 'clear', 'come', 'coming', 'country', 'couple', 'course',
       'covid', 'covid19', 'data', 'day', 'death', 'different', 'disease',
       'doctor', 'doe', 'done', 'effect', 'enough', 'even', 'every',
       'everyone', 'everything', 'fact', 'find', 'first', 'found',
       'getting', 'give', 'given', 'going', 'good', 'government', 'group',
       'happen', 'happening', 'hard', 'health', 'heart', 'help', 'high',
       'higher', 'immune', 'immunity', 'important', 'infection',
       'information', 'issue', 'just', 'kind', 'know', 'last', 'le',
       'least', 'level', 'life', 'like', 'little', 'long', 'look',
       'looking', 'made', 'make', 'making', 'many', 'mean', 'medical',
       'might', 'million', 'month', 'mrna', 'much', 'need', 'never',
       'news', 'nu

In [17]:
lda = decomposition.LatentDirichletAllocation(n_components=10, max_iter=50)

In [18]:
doc_vectors = lda.fit_transform(matrix)
h1 = lda.components_

In [19]:
doc_vectors[0:3]

array([[1.00000000e-01, 1.00000000e-01, 1.00000000e-01, 1.00000000e-01,
        1.00000000e-01, 1.00000000e-01, 1.00000000e-01, 1.00000000e-01,
        1.00000000e-01, 1.00000000e-01],
       [4.29787077e-01, 5.61945390e-04, 1.36888902e-02, 5.52590357e-01,
        5.61960443e-04, 5.61948062e-04, 5.61942326e-04, 5.61937476e-04,
        5.61982641e-04, 5.61959521e-04],
       [9.96750237e-01, 3.61077349e-04, 3.61037889e-04, 3.61120818e-04,
        3.61088835e-04, 3.61064687e-04, 3.61070571e-04, 3.61090134e-04,
        3.61105631e-04, 3.61106697e-04]])

In [20]:
h1[0]

array([3.49161022e+01, 4.75836589e+01, 2.10824637e+01, 3.44767182e+01,
       1.71272088e+01, 8.01524933e+01, 2.21263321e+01, 3.63091243e+01,
       1.07318002e+01, 2.81404647e+00, 3.09810702e+01, 4.39417461e+01,
       2.72761135e+01, 1.35909673e+00, 1.00016813e-01, 3.18827658e+01,
       2.10487951e+01, 3.74377164e+01, 3.25754142e+01, 2.28557921e+01,
       2.78856742e+01, 4.08379361e+01, 1.00038858e-01, 9.18625755e+01,
       2.49345850e+01, 3.39248714e+01, 5.98077777e+01, 1.49019496e+01,
       7.58993501e+01, 3.08325136e+01, 4.99211866e+01, 1.00023219e-01,
       1.00024781e-01, 3.29739325e+01, 4.53830199e+01, 2.10110894e+01,
       6.93193350e+00, 1.11607083e+01, 8.04634108e+01, 3.68255627e+01,
       1.42056184e+01, 2.86769643e+01, 4.28754252e+01, 1.36221024e+01,
       4.47863029e+01, 1.00031552e-01, 3.50975221e+01, 3.01596392e+01,
       1.54904236e+01, 1.24991125e+02, 3.06841590e+01, 6.70165164e+01,
       1.52229211e+01, 2.80128259e+01, 3.21299777e+01, 2.69146088e+01,
      

In [21]:
# Code taken from the guide
num_words = 15
top_words = lambda t: [vocabulary[i] for i in np.argsort(t)[:-num_words-1:-1]]
topic_words = ([top_words(t) for t in h1])
topics = [' '.join(t) for t in topic_words]

In [22]:
topics

['people know vaccine just will like thing going want well country right said even american',
 'health news state vaccine just doctor report part will came give actually also medical right',
 'blood vaccination vaccine might doe need three second first week research cause important always number',
 'vaccine people risk covid vaccinated study covid19 effect vaccination data death case know million side',
 'health question many just care vaccine year country something covid right know american percent public',
 'vaccine mrna protein virus body system immune covid19 make pandemic like work around also will',
 'long people year think patient going just virus well covid different like many really thing',
 'vaccine protein people know covid19 study virus look just thing actually data going time death',
 'know like people think yeah just covid kind going getting right really time make still',
 'shot might also video second first vaccination data month well information right percent protein re

---