In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import regex as re

from time import time
from time import strftime
from time import gmtime

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split

In [3]:
filename = "covid_vaccine"

In [4]:
path = "../datasets/covid_vaccine/covid_vaccine.csv"
df = pd.read_csv(path).drop("Unnamed: 0", axis=1)[["video_id", "video_title", "video_transcript"]].astype(str)
df.head()

Unnamed: 0,video_id,video_title,video_transcript
0,im3otpqYAiQ,Covid Vaccine Study Finds Links to Health Cond...,[CC may contain inaccuracies] In terms of how...
1,SkcAZfrYYXM,Two very rare COVID vaccine side effects detec...,okay we're going to finish with the guardian ...
2,7MAlEYqWUTk,Being Nice to Anti-Vaxxers,so you're against the covert vaccine oh here ...
3,uiwjAj0zfKQ,If You Get All 5 COVID Vaccines,and all right we're done now if you're feelin...
4,jPs4_MeuX7U,New Covid vaccine study links jab to heart and...,a latest covid-19 study is providing answers ...


In [5]:
patterns = []

# [0] Removing occurances of \xa0 and \n
patterns.append('(\\xa0|\\n)')

# [1] Removing text enclosed in brackets
patterns.append('\[(\w|\s)+\]')

# [2] Replacing stray '000's to 'thousand'
patterns.append('(?<=\s)000(?=\s)')

# [3, 4] Mistranscriptions of the word 'COVID'
patterns.append('(?<=\s)(C|c)o(ve(r)?t|id)(?=\s)')
patterns.append('(C|c)overed(?=\s(vacc|infe))')

# [5] Mistranscriptions of the word 'COVID-19'
patterns.append('(?<=\s)(C|c)(oveted|o9|o\s19)(?=\s)')

# [6] Replacing '%' with the word 'percent'
patterns.append('(?<=\d)\%')

# [7] Removing 'Speaker %d:' occurances
patterns.append('Speaker\s\d\:')

# [8] Removing '[\xa0__\xa0]'
patterns.append('\[\\xa0\_\_\\xa0\]')

# [9] Removing >> occurances
patterns.append('\>\>(\>+)?')

# [10] Removing 'Reporter:' occurances
patterns.append('Reporter\:')

# [11] Removing weird +@ occurances
patterns.append('\+\@')

# [12] Removing stray - occurances
patterns.append('(?<=\s)\-(\-+)?(?=\s)')

# [13] Removing text within parentheses
patterns.append('\((\w|\s)+\)')

In [6]:
transcripts = df["video_transcript"].tolist()
cleaned = []
len(transcripts)

150

In [7]:
for transcript in transcripts:
    result = re.sub(patterns[0], ' ', transcript)
    result = re.sub(patterns[1], '', result)
    result = re.sub(patterns[2], 'thousand', result)
    result = re.sub(patterns[3], 'COVID', result)
    result = re.sub(patterns[4], 'COVID', result)
    result = re.sub(patterns[5], 'COVID-19', result)
    result = re.sub(patterns[6], ' percent', result)
    result = re.sub(patterns[7], '', result)
    result = re.sub(patterns[8], '', result)
    result = re.sub(patterns[9], '', result)
    result = re.sub(patterns[10], '', result)
    result = re.sub(patterns[11], '', result)
    result = re.sub(patterns[12], '', result)
    result = re.sub(patterns[13], '', result)
    
    cleaned.append(result)

In [8]:
len(cleaned)

150

In [9]:
transcripts_df = pd.DataFrame(
    {
        'video_id': df["video_id"].tolist(),
        'video_title': df["video_title"].tolist(),
        'video_transcript': cleaned
    }
)
transcripts_df.head()

Unnamed: 0,video_id,video_title,video_transcript
0,im3otpqYAiQ,Covid Vaccine Study Finds Links to Health Cond...,In terms of how widespread the adverse event...
1,SkcAZfrYYXM,Two very rare COVID vaccine side effects detec...,okay we're going to finish with the guardian ...
2,7MAlEYqWUTk,Being Nice to Anti-Vaxxers,so you're against the COVID vaccine oh here w...
3,uiwjAj0zfKQ,If You Get All 5 COVID Vaccines,and all right we're done now if you're feelin...
4,jPs4_MeuX7U,New Covid vaccine study links jab to heart and...,a latest covid-19 study is providing answers ...


In [10]:
X_train, X_test = train_test_split(transcripts_df, test_size=0.6)

In [11]:
stemmer = SnowballStemmer("english")
stemmer

<nltk.stem.snowball.SnowballStemmer at 0x1ba77b024d0>

In [15]:
tokenized = []
transcripts = X_train['video_transcript'].tolist()
stop_words = set(stopwords.words('english'))

for transcript in transcripts:
    tokens = word_tokenize(transcript)
    filtered = [word.lower() for word in tokens if not word.lower() in stop_words]
    # stems = [stemmer.stem(word) for word in filtered]
    print(filtered)

['dozens', 'dozens', 'vaccines', 'available', 'last', 'many', 'decades', 'seen', 'absolutely', 'long-term', 'side', 'effects', 'vaccines', "'re", 'year', 'first', 'people', 'got', 'vaccinated', 'covid', 'still', 'seen', 'long-term', 'effects', 'real', 'risks', "'ve", 'hearing', 'inflammation', 'heart', 'blood', 'clots', 'possible', 'happened', 'tiny', 'fraction', 'people', 'gotten', 'covid', 'vaccine', "'re", 'talking', 'maybe', 'five', 'per', 'million', 'look', 'risk', 'kovid', 'risks', 'dying', 'much', 'greater', 'tiny', 'risk', 'vaccine', 'certainly', 'something', 'want', 'overcome', 'get', 'vaccine', 'keep', 'safe', 'covenant']
['revamped', 'hip-hop', 'song', 'like', 'vax', 'thing', 'one', 'many', 'ways', 'leaders', 'work', 'get', 'americans', 'vaccinated', 'covid-19', 'pandemic', 'today', 'energy', 'waned', 'news', 'negative', 'side', 'effects', 'including', 'long', 'vac', 'fatigue', 'post', 'exertion', 'malays', 'brain', 'fog', "'s", 'whole', 'side', 'list', 'symptoms', 'largest'