In [3]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import regex as re

In [4]:
# Short name of the dataset being processed.
# NOTE(review): no visible cell reads this variable; the CSV path is spelled out separately — confirm it is still needed.
filename = "covid_vaccine"

In [27]:
path = "../datasets/covid_vaccine/covid_vaccine.csv"

# Only these columns are used by the analysis. Asking the parser for them
# directly skips the exported index column ("Unnamed: 0") without requiring
# it to exist, so the load no longer fails on a CSV saved without an index.
transcript_columns = ["video_id", "video_title", "video_transcript"]

# `usecols` keeps the file's own column order, so re-select to pin the order.
df = pd.read_csv(path, usecols=transcript_columns)[transcript_columns]
df.head()

Unnamed: 0,video_id,video_title,video_transcript
0,im3otpqYAiQ,Covid Vaccine Study Finds Links to Health Cond...,[CC may contain inaccuracies] In terms of how...
1,uiwjAj0zfKQ,If You Get All 5 COVID Vaccines,and all right we're done now if you're feelin...
2,LfmhYVCCGhc,Joe Rogan says tons of people &quot;died sudde...,Speaker 1: This is really painful to watch. O...
3,SU_SSfiYtfM,New COVID booster coming soon,what are those our new fall arsenal of vaccin...
4,7MAlEYqWUTk,Being Nice to Anti-Vaxxers,so you're against the covert vaccine oh here ...


In [28]:
# Regular expressions describing noise in the transcripts, in the order they
# were collected. Only the patterns live here; any replacement text mentioned
# in the comments below is not stored in this list.
patterns = [
    # Non-breaking spaces (\xa0) and newline characters.
    r"\xa0|\n",
    # One bracketed word such as "[Music]". Multi-word tags (anything with a
    # space inside the brackets) are not matched by this pattern.
    r"\[\w+\]",
    # A stray "000" standing between whitespace (intended to read "thousands").
    r"(?<=\s)000(?=\s)",
    # Mistranscriptions of "COVID": "covet", "covert" or "coid" between whitespace.
    r"(?<=\s)co(ve(r)?t|id)(?=\s)",
    # Mistranscription of "COVID-19": "coveted" between whitespace.
    r"(?<=\s)coveted(?=\s)",
    # A backslash that sits directly in front of an apostrophe (escaped apostrophes).
    r"\\(?=\')",
    # A percent sign that directly follows a digit (intended to read "percent").
    r"(?<=\d)\%",
    # Speaker labels of the form "Speaker <digit>:".
    r"Speaker\s\d\:",
    # The literal text "[\xa0__\xa0]", where "\xa0" is spelled out as the four
    # characters backslash, x, a, 0 rather than a real non-breaking space.
    r"\[\\xa0\_\_\\xa0\]",
]

patterns

['\\xa0|\\n',
 '\\[\\w+\\]',
 '(?<=\\s)000(?=\\s)',
 '(?<=\\s)co(ve(r)?t|id)(?=\\s)',
 '(?<=\\s)coveted(?=\\s)',
 "\\\\(?=\\')",
 '(?<=\\d)\\%',
 'Speaker\\s\\d\\:',
 '\\[\\\\xa0\\_\\_\\\\xa0\\]']

In [30]:
# Discard every row whose transcript is missing; all other columns are
# allowed to contain missing values.
df = df.dropna(axis=0, how="any", subset=["video_transcript"])

In [31]:
# Inspect the frame after rows without a transcript have been dropped.
df

Unnamed: 0,video_id,video_title,video_transcript
0,im3otpqYAiQ,Covid Vaccine Study Finds Links to Health Cond...,[CC may contain inaccuracies] In terms of how...
1,uiwjAj0zfKQ,If You Get All 5 COVID Vaccines,and all right we're done now if you're feelin...
2,LfmhYVCCGhc,Joe Rogan says tons of people &quot;died sudde...,Speaker 1: This is really painful to watch. O...
3,SU_SSfiYtfM,New COVID booster coming soon,what are those our new fall arsenal of vaccin...
4,7MAlEYqWUTk,Being Nice to Anti-Vaxxers,so you're against the covert vaccine oh here ...
...,...,...,...
145,-CCaJLR1zhQ,Pfizer did not know whether Covid vaccine stop...,was the Pfizer covet vaccine tested on stoppi...
146,-gq51K9fi_s,Florida doctor&#39;s death after receiving COV...,>> WE HAVE REACTION FROM THAT >> WE HAVE REAC...
147,Smfjyy9cgV4,Beyond the Noise #6: Do We Really Need a Yearl...,from micro TV this is beyond the Noise episod...
148,iMx7kmld9sk,Idaho lawmakers introduce legislation to crimi...,A bill has been introduced in the Idaho legis...


In [40]:
from textblob import TextBlob

def textblob_subjectivity(df):
    """Score the subjectivity of every transcript in ``df`` with TextBlob.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a ``"video_transcript"`` column of transcript strings.

    Returns
    -------
    dict
        Maps each distinct transcript text to the value of
        ``TextBlob(text).sentiment.subjectivity``. Identical transcripts
        share a single key. Entries that are not strings (for example the
        NaN/None produced by missing transcripts) are left out.

    Raises
    ------
    KeyError
        If ``df`` has no ``"video_transcript"`` column.
    """
    subjectivity = {}
    for entry in df["video_transcript"]:
        # Missing transcripts show up as NaN/None rather than text; skip them
        # instead of handing a non-string to TextBlob and aborting the run.
        if not isinstance(entry, str):
            continue
        # The result is keyed by the text itself, so a repeated transcript
        # would only overwrite its own score — analyse each text once.
        if entry in subjectivity:
            continue
        subjectivity[entry] = TextBlob(entry).sentiment.subjectivity

    return subjectivity

In [None]:
# Attach the scores to the frame instead of discarding the returned dict:
# `textblob_subjectivity` maps transcript text -> score, so mapping the
# transcript column through it yields one score per row. `assign` builds a
# new frame, which keeps this cell safe to re-run.
df = df.assign(
    subjectivity=df["video_transcript"].map(textblob_subjectivity(df))
)

In [39]:
# Inspect the frame; the saved output of this cell shows a `subjectivity`
# column alongside the id, title and transcript columns.
df

Unnamed: 0,video_id,video_title,video_transcript,subjectivity
0,im3otpqYAiQ,Covid Vaccine Study Finds Links to Health Cond...,[CC may contain inaccuracies] In terms of how...,0.447934
1,uiwjAj0zfKQ,If You Get All 5 COVID Vaccines,and all right we're done now if you're feelin...,0.576531
2,LfmhYVCCGhc,Joe Rogan says tons of people &quot;died sudde...,Speaker 1: This is really painful to watch. O...,0.535842
3,SU_SSfiYtfM,New COVID booster coming soon,what are those our new fall arsenal of vaccin...,0.463934
4,7MAlEYqWUTk,Being Nice to Anti-Vaxxers,so you're against the covert vaccine oh here ...,0.513971
...,...,...,...,...
145,-CCaJLR1zhQ,Pfizer did not know whether Covid vaccine stop...,was the Pfizer covet vaccine tested on stoppi...,0.341905
146,-gq51K9fi_s,Florida doctor&#39;s death after receiving COV...,>> WE HAVE REACTION FROM THAT >> WE HAVE REAC...,0.340300
147,Smfjyy9cgV4,Beyond the Noise #6: Do We Really Need a Yearl...,from micro TV this is beyond the Noise episod...,0.443136
148,iMx7kmld9sk,Idaho lawmakers introduce legislation to crimi...,A bill has been introduced in the Idaho legis...,0.500000
