In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import regex as re

In [2]:
# Dataset folder name; the load cell builds "../datasets/<filename>/videos.csv" from it.
filename = "covid_vaccine"

In [3]:
# Locate the dataset's videos.csv, read it with the first CSV column as the index,
# and keep only the id, title and transcript columns.
path = "".join(["../datasets/", filename, "/videos.csv"])
wanted_columns = ["video_id", "video_title", "video_transcript"]
df = pd.read_csv(path, index_col=0)[wanted_columns]
df.head()

Unnamed: 0,video_id,video_title,video_transcript
0,2IXl4qJGrRk,A man deliberately got 217 Covid shots. Here’s...,A German man has puzzled scientists after he ...
1,HtTalpY-J-M,COVID: German man vaccinated 217 times had no ...,a 62-year-old German man from magur claims he...
2,jPs4_MeuX7U,New Covid vaccine study links jab to heart and...,a latest covid-19 study is providing answers ...
3,WhiBpmH1mE4,MAN GETS 217 COVID VACCINES! 😱😱😱 THIS is What ...,a 62-year-old man who lives in Germany uh got...
4,LfmhYVCCGhc,Joe Rogan says tons of people &quot;died sudde...,Speaker 1: This is really painful to watch. O...


In [4]:
# Indices of rows whose transcript is missing.
# pandas' read_csv (with default settings) parses empty cells and the literal text
# "nan" into a float NaN, which never compares equal to the string 'nan' -- so the
# mask must use isna(). The string comparison is kept as well, in case a transcript
# was stored as the text 'nan' by some other route.
transcript_col = df["video_transcript"]
missing_mask = transcript_col.isna() | (transcript_col == 'nan')
drop_indices = df.index[missing_mask]
drop_indices

Index([], dtype='int64')

In [5]:
# Remove the rows flagged above. Rebinding `df` (instead of inplace=True) avoids
# mutating the frame object that earlier cells already displayed.
df = df.drop(index=drop_indices)

In [6]:
# Ordered transcript-cleaning rules as (regex pattern, replacement) pairs.
# The rules are meant to be applied one after another in list order, so the order
# matters. Patterns are raw strings so that regex escapes such as \s, \w or \[
# reach the regex engine untouched and do not trigger Python's
# "invalid escape sequence" warning.
cleaning_rules = [
    # [0] Replacing occurrences of \xa0 (non-breaking space) and \n with a space
    (r'(\xa0|\n)', ' '),

    # [1] Removing text enclosed in brackets
    (r'\[(\w|\s)+\]', ''),

    # [2] Replacing stray '000's with 'thousand'
    (r'(?<=\s)000(?=\s)', 'thousand'),

    # [3, 4] Mistranscriptions of the word 'COVID'
    (r'(?<=\s)(C|c)o(ve(r)?t|id)(?=\s)', 'COVID'),
    (r'(C|c)overed(?=\s(vacc|infe))', 'COVID'),

    # [5] Mistranscriptions of the word 'COVID-19'
    (r'(?<=\s)(C|c)(oveted|o9|o\s19)(?=\s)', 'COVID19'),

    # [6] Replacing '%' with the word 'percent'
    (r'(?<=\d)\%', ' percent'),

    # [7] Removing 'Speaker <number>:' occurrences (any number of digits)
    (r'Speaker\s\d+\:', ''),

    # [8] Removing '[\xa0__\xa0]'
    # NOTE(review): when the rules run in list order, rule [0] has already turned
    # \xa0 into a space and rule [1] then removes '[ __ ]', so this rule looks
    # unreachable -- kept for safety; confirm before deleting.
    (r'\[\xa0\_\_\xa0\]', ''),

    # [9] Removing >> occurrences
    (r'\>\>(\>+)?', ''),

    # [10] Removing 'Reporter:' occurrences
    (r'Reporter\:', ''),

    # [11] Removing weird +@ occurrences
    (r'\+\@', ''),

    # [12] Removing stray - occurrences
    (r'(?<=\s)\-(\-+)?(?=\s)', ''),

    # [13] Removing text within parentheses
    (r'\((\w|\s)+\)', ''),

    # [14] Combining stray instances of '19' with the word 'covid' if it exists next to it
    (r'(covid|COVID)(\s|-)?19', 'COVID19'),
]

# Parallel lists consumed by the cleaning cells; index i of one matches index i of the other.
patterns = [pattern for pattern, _ in cleaning_rules]
replacements = [replacement for _, replacement in cleaning_rules]

In [7]:
# `cleaned` will collect the processed transcripts; `transcripts` holds the raw
# transcript column as a plain Python list. The cell displays how many there are.
cleaned = []
transcripts = df["video_transcript"].tolist()
len(transcripts)

200

In [8]:
# Spot-check: apply only the first rule to the third transcript and show the text.
sample_transcript = transcripts[2]
result = re.sub(patterns[0], replacements[0], sample_transcript)
result

" a latest covid-19 study is providing answers to one of the questions often repeated since the vaccines were introduced how have the vaccines been affecting our health now the largest vaccine study to date it has identified some risks which are associated with them and our next report brings you all the details take a look New Zealand's Global vaccine Data Network analyzed data from 199 million people who received covid-19 vaccines across eight countries they findings show the vaccines are responsible for a slight increase in neurological blood and heart Related [Music] Disorders the research says three doses of fiser bio entech and moderna's mRNA vaccines could Trigger miocarditis or rare condition of heart inflammation a third dose of Astra vaccine increased the risk of another the heart condition pericarditis 6.9 fold the first and fourth dose of Mna increased the risk 1.7 fold and 2.6 fold respectively those who took the astrena shots had a greater risk of developing a rare neurol

In [9]:
# Apply every cleaning rule, in order, to every transcript.
# `cleaned` is rebuilt from scratch here so that re-running this cell never appends
# a second copy of the results (which would break the length match with `df`).
cleaned = []
for transcript in transcripts:
    # str() coerces non-string entries to text (a float NaN would become 'nan').
    result = str(transcript)

    # Rules are order-dependent: each one sees the output of the previous one.
    for i, pattern in enumerate(patterns):
        result = re.sub(pattern, replacements[i], result)

    cleaned.append(result)
len(cleaned)

200

In [10]:
# Assemble a fresh frame (default integer index) from the id and title columns of
# `df` plus the cleaned transcripts, then preview the first rows.
transcript_columns = {
    'video_id': df["video_id"].tolist(),
    'video_title': df["video_title"].tolist(),
    'video_transcript': cleaned,
}
transcripts_df = pd.DataFrame(transcript_columns)
transcripts_df.head()

Unnamed: 0,video_id,video_title,video_transcript
0,2IXl4qJGrRk,A man deliberately got 217 Covid shots. Here’s...,A German man has puzzled scientists after he ...
1,HtTalpY-J-M,COVID: German man vaccinated 217 times had no ...,a 62-year-old German man from magur claims he...
2,jPs4_MeuX7U,New Covid vaccine study links jab to heart and...,a latest COVID19 study is providing answers t...
3,WhiBpmH1mE4,MAN GETS 217 COVID VACCINES! 😱😱😱 THIS is What ...,a 62-year-old man who lives in Germany uh got...
4,LfmhYVCCGhc,Joe Rogan says tons of people &quot;died sudde...,This is really painful to watch. On the bon...


In [11]:
from textblob import TextBlob

def textblob_subjectivity(df):
    """Return a copy of ``df`` with a ``subjectivity`` column appended.

    Each entry of ``df["video_transcript"]`` is wrapped in a ``TextBlob`` and its
    ``sentiment.subjectivity`` value is recorded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``video_transcript`` column of text entries.

    Returns
    -------
    pandas.DataFrame
        A new frame: the columns of ``df`` followed by ``subjectivity``, with the
        same index as ``df``. The input frame is not modified.
    """
    scores = [
        TextBlob(entry).sentiment.subjectivity
        for entry in df["video_transcript"]
    ]

    # Build the Series on df's own index: concat(axis=1) aligns on index labels, so
    # a default 0..n-1 index would misalign (and add NaN rows) whenever df's index
    # is not a plain range, e.g. after rows were dropped or filtered.
    subjectivity = pd.Series(scores, index=df.index, name="subjectivity")
    return pd.concat([df, subjectivity], axis=1)

In [12]:
# Score every cleaned transcript and display the resulting frame.
ss_df = textblob_subjectivity(df=transcripts_df)
ss_df

Unnamed: 0,video_id,video_title,video_transcript,subjectivity
0,2IXl4qJGrRk,A man deliberately got 217 Covid shots. Here’s...,A German man has puzzled scientists after he ...,0.411182
1,HtTalpY-J-M,COVID: German man vaccinated 217 times had no ...,a 62-year-old German man from magur claims he...,0.441667
2,jPs4_MeuX7U,New Covid vaccine study links jab to heart and...,a latest COVID19 study is providing answers t...,0.473893
3,WhiBpmH1mE4,MAN GETS 217 COVID VACCINES! 😱😱😱 THIS is What ...,a 62-year-old man who lives in Germany uh got...,0.490237
4,LfmhYVCCGhc,Joe Rogan says tons of people &quot;died sudde...,This is really painful to watch. On the bon...,0.535842
...,...,...,...,...
195,l6mTAJ08aUI,How the race for the Covid vaccine was won 💉 BBC,in different corners of the globe pioneering ...,0.502047
196,2XVCPdPAUbA,Law professor on whether employers can mandate...,K TRADING IN PERSON ON IN THE OPERATIONS WHE...,0.529917
197,XHfRpNJI0c8,Jennifer Aniston cuts ties over COVID vaccine ...,now to jennifer aniston who is revealing she'...,0.396068
198,wuqyRBmashA,NY woman says Johnson &amp; Johnson COVID vacc...,HERE. BUT IT'S NOT OUGHT OF HERE ESPECIALLY F...,0.574818
