In [1]:
#library to extract subtitles
%pip install youtube_transcript_api

Collecting youtube_transcript_api
  Downloading https://files.pythonhosted.org/packages/21/81/c4ae5534b113f4938b482f360babbbe6fda550441a4af8e1007dba518586/youtube_transcript_api-0.3.1-py3-none-any.whl
Installing collected packages: youtube-transcript-api
Successfully installed youtube-transcript-api-0.3.1
Note: you may need to restart the kernel to use updated packages.


In [164]:
#code needed to use colab
#if you're using juptyer notebook then skip this line
from google.colab import drive
drive.mount('/content/gdrive')

ModuleNotFoundError: No module named 'google.colab'

In [3]:
#cd used in colab
#%cd "gdrive/My Drive/Colab Notebooks"
%cd ..
!ls

/Users/Rolf/Documents/GitHub/esade_fake_news/4_Politics/python3_script/data/nlp
20200504-193926_joe_biden_bias.csv
20200504-193926_joe_biden_nlp.csv
20200504-193926_joe_biden_nlp_subs.csv
20200504-193926_joe_biden_nlp_subs_clean.csv
20200504-193926_joe_biden_nlp_subs_clean_expanded.csv
[34mnotebooks[m[m
[34mresults[m[m
trigrams_joe_biden.png


In [2]:
import pandas as pd
import numpy as np
from youtube_transcript_api import YouTubeTranscriptApi
from tqdm import tqdm

# Extract Subtitles

In [21]:
#import csv
df = pd.read_csv('20200504-193926_joe_biden_nlp.csv', index_col=0)
df.head(2)

Unnamed: 0,Bias_num,title,description,channel,id
1,0,Elon Musk says Australia’s energy emergency is...,It’s been a miserable few weeks for Malcolm Tu...,60 Minutes Australia,w7BMaG3zyVo
2,0,Real life Catch Me If You Can con artist revea...,Catch Me If You Can stars Leonardo DiCaprio as...,60 Minutes Australia,3UmcxQto7UU


In [8]:
#length of dataset
print(f'Length of dataset is: {len(df)}')

#percentage of videos that are right
pr_right = round(df['Bias_num'].mean()*100,2)
print(f'Percentage of videos that are right: {pr_right} %')

Length of dataset is: 849
Percentage of videos that are right: 32.51 %


In [None]:
#loop to extract subtitles

tttt = []

with tqdm(total=len(df['id'])) as pbar:  
    for idx in df['id']:
        try:
            t = YouTubeTranscriptApi.get_transcript(idx)
        except:
            tttt.append([idx, np.nan]) #add NaN to videos that do not have subtitles
            continue
        tt = [i["text"] for i in t]
        ttt = [' '.join(tt)]
        tttt.append([idx, ttt])
        pbar.update(1)

df_sub = pd.DataFrame(tttt, columns=['video_id', 'subtitles'])
df_sub.head()

In [None]:
#merge subtitles with dataframe containing video id, channel, etc. 
df2 = df.merge(df_sub, left_on='id', right_on='video_id')

In [None]:
#i checked a few nan subtitles and they don't have subtitles. 
df2[df2['subtitles'].isna()].head()

In [None]:
#drop column
df2.drop('video_id', inplace=True, axis=1)

In [None]:
#export csv
df2.to_csv("20200504-193926_joe_biden_nlp_subs.csv")

# Subtitles cleaning

In [4]:
# load train_cleaned.csv
filename = "20200504-193926_joe_biden_nlp_subs.csv"
df = pd.read_csv(filename, usecols=['Bias_num', 'subtitles','channel'])

#copy to not destroy original dataset
df_channel = df.copy()

#create new column text. easier in the analysis
df['text'] = df['subtitles']

#drop subtitles and channel
df.drop(['subtitles'], axis=1, inplace=True)

In [5]:
#which videos have NaN as subtitle
df_nan = df[df['text'].isna()]

#number of videos with no subtitles
num_nan = len(df_nan)
print(f'Number of videos that do NOT have subtitles: {num_nan}')

print()

#displays channels with missing subtitles
channel = df_nan.groupby(["channel"])['Bias_num'].count().sort_values(ascending=False)
print(channel[:5])

print()

#display distribution of bias with missing subtitles
bias = df_nan.groupby('Bias_num')['channel'].count()
print(bias)

Number of videos that do NOT have subtitles: 112

channel
CNN               15
Reagan Library    14
PBS NewsHour      12
The Hill           7
CBS News           7
Name: Bias_num, dtype: int64

Bias_num
0    78
1    34
Name: channel, dtype: int64


In [6]:
#drop channel
df.drop(['channel'], axis=1, inplace=True)

#drop NaNs
df.dropna(inplace=True)

#remove brackets [] and quotations marks
df['text'] = df['text'].str[2:-2]

df.head(2)

Unnamed: 0,Bias_num,text
0,0,who hasn't been shocked by a recent electricit...
1,0,you're one of the greatest con men of all time...


In [7]:
#average right wing videos
df['Bias_num'].mean()

0.32570659488559894

In [8]:
#split channel string to remove each element from subtitles

#remove (in English) since the brackets cause problems with replacing values
df_channel['channel'] = df_channel['channel'].replace("(in English)", '', regex=True)

#list all unique channels
unique_channel = df_channel['channel'].unique()

#split channel name by whitespace
unique_channel_split = [i.split(' ') for i in unique_channel]

#display first row
unique_channel_split[0]

['60', 'Minutes', 'Australia']

In [9]:
#loop to remove each single word in channel name
#doing this to remove any possibility that the NLP model learns channel to predict bias
for channel in unique_channel_split:
    for element in channel:
        df['text'].replace(element,
                           '',
                           inplace=True,
                           regex=True)

In [12]:
#the text [Music] appears in subtitles when music is playing. 
#these values mess up the analysis in the end with 3-gram, 2-gram
to_replace = ['Music', 'music', 'Applause', 'applause', 'Laughter', 'laughter', '♪', "'", "\n", "\\\92", "\\\96", 
              "\\\\", 'AUDIENCE LAUGHING', 'AUDIENCE APPLAUDING', 'AUDIENCE GASPS', "NARRATOR"]

df['text'].replace(to_replace, '', inplace=True, regex=True)  

In [13]:
df.to_csv("20200504-193926_joe_biden_nlp_subs_clean.csv")

## Split subtitles after 1'000 characters

We split at 1'000 characters to make our dataset larger. I (Rolf) thinks that having more rows is more important for the model training than few but long rows. I think that 1'000 characters is a good cut-off. This is an arbitrary number. 

In [14]:
# load train_cleaned.csv
filename = "20200504-193926_joe_biden_nlp_subs_clean.csv"
df_clean = pd.read_csv(filename, index_col=0)

df_clean.head()

Unnamed: 0,Bias_num,text
0,0,who hasnt been shocked by a recent electricity...
1,0,youre one of the greatest con men of all time ...
2,0,if you thought the insults hold between North ...
3,0,bravery courage defiance heartbreak theyre not...
5,0,Jeffrey Epstein was a billionaire businessma...


In [15]:
#df_clean has this many rows

print('df_clean has this many rows: ' + str(len(df_clean)))

df_clean has this many rows: 743


In [16]:
#function to split text into columns after 1'000 characters
def chunks(s, n):
    """Produce `n`-character chunks from `s`."""
    for start in range(0, len(s), n):
        yield s[start:start+n]

In [17]:
#empty lists to keep info
sub = []
bias = []

#loop to split at 1'000 characters
for index, row in df.iterrows():
    for chunk in chunks(row['text'], 1000):
        sub.append(chunk)
        bias.append(row['Bias_num'])

In [18]:
#create a dataframe from sub and bias list
df_sub = pd.DataFrame(list(zip(bias, sub)), columns=['bias_num', 'text'])

length = len(df_sub)
print(f"Number of rows after splitting at 1'000 characters: {length}")

percentage = round(df_sub['bias_num'].mean()*100, 2)
print(f"Percentage of rows that are right wing: {percentage}%")

df_sub.head()

Number of rows after splitting at 1'000 characters: 18179
Percentage of rows that are right wing: 42.09%


Unnamed: 0,bias_num,text
0,0,who hasnt been shocked by a recent electricity...
1,0,ave it denote when it gets dark [] but keeping...
2,0,e energy its Rayleigh you could actually expor...
3,0,eted headfirst into a national political brawl...
4,0,energy he quickly realized there were plenty ...


In [19]:
#loop to calculate average length of text per row

length = []

for index, row in df_sub.iterrows():
    length.append(len(row['text']))
    
avg_length = round(sum(length)/len(length),2)
print(f"Average character length of text column per row: {avg_length}")

Average character length of text column per row: 979.15


In [20]:
df_sub.to_csv("20200504-193926_joe_biden_nlp_subs_clean_expanded.csv")