# Gathering Data
## Imports

In [1]:
import pandas as pd
import requests
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import html
import json

# User-agent header sent along with every HTTP request made in this notebook
headers = {
    'User-agent': 'rsantayana',
}

## Reading One Transcript
### `ref_transcripts` Dataframe

In [2]:
# Download the HTML of one Rev transcript page.
url = r'https://www.rev.com/blog/transcripts/we-have-evidence-of-russian-war-crimes-says-ukraine-prosecutor-general-4-04-22-transcript'
# `headers` is the User-agent dict defined in the imports cell; it is reused
# here rather than redefined. The timeout keeps a stalled connection from
# hanging the kernel indefinitely (requests has no timeout by default).
res = requests.get(url, headers=headers, timeout=30)
# Fail loudly on anything but a 200: otherwise `html` would still refer to the
# imported `html` module (or to a page from an earlier run) and the parsing
# cell would silently work on the wrong input.
if res.status_code != 200:
    raise RuntimeError(f'Unable to get a 200 status code from {url}. Status code: {res.status_code}')
# NOTE: this rebinds `html`, shadowing the `html` module imported at the top of
# the notebook. The name is kept because the parsing cell reads `html`.
html = res.text

In [3]:
# build a BeautifulSoup tree from the downloaded page with Python's built-in HTML parser
soup = BeautifulSoup(markup=html, features='html.parser')

In [4]:
# the transcribed speech lives in the first div with class 'fl-callout-text-wrap'
transcript = soup.find('div', class_='fl-callout-text-wrap')
# text of every <p> in that div; in the output below each one splits into
# a "<speaker>: (<timestamp>)" header line followed by the speech
speaker_texts = [paragraph.text for paragraph in transcript.find_all('p')]
# unique speaker names: the part of the header line before the first colon
speakers = {entry.split('\n')[0].split(':')[0] for entry in speaker_texts}
# show each paragraph split into its lines, followed by a blank line
for speaker_text in speaker_texts:
    print(speaker_text.split('\n'), end='\n\n')

['Speaker 1: (00:00)', 'Joining us now, Ukraine’s Prosecutor General Iryna Venediktova. If you could please tell us, we understand the atrocities are not confined to Bucha. What else can you report? What other cities and what other atrocities can you report to us?']

['Iryna Venediktova: (00:19)', 'Good morning, dear friends. At first, of course, I want to ask your citizens for this huge support during our very hard days. We started more than 4,000 criminal cases, only about war crimes. And these case are absolutely different by their size. From one case, we have concrete facts of murders of our civilians. In other case, for example, like our anchor case about Russian aggression, we have now 214 suspects, and this is top militaries and top politicians and peace from Russian Federation. We have evidence from absolutely different regions of our country about war crimes, about using prohibited weapons, about bombing civilian objects, about killing, and you see now, it’s in Kyiv region, ab

In [5]:
# the transcript title is the text of the first span with class 'fl-heading-text'
heading_tag = soup.find('span', class_='fl-heading-text')
transcript_title = heading_tag.text
transcript_title

'‘We Have Evidence’ Of Russian War Crimes, Says Ukraine Prosecutor General 4/04/22 Transcript'

In [6]:
# the posting date is the text of the first 'fl-rich-text' div,
# with leading/trailing newlines stripped
date_tag = soup.find('div', class_='fl-rich-text')
date_of_speech = date_tag.text.strip('\n')
date_of_speech

'Apr 4, 2022'

In [7]:
# one record of the ref_transcripts dataframe, keyed by column name
transcript_metadata = dict(
    transcript_id=0,
    transcript_title=transcript_title,
    speakers=speakers,
    date_of_speech=date_of_speech,
    url=url,
)
# display the record as a Series (one row per field)
pd.Series(transcript_metadata)

transcript_id                                                       0
transcript_title    ‘We Have Evidence’ Of Russian War Crimes, Says...
speakers            {Speaker 3, Speaker 4, Iryna Venediktova, Spea...
date_of_speech                                            Apr 4, 2022
url                 https://www.rev.com/blog/transcripts/we-have-e...
dtype: object

### `transcripts` Dataframe

In [8]:
# re-inspect the raw paragraphs: print each one split into its lines,
# followed by a blank line
for speaker_text in speaker_texts:
    print(speaker_text.split('\n'), end='\n\n')

['Speaker 1: (00:00)', 'Joining us now, Ukraine’s Prosecutor General Iryna Venediktova. If you could please tell us, we understand the atrocities are not confined to Bucha. What else can you report? What other cities and what other atrocities can you report to us?']

['Iryna Venediktova: (00:19)', 'Good morning, dear friends. At first, of course, I want to ask your citizens for this huge support during our very hard days. We started more than 4,000 criminal cases, only about war crimes. And these case are absolutely different by their size. From one case, we have concrete facts of murders of our civilians. In other case, for example, like our anchor case about Russian aggression, we have now 214 suspects, and this is top militaries and top politicians and peace from Russian Federation. We have evidence from absolutely different regions of our country about war crimes, about using prohibited weapons, about bombing civilian objects, about killing, and you see now, it’s in Kyiv region, ab

In [9]:
# speaker name for every paragraph, in order: the text before the first colon
[entry.partition(':')[0] for entry in speaker_texts]

['Speaker 1',
 'Iryna Venediktova',
 'Speaker 3',
 'Iryna Venediktova',
 'Speaker 4',
 'Iryna Venediktova',
 'Speaker 1']

In [10]:
# speech for every paragraph, in order: everything after the header line.
# Cutting at the first newline (instead of splitting on ':') keeps speech that
# itself contains a colon intact and also works for "(hh:mm:ss)" timestamps;
# a paragraph without a newline yields an empty string instead of an IndexError.
[speaker_text.partition('\n')[2] for speaker_text in speaker_texts]

['Joining us now, Ukraine’s Prosecutor General Iryna Venediktova. If you could please tell us, we understand the atrocities are not confined to Bucha. What else can you report? What other cities and what other atrocities can you report to us?',
 'Good morning, dear friends. At first, of course, I want to ask your citizens for this huge support during our very hard days. We started more than 4,000 criminal cases, only about war crimes. And these case are absolutely different by their size. From one case, we have concrete facts of murders of our civilians. In other case, for example, like our anchor case about Russian aggression, we have now 214 suspects, and this is top militaries and top politicians and peace from Russian Federation. We have evidence from absolutely different regions of our country about war crimes, about using prohibited weapons, about bombing civilian objects, about killing, and you see now, it’s in Kyiv region, about just murders of our civilians',
 'At some point, 

In [11]:
# Column data for the transcripts dataframe: one row per paragraph of the page.
transcripts = {
    # NOTE(review): ref_transcripts calls this key 'transcript_id' -- confirm
    # which name the two tables should share before joining them.
    'article_id': 0,
    # speaker name: the text before the first colon of the paragraph
    'speaker': [speaker_text.split(':')[0] for speaker_text in speaker_texts],
    # speech: everything after the header line. Cutting at the first newline
    # (instead of splitting on ':') keeps speech containing a colon intact and
    # also works for "(hh:mm:ss)" timestamps.
    'speech': [speaker_text.partition('\n')[2] for speaker_text in speaker_texts]
}
# the scalar article_id is broadcast to every row
pd.DataFrame(transcripts)

Unnamed: 0,article_id,speaker,speech
0,0,Speaker 1,"Joining us now, Ukraine’s Prosecutor General I..."
1,0,Iryna Venediktova,"Good morning, dear friends. At first, of cours..."
2,0,Speaker 3,"At some point, this war will be over. And when..."
3,0,Iryna Venediktova,"Of course, we understand who are responsible f..."
4,0,Speaker 4,"Ms. Venediktova, want to give you the official..."
5,0,Iryna Venediktova,I even can’t imagine that Ministry of Foreign ...
6,0,Speaker 1,All right. Ukraine’s Prosecutor General Iryna ...


In [12]:
# display the transcripts records as a dataframe once more
pd.DataFrame(data=transcripts)

Unnamed: 0,article_id,speaker,speech
0,0,Speaker 1,"Joining us now, Ukraine’s Prosecutor General I..."
1,0,Iryna Venediktova,"Good morning, dear friends. At first, of cours..."
2,0,Speaker 3,"At some point, this war will be over. And when..."
3,0,Iryna Venediktova,"Of course, we understand who are responsible f..."
4,0,Speaker 4,"Ms. Venediktova, want to give you the official..."
5,0,Iryna Venediktova,I even can’t imagine that Ministry of Foreign ...
6,0,Speaker 1,All right. Ukraine’s Prosecutor General Iryna ...


## Getting all Rev Transcripts
We will scrape all Rev transcripts and then decide which ones are worth feeding into the model.

*__Notes:__*

https://sparkbyexamples.com/pandas/how-to-append-row-to-pandas-dataframe/

The link above explains how to append rows to an existing pandas DataFrame.

In [13]:
# Every Rev transcript-category listing page to scrape, in the order given.
# All of them share one URL prefix, so only the category slug is spelled out.
_REV_CATEGORY_ROOT = 'https://www.rev.com/blog/transcript-category/'
rev_category_urls = [
    _REV_CATEGORY_ROOT + slug
    for slug in (
        '2020-election-transcripts',
        '2022-elections',
        'congressional-testimony-hearing-transcripts',
        'debate-transcripts',
        'donald-trump-transcripts',
        'financial-transcripts',
        'interview-transcripts',
        'john-kirby',
        'ketanji-brown-jackson',
        'news-transcripts',
        'pentagon',
        'political-transcripts',
        'president-biden-transcripts',
        'white-house-correspondents-association',
        'white-house-briefing',
        'ukraine',
        'supreme-court',
        'state-department-briefing',
        'speech-transcripts',
        'science-transcripts',
        'russia-ukraine-conflict',
        'russia',
        'press-conference-transcripts',
        'press-briefing',
    )
]

# CONTINUE HERE

In [19]:
# Accumulators intended to become the columns of the ref_transcripts dataframe.
# NOTE(review): `transcript_title`, `speakers` and `date_of_speech` rebind the
# names produced by the single-transcript cells earlier in the notebook, so
# those cells have to be re-run before their values are used again.
transcript_id = list()
transcript_title = list()
transcript_url = list()
speakers = list()
date_of_speech = list()

for rev_url in rev_category_urls:
    # request the current category page; the timeout keeps a stalled
    # connection from hanging the whole loop
    res = requests.get(rev_url, headers=headers, timeout=30)
    if res.status_code != 200:
        # report which url is not receiving a 200 status code
        print(f'Unable to get a 200 status code from {rev_url}. Validate the data. Status code: {res.status_code}')
        print()
        # skip this page: falling through would re-parse the `html` left over
        # from the previous iteration (or from an earlier cell)
        continue
    html = res.text
    soup = BeautifulSoup(html, "html.parser")
    # finding the tag and class that contains the transcribed speech
    transcript = soup.find(name='div', attrs={'class': 'fl-callout-text-wrap'})
    if transcript is None:
        # soup.find returns None when the page has no such div; calling
        # find_all on it would raise AttributeError and abort the loop
        print(f'No transcript container found at {rev_url}. Skipping.')
        print()
        continue
    # html that contains the speech and person behind the speech
    speaker_texts = [clob.text for clob in transcript.find_all('p')]
    # set of speakers in the transcript
    # TODO(review): this overwrites the `speakers` accumulator on every pass and
    # the other accumulators are never filled -- the per-transcript collection
    # step still has to be written.
    speakers = list(set([speaker_text.split('\n')[0].split(':')[0] for speaker_text in speaker_texts]))

In [21]:
# collect the text of each <p> inside the current `transcript` tag and display the list
speaker_texts = [paragraph.get_text() for paragraph in transcript.find_all('p')]
speaker_texts

['Speaker 1: (00:00)\nJoining us now, Ukraine’s Prosecutor General Iryna Venediktova. If you could please tell us, we understand the atrocities are not confined to Bucha. What else can you report? What other cities and what other atrocities can you report to us?',
 'Iryna Venediktova: (00:19)\nGood morning, dear friends. At first, of course, I want to ask your citizens for this huge support during our very hard days. We started more than 4,000 criminal cases, only about war crimes. And these case are absolutely different by their size. From one case, we have concrete facts of murders of our civilians. In other case, for example, like our anchor case about Russian aggression, we have now 214 suspects, and this is top militaries and top politicians and peace from Russian Federation. We have evidence from absolutely different regions of our country about war crimes, about using prohibited weapons, about bombing civilian objects, about killing, and you see now, it’s in Kyiv region, about j