# Gathering Data

In [8]:
import pandas as pd
import requests
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup as bs
import html
import json

In [2]:
url = r'https://www.rev.com/blog/transcripts/we-have-evidence-of-russian-war-crimes-says-ukraine-prosecutor-general-4-04-22-transcript'
headers = {"User-agent": 'rsantayana'} 

res = requests.get(url, headers=headers)
if res.status_code == 200:
    subreddit_json = res.raw
    print(subreddit_json)

<urllib3.response.HTTPResponse object at 0x7fb7391ee430>


### EXAMPLE: Scrape Kremlin website article transcript

In [44]:
url = r'http://www.en.kremlin.ru/events/president/transcripts/interviews/67100'
headers = {"User-agent": "rdata"}

def parse_html(string):
    return string.replace(u'\xa0', u' ').replace(u'\n', u' ')

try:
    res = requests.get(url, headers=headers)
    if res.status_code == 200:
        transcript_html = res.text
    else: 
        raise Exception(res.status_code)
    
    soup = bs(transcript_html, "html")
    entry_content = soup.find("div", class_="entry-content")
    entry_paragraphs = entry_content.find_all("p", recursive=False)
    
    statements = []
    last_speaker = ""
    for p in entry_paragraphs:
        if p is not None:
            speaker_tag = p.find("b")
            if speaker_tag is None:
                speaker = last_speaker
            else:
                speaker = parse_html(speaker_tag.text)

            if p.text:
                if len(p.text.split(": ")) >= 2:
                    statement = parse_html(p.text.split(": ")[1])
                else:
                    statement = parse_html(p.text)
            last_speaker = speaker
        print([speaker, statement])
        
    # TODO: Find a way to retrieve publication date for interview transcript. This is the closest we can get
    # to the date the interview took place. If we measure 'tension' level based on the transcript's proximity
    # to an attack both past and future, a margin of error of a couple days to a week shouldn't be significant

except Exception as e:
    print(e)

['Pavel Zarubin', 'Mr President, tensions are escalating on the border between the EU and Belarus. The European Union has already deployed army troops. Moreover, they are building up the contingent. You spoke with Angela Merkel twice, and you also spoke with Alexander Lukashenko. Why they do not speak to each other directly is probably also a question I would like to ask you. In general, what do you think of the developments there? ']
['President of Russia Vladimir Putin', 'To begin with, why they do not speak with each other is not a question for me. We have nothing to do with that. But I inferred from my conversations with President Lukashenko and Chancellor Merkel that they are ready to speak with one another. I hope it will happen soon and some sort of direct contact will be established between the European Union, the EU leading nations, or at least between the Federal Republic of Germany and Belarus. This is crucial because the migrants’ goal is primarily to get into Germany. ']
[

### EXAMPLE: Scrape all kremin website transcript articles appearing on the front transcripts page

In [3]:
url = r'http://www.en.kremlin.ru/events/president/transcripts/interviews'
headers = {"User-agent": "rdata"}

def parse_html(string):
    return string.replace(u'\xa0', u' ').replace(u'\n', u' ')

try:
    # Get list of interviews on the front page
    # We can infer the list of links to articles with the data-id attribute of the div tags
    # inside of div.entry-content 
    res = requests.get(url, headers=headers)
    if res.status_code == 200:
        front_page_html = res.text
        
    front_page_soup = bs(front_page_html, "html")
    articles_list_soup = front_page_soup.find("div", class_="entry-content")
    print(articles_list_soup.find_all("div"))
    
except Exception as e:
    print(e)

[<div class="hentry h-entry hentry_event" data-id="70350" itemscope="itemscope" itemtype="http://schema.org/NewsArticle" role="listitem"> <h3 class="hentry__title hentry__title_special"> <a href="/events/president/transcripts/interviews/70350" itemprop="url" rel="bookmark"><span class="entry-title p-name" itemprop="name">Comment for Rossiya TV channel</span> <span class="hentry__meta"> <time class="published dt-published" datetime="2023-01-13" itemprop="datePublished">January 13, 2023, 14:35</time> <span class="updated hidden" hidden="hidden">2023-02-16</span> <span class="author vcard hidden" hidden="hidden"><span class="fn">Team of the Official Website of the President of Russia</span></span> <span class="hentry__location p-location">Ufa</span> </span> </a> </h3> <div class="hentry__assets"> <a aria-hidden="true" aria-label="Text of the article" class="tabs_article item medium" href="/events/president/transcripts/interviews/70350" rel="bookmark" title="Text of the article"><i></i> </

### EXAMPLE: Scrape American Presidency Project website for all US Presidential interviews

In [11]:
# The website url is for the first page in the list of all US presidential interviews. 
# We can increment the page query param until failure to get links to all interviews.
url = "https://www.presidency.ucsb.edu/documents/app-categories/presidential/interviews?page=0"
headers = {"User-agent": "rdata"}

def parse_html(string):
    return string.replace(u'\xa0', u' ').replace(u'\n', u' ')

try:
    # Load page
    res = requests.get(url, headers=headers)
    if res.status_code == 200:
        html = res.text
    
    soup = bs(html, "html")
    list_soup = soup.find_all("div", class_="views-row")
    
    # Get links to articles from list
    links = []
    for row in list_soup:
        title = soup.find("div", class_="field-title")
        link = soup.find("a")['href']
        links.append(link)
        
    print(links)
    
except Exception as e:
    print(e)


['#main-content', '#main-content', '#main-content', '#main-content', '#main-content', '#main-content', '#main-content', '#main-content', '#main-content', '#main-content']
