In [1]:
import requests
import json
from copy import deepcopy
import pandas as pd
from bs4 import BeautifulSoup

## Helpers

In [2]:
def get_query_jsn(url):
    """Fetch a URL with a GET request and decode the response body as JSON.

    -INPUT:
        -url: str of url to be requested
    -OUTPUT:
        - the decoded JSON document, in whatever structure the endpoint
          returned it"""
    response = requests.get(url)
    payload = json.loads(response.content)
    return payload

def get_years_paevakorrad(years):
    """Download the plenary agendas (päevakorrad) for each requested year.

    A single request covering the whole calendar year is tried first. If it
    raises, the year is requested again as two ranges (01-01..06-01 and
    06-02..12-31) and both responses are stored.
    -INPUT:
        -years: iterable of integers
    -OUTPUT:
        - list of API responses in request order (one per year, or two for a
          year that needed the split)"""
    def agenda_url(first_day, last_day):
        # Query parameters stay in the same order as the original requests.
        return f'https://api.riigikogu.ee/api/agenda/plenary?endDate={last_day}&lang=et&querySteno=true&startDate={first_day}'

    collected=[]
    for year in years:
        print(f'working on year {year}')
        try:
            whole_year=get_query_jsn(agenda_url(f'{year}-01-01', f'{year}-12-31'))
            collected.append(whole_year)
        except Exception as e:
            print(f'exception on year {year}, {e}')
            # An error in either half-year request is not caught here and
            # propagates to the caller.
            halves=((f'{year}-01-01', f'{year}-06-01'),
                    (f'{year}-06-02', f'{year}-12-31'))
            for first_day, last_day in halves:
                collected.append(get_query_jsn(agenda_url(first_day, last_day)))
    return collected

In [3]:
def paevakorrad_stenos_count(paevakorrad):
    """Summarise how many agenda items carry a stenogram link.

    -INPUT:
        - paevakorrad: list of päevakorrad (API responses as dicts)
    -OUTPUT:
        -pandas DataFrame with one row per entry, indexed by the entry's
         'weekStartDate', with columns agendaItems, have_stenos,
         stenos_links (fragment after '#' removed) and stenos_n_unique_links"""
    summary={}
    for agenda in paevakorrad:
        row={'agendaItems':0,
             'have_stenos':0,
             'stenos_links':[],
             'stenos_n_unique_links':0}
        # Registered before counting, so a repeated weekStartDate starts over.
        summary[agenda.get('weekStartDate')]=row
        for sitting in agenda.get('sittings', {}):
            for item in sitting.get('agendaItems', {}):
                href=item.get('_links',{}).get('steno',{}).get('href', {})
                row['agendaItems']+=1
                # {} is the "no link" sentinel produced by the chained .get()s.
                if href=={}:
                    continue
                row['have_stenos']+=1
                row['stenos_links'].append(href.split('#')[0])
        row['stenos_n_unique_links']=len(set(row['stenos_links']))
    return pd.DataFrame(summary).T

In [4]:
def get_main_talk(soup, conversations=None, steno_link=None):
    """Scrape the speeches of a stenogram.

    Walks the siblings that follow the first <article> of the page. Every
    sibling that is itself an <article> is read as one agenda item, and each
    of its "pb-4 speech-area" divs becomes one record.
    -INPUT:
        - soup: beautifulsoup object of stenogram
        - conversations: list to append the records to; a new list is
          created when omitted
        - steno_link: str of url to steno, copied into every record
    -OUTPUT:
        list of conversations (previous + currently parsed); the same list
        object as `conversations` when one was passed in"""
    # A literal [] default is built once and shared by every call, so records
    # from earlier stenograms would leak into later results.
    if conversations is None:
        conversations=[]
    for tag in soup.find('article').next_siblings:
        if tag.name!='article':
            continue
        heading=tag.find('h3').text.strip()
        for speech_area in tag.find_all("div", {"class": "pb-4 speech-area"}):
            anchor=speech_area.find('a', {'class':'steno-video-icon'})
            video=anchor['href'] if anchor is not None else None
            speaker=speech_area.find('h4').text.strip()
            # find_all never returns None; with no <p> tags the text is ''.
            text=' '.join([p.get_text(separator=" ").strip()
                           for p in speech_area.find_all('p')])
            conversations.append({'heading':heading,
                                  'speaker':speaker,
                                  'link_video':video,
                                  'link_steno':steno_link,
                                  'index_pk':tag['id'],
                                  'index_snd':speech_area['id'],
                                  'text':text})
    return conversations

In [5]:
def get_agenda(soup, steno_link=None):
    """Scrape the agenda section of a stenogram.

    Iterates over the direct children of the first
    <article class="steno-agenda-item">. An <h3> child updates the current
    heading; every <div> child whose class list is not exactly ['d-flex']
    becomes one record.
    -INPUT:
         - soup: beautifulsoup object of stenogram
         - steno_link: str of url to steno, copied into every record
    -OUTPUT:
        -list of conversation agenda information; 'heading' is None for a
         div that comes before the first <h3>"""
    conversations=[]
    # Must be bound before the loop: a div can precede the first <h3>, and
    # reading an unassigned local would raise UnboundLocalError.
    heading=None
    for tag in soup.find('article', {"class": "steno-agenda-item"}):
        if tag.name=='h3':
            heading=tag.text.strip()
        if tag.name=='div' and tag.attrs['class']!=['d-flex']:
            speaker=tag.find('h4').text.strip()
            # Only the first <p> of the div is read here.
            p_tag=tag.find('p')
            text=p_tag.get_text(separator=" ").strip() if p_tag is not None else None
            href_tag=tag.find('a', {'class':'steno-video-icon'})
            url=href_tag['href'] if href_tag is not None else None
            conversations.append({'heading':heading,
                                  'speaker':speaker,
                                  'link_video':url,
                                  'link_steno':steno_link,
                                  'index_pk':tag.parent['id'],
                                  'index_snd':tag['id'],
                                  'text':text})

    return conversations

In [6]:
def parse_steno(soup, steno_link):
    """Parse one stenogram page: agenda records first, then the speeches.

    -INPUT:
        - soup: beautifulsoup object of stenogram
         - steno_link: str of url to steno
    -OUTPUT:
        - list of conversations info (agenda records followed by steno
          content)"""
    agenda_records=get_agenda(soup, steno_link)
    # get_main_talk receives the agenda list and returns the combined result.
    return get_main_talk(soup, agenda_records, steno_link)

In [7]:
def get_links_stenos(links):
    """Download and parse every stenogram in `links`.

    The loop is best-effort: when downloading or parsing a link raises, the
    error and the link are printed, the link is stored with an empty talk
    list, and the loop moves on to the next link.
    -INPUT:
        - links: list of links to stenos
    -OUTPUT:
        -dict of {steno_link: {'talk': list of conversation records}}"""
    stenos={}
    for i, link in enumerate(links):
        if i%20==0:
            print(f'working on {i}')
        talk=[]
        try:
            # The download sits inside the try as well, so one failed request
            # cannot abort the run and discard everything collected so far.
            # NOTE(review): no parser is passed to BeautifulSoup, so the
            # parse tree depends on which parsers are installed; consider
            # pinning one.
            steno_soup=BeautifulSoup(requests.get(link).content)
            talk=parse_steno(steno_soup, link)
        except Exception as e:
            print(e)
            print(link)
        stenos[link]={'talk':talk}
    return stenos

## Collect päevakorrad which have links to stenos

In [8]:
# Agendas for 2022 and 2023 (the range end is exclusive); %time reports how long the download took.
%time paevakorrad2022=get_years_paevakorrad(range(2022, 2024))

working on year 2022
working on year 2023
CPU times: total: 31.2 ms
Wall time: 1min 9s


In [15]:
# Agendas for 2010 through 2022 (the range end is exclusive).
# NOTE(review): 2022 is requested here and again for paevakorrad2022.
%time paevakorrad=get_years_paevakorrad(range(2010, 2023))

working on year 2010
working on year 2011
working on year 2012
working on year 2013
working on year 2014
working on year 2015
working on year 2016
working on year 2017
working on year 2018
working on year 2019
working on year 2020
working on year 2021
working on year 2022
Wall time: 8min 28s


In [16]:
# 13 years were requested; a length of 13 means no year needed the two-request fallback.
len(paevakorrad)

13

In [9]:
# 2 years were requested; a length of 2 means neither needed the two-request fallback.
len(paevakorrad2022)

2

## How many stenos do we have for each year

In [11]:
# Count agenda items and steno links per response for 2022-2023, then display the table.
df_stenos_links2022=paevakorrad_stenos_count(paevakorrad2022)
df_stenos_links2022

Unnamed: 0,agendaItems,have_stenos,stenos_links,stenos_n_unique_links
2022-01-01,945,552,[https://stenogrammid.riigikogu.ee/20220110150...,146
2023-01-01,600,295,[https://stenogrammid.riigikogu.ee/20230109150...,65


In [19]:
# Count agenda items and steno links per response for 2010-2022.
df_stenos_links=paevakorrad_stenos_count(paevakorrad)

In [20]:
# Display the per-response counts; the index is each response's weekStartDate.
df_stenos_links

Unnamed: 0,agendaItems,have_stenos,stenos_links,stenos_n_unique_links
2010-01-01,945,144,[https://stenogrammid.riigikogu.ee/20100111150...,144
2011-01-01,743,118,[https://stenogrammid.riigikogu.ee/20110110150...,118
2012-01-01,705,99,[https://stenogrammid.riigikogu.ee/20120111130...,99
2013-01-01,649,85,[https://stenogrammid.riigikogu.ee/20130115100...,85
2014-01-01,700,121,[https://stenogrammid.riigikogu.ee/20140113150...,121
2015-01-01,718,107,[https://stenogrammid.riigikogu.ee/20150112150...,107
2016-01-01,860,144,[https://stenogrammid.riigikogu.ee/20160111150...,144
2017-01-01,842,145,[https://stenogrammid.riigikogu.ee/20170109150...,145
2018-01-01,760,141,[https://stenogrammid.riigikogu.ee/20180108150...,141
2019-01-01,663,102,[https://stenogrammid.riigikogu.ee/20190114150...,102


## Get all unique steno links

In [21]:
# Flatten the per-response link lists, normalise the host name and de-duplicate.
# sorted() instead of list(set(...)): the iteration order of a set of strings
# changes between kernel sessions, which would make the [:900] / [900:]
# batches differ from run to run.
unique_steno_links=sorted({item.replace('hans-frontend.riigikogu.ee', 'stenogrammid.riigikogu.ee')
                           for sublist in df_stenos_links.stenos_links.to_list() for item in sublist})
len(unique_steno_links)

1493

In [12]:
# Flatten the per-response link lists, normalise the host name and de-duplicate.
# sorted() instead of list(set(...)) gives the same link order in every kernel
# session, so the scrape order and the key order of the saved JSON are
# reproducible.
unique_steno_links2022=sorted({item.replace('hans-frontend.riigikogu.ee', 'stenogrammid.riigikogu.ee')
                               for sublist in df_stenos_links2022.stenos_links.to_list() for item in sublist})
len(unique_steno_links2022)

211

In [22]:
# First batch: scrape the first 900 links; the remainder is fetched by a separate call.
%time stenos1=get_links_stenos(unique_steno_links[:900])

working on 0
working on 20
working on 40
working on 60
working on 80
working on 100
working on 120
working on 140
working on 160
working on 180
working on 200
working on 220
working on 240
working on 260
working on 280
working on 300
working on 320
working on 340
working on 360
working on 380
working on 400
working on 420
working on 440
working on 460
working on 480
working on 500
working on 520
working on 540
working on 560
working on 580
working on 600
working on 620
working on 640
working on 660
working on 680
working on 700
working on 720
working on 740
working on 760
working on 780
working on 800
working on 820
working on 840
working on 860
working on 880
Wall time: 12min 40s


In [23]:
# Second batch: every link from position 900 onwards.
%time stenos2=get_links_stenos(unique_steno_links[900:])

working on 0
working on 20
working on 40
working on 60
working on 80
working on 100
working on 120
working on 140
working on 160
working on 180
working on 200
working on 220
working on 240
working on 260
working on 280
working on 300
working on 320
working on 340
working on 360
working on 380
working on 400
working on 420
working on 440
working on 460
working on 480
working on 500
working on 520
working on 540
working on 560
working on 580
Wall time: 8min 33s


In [24]:
# Merge the second batch into stenos1 in place; stenos1 then holds both batches.
stenos1.update(stenos2)

In [25]:
# Sanity check: the merged dict should have one key per unique link.
len(stenos1)

1493

In [13]:
# Scrape all 2022-2023 links in a single call.
%time stenos2022=get_links_stenos(unique_steno_links2022)

working on 0
working on 20
working on 40
working on 60
working on 80
working on 100
working on 120
working on 140
working on 160
working on 180
working on 200
CPU times: total: 11.3 s
Wall time: 3min 55s


## Save

In [27]:
# Write the merged scrape (stenos1 after the update with stenos2) to disk as JSON.
with open('data/raw/stenos_from_links.json', 'w') as out_file:
    json.dump(stenos1, out_file)

In [14]:
# Write the 2022-2023 scrape to disk as JSON.
with open('data/raw/stenos_from_links2022.json', 'w') as out_file:
    json.dump(stenos2022, out_file)