# Setup

In [None]:
# Scripts are written for Python 3.9.6
# Install the project's dependencies as listed in requirements.txt.
%pip install -r requirements.txt

## Please also complete the manual steps listed in the string below.
## (It is a plain string literal: as the cell's last expression it is echoed as output.)
'''
1. Change the Request-Header main/src/hys_portal_scraper.py#L27 to your own E-Mail Address
2. Make sure you set a wait-time above 10 sec
3. Create a MySQL-Database and specify its configs in: main/src/database/database_connection.py

'''

In [1]:
# Setup: import the scraper classes, the seedlist handler and the DB connection helper
# used by the cells below.
from src.hys_portal_scraper import Portal_Scraper
from src.scrapers.initiative_scraper import Initiative_Scraper
from src.scrapers.feedback_scraper import Feedback_Scraper
from src.scrapers.attachment_scraper import Attachment_Scraper

from src.database.seedlist_handler import SeedList_Handler

from src.database.database_connection import database_connection
# IPython autoreload extension in mode 2, so edits to the imported src modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2



# Initialize Database
The database structure is defined in src/database/database_connection.py

In [None]:
# Run this cell only once: it initialises the database session with
# create_db=True, which creates a new database.
con = database_connection()
portal_scraper = Portal_Scraper(con)
portal_scraper.init_database_session(create_db=True)

# Upsert Seedlist to Database
The seedlist is a .txt document with one URL per line. Each URL points to an initiative's homepage, e.g. <br> "https://ec.europa.eu/info/law/better-regulation/have-your-say/initiatives/1362-Access-to-Social-Protection_en"

In [None]:
# Feed the seedlist file to the handler. Per the original note, this creates a
# table named Seedlist and fills it with initiative_id and further columns.
con = database_connection()
seedlist_path = "data/seedlist_manual_all_140424_additional-Inis.txt"
seedlist_handler = SeedList_Handler(connection=con)
seedlist_handler.insert_seedlist(seedlist_path)

# Scrape Initiatives and Stage Metadata
Input: all initiative IDs that, according to the seedlist, have not been scraped yet.

In [None]:
'''
1. Scrape all initatives where `initiative_updated` is Null in Seedlist Table
2. Upsert initative metadata in Initatives Table
3. Upsert stage metadata in Stages Table
4. Update 'initiative_updated' in Seedlist with current (GMT) time
'''

con = database_connection()

# wait_time = time between requests (the setup notes ask for more than 10 sec)
initiative_scraper = Initiative_Scraper(connection=con, wait_time=10)
initiative_scraper.scrape_all()

In [None]:
## Display the number of stages in the DB per month of their published date
import pandas as pd
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
from src.database.database_connection import database_connection

# database_connection() is passed straight to create_engine, so it presumably
# returns a SQLAlchemy connection URL -- confirm in database_connection.py.
engine = create_engine(database_connection(), echo=False, echo_pool=False)

stage_dates_published = pd.read_sql("SELECT published_date FROM stages", engine)

# Number of stages per distinct published_date, in chronological order.
stage_dates_published = stage_dates_published.value_counts("published_date")
stage_dates_published = stage_dates_published.sort_index(ascending=True)

# Add up the per-date counts within each calendar month. Using agg("count")
# here would only count how many distinct dates fall into a month and ignore
# how many stages share each date, understating the monthly totals.
stage_dates_published = stage_dates_published.resample('M').sum()

plt.plot(stage_dates_published.index, stage_dates_published.values)
plt.xticks(rotation='vertical')
plt.show()

# Scrape Feedbacks and Attachment-Metadata

In [None]:
'''
1. Scrape all Stages where `feedback_updated` is Null in Stages Table
2. Upsert feedbacks to Feedbacks Table
3. Upsert attachment metadata to Attachments Table
4. Update 'feedback_updated' in Stages Table with current time
'''

con = database_connection()

# wait_time = time between requests
feedback_scraper = Feedback_Scraper(connection=con, wait_time=10)
feedback_scraper.scrape_all()

### Scrape Specific Stages

In [2]:
con = database_connection()

# Stage IDs earmarked for targeted scraping.
# NOTE(review): this list is not read by the call below, which targets a single
# hard-coded stage id -- confirm whether iterating the queue was intended.
stage_id_queue = [
    31234550,  # done
    7929317,
    26519622,
    32232670,
    25987338,
    32438558,
]

# wait_time = time between requests
feedback_scraper = Feedback_Scraper(connection=con, wait_time=10, stage_id="7929317")
feedback_scraper.scrape_feedback()

logging     : INFO     gmtime = 2024-06-21 18:59:31
logging     : INFO     Wait Time set to 10 sec.
logging     : INFO     Scraping Feedback of 81 Stages ...

logging     : INFO     Scraping Stage: 26519622
logging     : INFO     ETA of Stage-Data 19:41:40
logging     : INFO     Scraped Page 481/7090
logging     : INFO     Scraped Page 482/7090
logging     : INFO     Scraped Page 483/7090
logging     : INFO     Scraped Page 484/7090
logging     : INFO     Scraped Page 485/7090
logging     : INFO     Scraped Page 486/7090
logging     : INFO     Scraped Page 487/7090
logging     : INFO     Scraped Page 488/7090
logging     : INFO     Scraped Page 489/7090
logging     : INFO     Scraped Page 490/7090
logging     : INFO     Scraped Page 491/7090
logging     : INFO     Scraped Page 492/7090
logging     : INFO     Scraped Page 493/7090
logging     : INFO     Scraped Page 494/7090
logging     : INFO     Scraped Page 495/7090
logging     : INFO     Scraped Page 496/7090
logging     : INFO     

AttributeError: 'tuple' object has no attribute 'tb_frame'

# Scrape Attachments

In [3]:
con = database_connection()

# Scrape the attachments belonging to a single stage.
attachment_scraper = Attachment_Scraper(con, 5)
attachment_scraper.scrape_all_of_stage(stage_id="31234550")

# 090166e5007b5a73 is a scanned document and could not be scraped...

NameError: name 'con' is not defined

In [None]:
con = database_connection()

# NOTE(review): an earlier draft hinted at scraping a single document via
# Attachment_Scraper(connection=..., wait_time=..., document_id=...).scrape_attachments
# -- confirm that API in src/scrapers/attachment_scraper.py before relying on it.

# Scrape all attachments.
attachment_scraper = Attachment_Scraper(con, 5)
attachment_scraper.scrape_all()