In [28]:
from bs4 import BeautifulSoup
import unicodedata
import requests
import re
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy import Column
from sqlalchemy import String
from sqlalchemy import Integer
from sqlalchemy import MetaData

In [30]:
# Engine pointing at the SQLite database file test_db.db.
engine = create_engine("sqlite:///test_db.db")

# Session factory bound to that engine, plus the single working session
# that the later cells use to stage rows and commit them.
Session = sessionmaker(bind=engine)
session = Session()

In [None]:
# Declarative base class that mapped table classes inherit from.
Base = declarative_base()


class User(Base):
    """ORM mapping for the 'user' table.

    Each row carries an integer primary key plus three text columns:
    title, date and details.
    """

    __tablename__ = 'user'

    # Integer primary key column, explicitly named 'id'.
    id = Column('id', Integer, primary_key=True)
    # Text columns declared with lengths 100, 50 and 1000 respectively.
    title = Column(String(100))
    date = Column(String(50))
    details = Column(String(1000))

    def __init__(self, title, date, details):
        """Store the three text fields on the new row object."""
        self.title, self.date, self.details = title, date, details

    def __repr__(self):
        """Render the row as 'title - date - details' (used when printing)."""
        return f'{self.title} - {self.date} - {self.details}'



In [None]:
# Ask the metadata collected on Base to create its tables through the engine.
Base.metadata.create_all(bind=engine)

In [None]:
# Download the committee agendas page.
# A timeout is set because requests waits indefinitely by default, which would
# hang the notebook if the server never answers.
page = requests.get(
    'https://www1.nyc.gov/site/manhattancb1/meetings/committee-agendas.page',
    timeout=30,
)
# Stop here on an HTTP error (4xx/5xx) instead of parsing an error page below.
page.raise_for_status()

In [None]:
# Parse the downloaded HTML with Python's built-in html.parser backend.
soup = BeautifulSoup(page.text, features='html.parser')

In [None]:
# Locate the element with class 'about-description' and collect its <p> tags.
# soup.find returns None when nothing matches, so check explicitly and fail
# with a clear message rather than an AttributeError on None.
description = soup.find(class_='about-description')
if description is None:
    raise ValueError(
        "Could not find an element with class 'about-description' in the page; "
        "the page layout may have changed."
    )
paragraphs = description.find_all('p')

In [None]:
# Group the <p> tags into events. A paragraph whose text starts with a
# "digits/digits" date prefix opens a new event; every following paragraph
# belongs to that event until the next dated paragraph.
DATE_PREFIX_RE = re.compile(r"^(\d*[/]\d*)")

events = []
event_num = None  # index of the event being collected; None until the first date is seen
event = []        # paragraphs of the event being collected

for par in paragraphs:
    if DATE_PREFIX_RE.search(par.text):
        if event_num is None:
            # First dated paragraph: anything collected before it is preamble
            # and is discarded by the reset below.
            event_num = 0
        else:
            # A new date closes the event collected so far.
            events.append(event)
            event_num += 1
        event = []

    event.append(par)

# Flush the final event: no later date follows it to trigger the append above,
# so without this step the last event on the page would be lost.
if event_num is not None:
    events.append(event)


In [None]:
# Patterns for the leading "digits/digits" date and the first clock time
# (e.g. "6:00 PM") found in an event's text.
DATE_RE = re.compile(r"^(\d*[/]\d*)")
TIME_RE = re.compile(r"((1[0-2]|0?\d):(\d\d) ([AaPp][Mm]))")


def parse_event(event):
    """Turn one event (a non-empty list of <p> tags) into a dict of text fields.

    Returns a dict with the keys 'date', 'time', 'title' and 'details'.
    'date' and 'time' are None when their pattern is not found, so one
    oddly formatted event no longer aborts the whole run.
    """
    event_string = ''.join(tag.text for tag in event)

    date_match = DATE_RE.search(event_string)
    # The time pattern needs a plain space before AM/PM, so map non-breaking
    # spaces to ordinary spaces for this search only.
    time_match = TIME_RE.search(event_string.replace('\xa0', ' '))

    # The title is the first <b> tag of the first paragraph; if there is none,
    # fall back to that paragraph's stripped text instead of failing on None.
    bold = event[0].find('b')
    title = bold.text if bold is not None else event[0].text.strip()

    # Details are the text of every paragraph after the first, with
    # non-breaking spaces removed.
    details = ''.join(tag.text for tag in event[1:])
    details = details.strip().replace('\xa0', '')

    return {
        'date': date_match.group(0) if date_match else None,
        'time': time_match.group(0) if time_match else None,
        'title': title,
        'details': details
    }


# Parsed events keyed by their position in `events`.
events_dict = {i: parse_event(event) for i, event in enumerate(events)}
    

In [None]:
#events_dict

In [None]:
# Build one User row per parsed event and write them in a single transaction.
# NOTE: re-running this cell inserts the same rows again; nothing here
# de-duplicates against rows already in the table.
rows = [
    User(title=event['title'], date=event['date'], details=event['details'])
    for event in events_dict.values()
]

try:
    session.add_all(rows)
    session.commit()
except Exception:
    # A failed flush/commit leaves the session needing a rollback before it can
    # be used again; roll back so later cells still work, then re-raise.
    session.rollback()
    raise

In [None]:
# Show every stored row, one per line, using User.__repr__.
for record in session.query(User).all():
    print(record)