In [None]:
import csv ## how we will store the data portably
import re ## this is for helping find the bill text
from urllib.parse import urljoin ## resolves links found on a page against that page's URL

import pandas as pd  ## Dataframe manipulation
import requests
from bs4 import BeautifulSoup as bs ## how we read the info from the request
from requests_html import HTMLSession ## We dont have to manipulate the webpage at all, so a chrome based scraper is not needed

In [None]:
def searchDF(df, column, query, case=True, regex=True):
    """Return the rows of ``df`` whose ``column`` value contains ``query``.

    Values are converted to strings before matching, so non-text columns
    (booleans, numbers, status flags) can be searched as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    column : str
        Name of the column to search.
    query : str
        Text to look for. Interpreted as a regular expression unless
        ``regex`` is False.
    case : bool, default True
        Whether the match is case-sensitive.
    regex : bool, default True
        Whether ``query`` is a regular expression (True) or a literal string.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df``, with their original index labels.
    """
    values = df[column]
    # Missing values must never match: once stringified they become 'nan' /
    # 'None', which would otherwise make queries such as 'an' hit empty cells.
    present = values.notna()
    matches = values.astype(str).str.contains(query, case=case, regex=regex, na=False)
    return df[present & matches]


In [None]:
## check for datafile in local folder, or create it
fileName = 'mnLaws.csv'
header = ['LegislatureName', 'LegislatureUrl', 'sessionYear', 'sessionType',
          'sessionUrl', 'chapter', 'chapterUrl', 'bill', 'billUrl', 'text',
          'PresentmentDate', 'scrapeComplete']
## The DOM breaks it down into bill sections and subdivisions,
## but we just need the text for todays project


def loadOrCreateDataFile(path, columns):
    """Load the CSV at ``path``, creating a header-only file first if needed.

    Parameters
    ----------
    path : str
        Location of the CSV data file.
    columns : list of str
        Column names written as the header row when the file has to be created.

    Returns
    -------
    pandas.DataFrame
        The contents of the file; an empty frame with ``columns`` when the
        file was just created.
    """
    try:
        return pd.read_csv(path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # Only a missing or empty file triggers creation; any other failure
        # (for example a malformed file) is left to surface to the reader.
        with open(path, 'w', newline='') as csvfile:
            # Default comma delimiter, so pd.read_csv parses the header into
            # separate columns.
            csv.writer(csvfile).writerow(columns)
        # Read back only after the file has been closed (and therefore flushed).
        return pd.read_csv(path)


##TODO ContinueScrape() when the file already exists  ####   Demo is set up as if starting from 0 #####
dataframe = loadOrCreateDataFile(fileName, header)
dataframe.head()

In [None]:
## One shared session object is used for every request in the notebook.
htmlRequester = HTMLSession()
# r = requests.get('https://www.revisor.mn.gov/laws/')  ## if just using requests, not requestsHTML, maybe we can test the speed
##                                                      ## or efficiency of these three methods (requests, html, chromium)
## The timeout (seconds) keeps this cell from hanging forever if the server never answers.
r = htmlRequester.get('https://www.revisor.mn.gov/laws/', timeout=30)
print(r.status_code, r.encoding)
## Parse the landing page; later cells read the table rows out of `soup`.
soup = bs(r.text, 'html.parser')

In [None]:
## Exploring the data a bit

In [None]:
## Sanity check: show the <title> element of the page that was just parsed
soup.title

In [None]:
## The browser's inspect tool shows the rows we want carry the CSS class "alternate"
legislatures = soup.find_all('tr', class_='alternate')
legislatures[0]

## TODO the first row in the table does not have the alternate tag

In [None]:
## Link target (href) of the first anchor that has one, in the first matched row
legislatures[0].find('a', href=True)['href']

In [None]:
## Text of the second cell in the first matched row, with surrounding newlines/spaces trimmed
legislatures[0].find_all('td')[1].text.strip('\n ')

In [None]:
## DataFrames are used to track scraping progress. For a more polished project the scrapes
## should be started by another program, so we could use more computing power / threads
## and make the scraper resilient to unexpected page issues.

## One row per legislature
sessionTracker = pd.DataFrame(columns=['LegislatureName', 'LegislatureUrl'])
print(sessionTracker)

## One row per yearly session within a legislature
yearlysessionTracker = pd.DataFrame(
    columns=['LegislatureName', 'sessionYear', 'sessionType', 'sessionUrl']
)
print(yearlysessionTracker)

In [None]:
## COLLECTING LIST OF LEGISLATURES  (not turned into functions because each page is too different)

## Rows are gathered in a list and the frame is built once at the end, so
## re-running this cell reproduces the same table instead of appending
## duplicates to whatever an earlier run left in sessionTracker.
legislatureRows = []
for session in legislatures:
    cells = session.find_all('td')
    link = session.find('a', href=True)
    if len(cells) < 2 or link is None:
        ## Not a legislature row (no name cell or no link): skip instead of crashing
        continue
    name = cells[1].text.strip('\n ')
    url = link['href'].strip('/')  ## drop the leading '//' so only host + path are stored
    legislatureRows.append({'LegislatureName': name, 'LegislatureUrl': url})

## TODO We are missing the 92nd legislature: it has a different layout (only the current
## year has this layout). It is added manually here; whether to fix this programmatically
## depends on the use -- filling a data lake would justify more time than analyzing bill text.
if not any(row['LegislatureName'] == '92nd Legislature' for row in legislatureRows):
    legislatureRows.insert(0, {'LegislatureName': '92nd Legislature',
                               'LegislatureUrl': 'www.revisor.mn.gov/laws/92.0'})

## dtype=object keeps the columns as plain Python strings, like the empty tracker frames
sessionTracker = pd.DataFrame(legislatureRows,
                              columns=['LegislatureName', 'LegislatureUrl'],
                              dtype=object)
sessionTracker.head()

In [None]:
# Collecting List of Sessions in a Legislature

## Demo cap on how many legislature pages are requested;
## set it to len(sessionTracker.index) for a complete scrape.
legislatureLimit = 15

## Rows are gathered in a list and the frame is built once, so re-running the
## cell does not append duplicates to yearlysessionTracker.
sessionRows = []
## min(...) keeps the loop inside the table when it has fewer rows than the cap
for iterator in range(min(legislatureLimit, len(sessionTracker.index))):
    sessionName = sessionTracker.loc[iterator, 'LegislatureName']
    url = sessionTracker.loc[iterator, 'LegislatureUrl']

    ## Timeout (seconds) so a stalled server cannot hang the loop
    r = htmlRequester.get('http://' + url, timeout=30)
    soup = bs(r.text, 'html.parser')
    sessions = soup.find_all('p', 'p_session')
    for session in sessions:
        yearlySession = session.find_all('a')[0].text.replace('\n', '').strip(' ')

        ## The link text starts with the 4-character year; the type starts at offset 6
        sessionYear = yearlySession[0:4]
        sessionType = yearlySession[6:].strip(' ')
        ## Read the link from THIS session's paragraph. Reading it from sessions[0]
        ## gave every session of a legislature the first session's URL.
        sessionUrl = session.find_all('a', href=True)[0]['href'].strip('/')
        sessionRows.append([sessionName, sessionYear, sessionType, sessionUrl])

yearlysessionTracker = pd.DataFrame(
    sessionRows,
    columns=['LegislatureName', 'sessionYear', 'sessionType', 'sessionUrl'],
    dtype=object,  ## keep year etc. as the scraped strings
)

In [None]:
## Show the first 25 collected sessions
yearlysessionTracker.head(25)

In [None]:
## One row per bill; 'text' and 'scrapeComplete' are filled in by the bill-text scrape further down
billTracker = pd.DataFrame(
    columns=[
        'sessionUrl', 'chapter', 'chapterUrl', 'bill',
        'billUrl', 'text', 'PresentmentDate', 'scrapeComplete',
    ]
)
billTracker.head()

In [None]:
# Collecting List Bills in  a session

## Demo cap on how many session pages are requested;
## set it to len(yearlysessionTracker.index) for a complete scrape.
sessionLimit = 15

## Rows are gathered in a list and the frame is built once, so re-running the
## cell does not append duplicates to billTracker.
billRows = []
## min(...) keeps the loop inside the table when it has fewer rows than the cap
for iterator in range(min(sessionLimit, len(yearlysessionTracker.index))):
    sessionUrl = yearlysessionTracker.loc[iterator, 'sessionUrl']
    ## Timeout (seconds) so a stalled server cannot hang the loop
    r = htmlRequester.get('http://' + sessionUrl, timeout=30)
    soup = bs(r.text, 'html.parser')
    chapters = soup.find_all('tr')
    for chapter in chapters[1:]:  ## the first <tr> is skipped
        ## Reset every field per row, so a row that fails to parse is recorded with
        ## what was actually read from it, never with values from the previous row.
        chapterName = chapterUrl = BillName = BillUrl = PresentmentDate = None
        text = ""
        scrapeStatus = False
        try:
            links = chapter.find_all('a', href=True)
            chapterUrl = links[0]['href'].strip('/')
            chapterName = links[0].text[7:]
            BillUrl = 'revisor.mn.gov/' + links[1]['href'].strip('/')
            BillName = links[1].text
            PresentmentDate = chapter.find_all('td')[2].text
        except IndexError:
            ## Row lacks the two links or the third cell: flag it and carry on
            ## with the remaining rows of this page.
            scrapeStatus = 'ERROR'
        billRows.append([sessionUrl, chapterName, chapterUrl, BillName, BillUrl,
                         text, PresentmentDate, scrapeStatus])

## dtype=object lets scrapeComplete hold False, True and the 'ERROR' marker side by side
billTracker = pd.DataFrame(billRows, columns=billTracker.columns, dtype=object)
## TODO ASK DOMAIN EXPERT ABOUT CHAPTERS, for now we just continue towards bill text

In [None]:
## Rows that failed to parse are flagged with the string 'ERROR' in scrapeComplete.
## The search is case-sensitive, so the query has to use the same capitalisation.
searchDF(billTracker, 'scrapeComplete', 'ERROR')


In [None]:
## First rows of the collected bill list
billTracker.head()

In [None]:
## Finally we made it down to the bill page, which has a lot of useful information on it.
## For today, we are only going to collect the bill text.

In [None]:
## Demo cap on how many bills are scraped; set it to len(billTracker.index) for a complete scrape.
billLimit = 5

## min(...) keeps the loop inside the table when it has fewer rows than the cap
for iterator in range(min(billLimit, len(billTracker.index))):
    billUrl = billTracker.loc[iterator, 'billUrl']
    print(billUrl)

    try:
        ## Step 1: the bill page links to the bill text from its first "card-body" element
        r = htmlRequester.get('http://' + billUrl, timeout=30)
        soup = bs(r.text, 'html.parser')
        billcard = soup.find_all(class_="card-body")
        ## Resolve the link against the page it came from. str.strip('/bills') removes
        ## any of the characters '/', 'b', 'i', 'l', 's' from BOTH ends, not the
        ## '/bills' prefix, so it could eat the start or end of the real path.
        billtextUrl = urljoin(r.url, billcard[0].find('a')['href'])
        print(billtextUrl)

        ## Step 2: the text page keeps the bill text in the element with id="document"
        r = htmlRequester.get(billtextUrl, timeout=30)
        soup = bs(r.text, 'html.parser')
        billText = soup.find(id='document').text.replace('\n', '')
    except (IndexError, KeyError, TypeError, AttributeError):
        ## Missing URL, card, link or document element: flag the row and keep going.
        ## Network errors are deliberately not caught here.
        billTracker.at[iterator, 'scrapeComplete'] = 'ERROR'
        continue

    ## Columns are addressed by name rather than by position
    billTracker.at[iterator, 'text'] = billText
    billTracker.at[iterator, 'scrapeComplete'] = True


In [None]:
## Print the text stored for the first bill, then show that bill's full row
print(billTracker.loc[0, 'text'])
billTracker.head(1)


In [None]:
## Quick look at all three trackers joined together. With no arguments, merge does an
## inner join on the column names the two frames share. The result is displayed, not stored.
sessionTracker.merge(yearlysessionTracker.merge(billTracker))

In [None]:
## First row of the legislature table
sessionTracker.head(1)

In [None]:
## First row of the yearly-session table
yearlysessionTracker.head(1)

In [None]:
## First row of the bill table
billTracker.head(1)

In [None]:
## Outer join keeps legislatures that have no collected sessions (and vice versa);
## 'LegislatureName' is the only column the two trackers have in common.
legislatureSessionCombo = pd.merge(
    sessionTracker,
    yearlysessionTracker,
    how='outer',
    on='LegislatureName',
)
legislatureSessionCombo.head(10)

In [None]:
legislatureSessionCombo.tail(10)  ## At this point I realized that the territorial legislatures were broken somehow.
# Examining the page showed a different style; fixing it is not needed at this point.

In [None]:
## Attach the bills to their sessions; the outer join keeps sessions without collected bills
finalDF = pd.merge(legislatureSessionCombo, billTracker, how='outer', on='sessionUrl')

In [None]:
## First ten rows of the combined table
finalDF.head(10)

In [None]:
## Save to the same data file the notebook loads at the top (fileName == 'mnLaws.csv').
## index=False: the row index carries no information, and writing it would add an
## extra 'Unnamed: 0' column the next time the file is read with pd.read_csv.
finalDF.to_csv(fileName, index=False)

In [None]:
## Example query: rows whose bill text contains 'transportation'
searchDF(finalDF, 'text', 'transportation')

In [None]:
## The search bar on the revisor site finds 'eagle' in the 92nd Legislature,
## 2021 1st Special Session. It could not be found here because the match is in
## the CHAPTER text, not the bill text. TODO: find out what a chapter is.
searchDF(finalDF, 'text', 'eagle')

In [None]:
# Thank you for reading! This scraper is not totally complete,