In [28]:
#importing libraries
import requests
from bs4 import BeautifulSoup as bs
from requests import get
import pandas as pd
from datetime import date
from datetime import datetime
import time

#create timestamp from string in "mm/dd/yyyy" format
def maketimestamp(s, page = "police"):
    """Convert a date keyword or "mm/dd/yyyy" string into an integer timestamp.

    Parameters
    ----------
    s : str
        Either "start", "today", or a date written as "mm/dd/yyyy".
    page : str, optional
        Archive the timestamp is meant for. Only consulted when s == "start":
        "police" selects one fixed start timestamp, any other value selects
        the other one.

    Returns
    -------
    int
        Seconds since the epoch. For "today" and for explicit dates the value
        is computed with time.mktime, i.e. midnight in the machine's local
        time zone.

    Raises
    ------
    ValueError
        If s is not in "mm/dd/yyyy" format (raised by strptime), or if the
        date is earlier than 07/01/2010 or later than the current moment.
    """
    #fixed archive start timestamps
    #NOTE(review): presumably midnight US Central on 07/01/2010 and 06/01/2015 -- confirm
    police_start = 1277960400
    other_start = 1433134800

    #if input string is "start" or "today", return corresponding timestamp
    if s == "start":
        return police_start if page == "police" else other_start
    if s == "today":
        return int(time.mktime(date.today().timetuple()))

    #convert string into datetime format
    sdate = datetime.strptime(s, "%m/%d/%Y")

    #raise exception if date is outside range
    #the lower bound is the same for every page type; the upper bound is "now",
    #so today's date (which parses to midnight) is still accepted
    earliest = datetime.strptime("07/01/2010", "%m/%d/%Y")
    if sdate < earliest or sdate > datetime.today():
        raise ValueError("Date must be between July 1, 2010, and Today")

    #convert datetime to timestamp and return
    return int(time.mktime(sdate.timetuple()))

#scrapes all UCPD records of one archive type between two dates
#each date is "start", "today", or a string in "mm/dd/yyyy" format
#order of dates does not matter
def scrape(start, end, page):
    """Scrape every result page of one UCPD archive into a single dataframe.

    Parameters
    ----------
    start, end : str
        Range boundaries, each "start", "today", or "mm/dd/yyyy". They are
        converted with maketimestamp and swapped if given in reverse order.
    page : str
        Which archive to query: "police", "traffic" or "interviews".

    Returns
    -------
    pandas.DataFrame
        Rows of the first HTML table of every result page, concatenated in
        page order with a fresh 0..n-1 index. Empty if no page could be read.

    Raises
    ------
    ValueError
        If page is not one of the three known archive names, or if a date is
        rejected by maketimestamp.
    """
    #choose URL based on whether page is police, traffic or field interviews
    archive_urls = {
        "police": "https://incidentreports.uchicago.edu/incidentReportArchive.php?startDate=",
        "traffic": "https://incidentreports.uchicago.edu/trafficStopsArchive.php?startDate=",
        "interviews": "https://incidentreports.uchicago.edu/fieldInterviewsArchive.php?startDate=",
    }
    if page not in archive_urls:
        raise ValueError('page must be "police", "traffic" or "interviews", got ' + repr(page))

    #convert dates to timestamps
    startdate = maketimestamp(start, page)
    enddate = maketimestamp(end, page)

    #switch start and end if in opposite order
    if startdate > enddate:
        startdate, enddate = enddate, startdate

    #create query url from start and end dates
    url = archive_urls[page] + str(startdate) + "&endDate=" + str(enddate)

    #convert to beautiful soup and find element with page count
    #(kept in its own name so the `page` argument is not overwritten)
    response = get(url)
    soup = bs(response.content, 'html.parser')
    mydivs = soup.find_all("li", {"class": "page-count"})

    #convert page number html to a page num
    #NOTE(review): assumes the span text looks like "current / total" -- confirm against the site
    pagenum = str(mydivs[0].findChildren('span'))
    pagenum = int(pagenum.split("/")[1].split("<")[0])

    #iterate through all pages of the query, collecting one dataframe per page
    #the offset advances by 5 per page
    #NOTE(review): presumably 5 rows are shown per page -- confirm
    rows_per_page = 5
    frames = []
    for i in range(pagenum):
        #change URL for each page
        webpage = url + "&offset=" + str(i * rows_per_page)

        #best effort: report a page that cannot be read and keep going
        #(Exception rather than a bare except, so Ctrl-C still stops the scrape)
        try:
            frames.append(pd.read_html(webpage)[0])
        except Exception:
            print("Error! Broken Page: " + webpage)

    #nothing could be read: return an empty dataframe instead of failing
    if not frames:
        return pd.DataFrame()

    #combine all pages once (DataFrame.append no longer exists in pandas >= 2.0)
    #and renumber the rows from 0
    return pd.concat(frames, ignore_index=True)


#scrapes all UCPD Daily Incident reports between two days
def scrape_police(end = "today", start = "start"):
    """Return the UCPD daily incident reports between start and end.

    Each argument is "start", "today", or a date in "mm/dd/yyyy" format.
    Note the parameter order: end comes first, start second.
    """
    return scrape(start=start, end=end, page="police")

#scrapes all UCPD Traffic Stops between two days
def scrape_traffic(end = "today", start = "start"):
    """Return the UCPD traffic stops between start and end.

    Each argument is "start", "today", or a date in "mm/dd/yyyy" format.
    Note the parameter order: end comes first, start second.
    """
    return scrape(start=start, end=end, page="traffic")

#scrapes all UCPD Field Interviews between two days
def scrape_interviews(end = "today", start = "start"):
    """Return the UCPD field interviews between start and end.

    Each argument is "start", "today", or a date in "mm/dd/yyyy" format.
    Note the parameter order: end comes first, start second.
    """
    return scrape(start=start, end=end, page="interviews")

In [31]:
#test scrape: field interviews from the archive start through 12/31/2021
#NOTE: scrape_interviews takes (end, start), so "start" is passed as end here;
#the range is still correct because scrape() swaps the timestamps when they are out of order
df = scrape_interviews("start", "12/31/2021")

12/31/2021
start


In [32]:
#display the scraped dataframe as the cell output
df

Unnamed: 0,Date/Time,Location,Initiated By,Race,Gender,Reason for Stop,Disposition,Search
0,No field interviews for 06/01/2015,No field interviews for 06/01/2015,No field interviews for 06/01/2015,No field interviews for 06/01/2015,No field interviews for 06/01/2015,No field interviews for 06/01/2015,No field interviews for 06/01/2015,No field interviews for 06/01/2015
1,No field interviews for 06/02/2015,No field interviews for 06/02/2015,No field interviews for 06/02/2015,No field interviews for 06/02/2015,No field interviews for 06/02/2015,No field interviews for 06/02/2015,No field interviews for 06/02/2015,No field interviews for 06/02/2015
2,6/3/2015 1:40 PM,1601 E 53rd,Citizen request for UCPD Response,African American,Female,Citizen observed subject having a verbal argum...,Name checked; no further action,No
3,6/3/2015 1:40 PM,1601 E 53rd,Citizen request for UCPD Response,African American,Male,Citizen observed subject having a verbal argum...,Name checked; no further action,No
4,6/4/2015 8:21 PM,5245 S Cottage Grove,Citizen request for UCPD Response,African American,Male,Complainant advised subject acted suspicious (...,Name checked; no further action,No
...,...,...,...,...,...,...,...,...
2217,There were no incidents to report.,,,,,,,
2218,There were no incidents to report.,,,,,,,
2219,There were no incidents to report.,,,,,,,
2220,There were no incidents to report.,,,,,,,


In [33]:
#save the scraped field interviews to a CSV file (path is relative to the working directory)
df.to_csv("interviews_output.csv")