In [22]:
#importing libraries
import requests
from bs4 import BeautifulSoup as bs
from requests import get
import pandas as pd
from datetime import date
from datetime import datetime
import time

#create timestamp from string in "mm/dd/yyyy" format
def maketimestamp(s):
    """Convert a date string into an integer Unix timestamp.

    Parameters
    ----------
    s : str
        A date in "mm/dd/yyyy" format, or one of two keywords:
        "start" (returns the fixed timestamp 1277960400) or
        "today" (returns midnight of the current local date).

    Returns
    -------
    int
        Timestamp in whole seconds, computed with ``time.mktime`` (i.e. the
        date is interpreted as midnight in the machine's local timezone).

    Raises
    ------
    ValueError
        If ``s`` does not match "mm/dd/yyyy" (raised by ``strptime``), or if
        the date is earlier than July 1, 2010 or later than the current moment.
    """
    #keywords skip parsing and range validation entirely
    if s == "start":
        #NOTE(review): presumably midnight July 1, 2010 in US Central time,
        #so it may differ from maketimestamp("07/01/2010") elsewhere -- confirm
        return 1277960400
    if s == "today":
        return int(time.mktime(date.today().timetuple()))

    #convert string into datetime format
    sdate = datetime.strptime(s, "%m/%d/%Y")

    #reject dates outside the supported range; ValueError matches what
    #strptime raises for malformed input and is still an Exception subclass
    earliest = datetime(2010, 7, 1)
    if sdate < earliest or sdate > datetime.today():
        raise ValueError("Date must be between July 1, 2010, and Today")

    #convert datetime to timestamp and return
    return int(time.mktime(sdate.timetuple()))

#scrapes all UCPD incidents between two dates
def scrape(end = "today", start = "start"):
    """Scrape all UCPD incident reports between two dates into one DataFrame.

    With no dates, scrapes between the start date and today.
    With one date, scrapes between the start date and the provided date.
    With two dates, scrapes between the provided dates.
    Dates must be in "mm/dd/yyyy" format (or the keywords accepted by
    ``maketimestamp``); the order of the two dates does not matter.

    Parameters
    ----------
    end : str
        One bound of the date range (default "today").
    start : str
        The other bound of the date range (default "start").

    Returns
    -------
    pandas.DataFrame
        The first table of every result page, stacked in page order with a
        fresh 0..n-1 index. Empty if the archive reports zero pages.

    Raises
    ------
    Exception
        Whatever ``maketimestamp`` raises for an invalid date, and any error
        from fetching/parsing the first result page. Failures on later pages
        are reported with a printed message and that page is skipped.
    """
    #convert dates to timestamps
    startdate = maketimestamp(start)
    enddate = maketimestamp(end)

    #switch start and end if in opposite order
    if startdate > enddate:
        (startdate, enddate) = (enddate, startdate)

    #create query url from start and end timestamps
    url = "https://incidentreports.uchicago.edu/incidentReportArchive.php?startDate=" \
        + str(startdate) + "&endDate=" + str(enddate)

    #convert to beautiful soup and find the list item holding the page count
    page = get(url)
    soup = bs(page.content, 'html.parser')
    mydivs = soup.find_all("li", {"class": "page-count"})

    #convert page number html to a page count: the text between the first
    #"/" and the following "<" (presumably markup like "1 / N" -- confirm)
    pagenum = str(mydivs[0].findChildren('span'))
    pagenum = int(pagenum.split("/")[1].split("<")[0])

    #collect one dataframe per page, then concatenate once at the end
    #(DataFrame.append was removed in pandas 2.0, and appending in a loop
    #re-copies all earlier rows on every iteration)
    frames = []
    for i in range(pagenum):
        #change URL for each page; offset advances by 5 per page
        #(presumably the archive lists 5 incidents per page -- confirm)
        webpage = url + "&offset=" + str(i * 5)

        #the first page is not guarded: a failure here propagates to the caller
        if i == 0:
            frames.append(pd.read_html(webpage)[0])
            continue

        #later pages are best-effort: report the broken page and keep going;
        #catching Exception (not a bare except) leaves Ctrl-C working
        try:
            frames.append(pd.read_html(webpage)[0])
        except Exception:
            print("Error! Broken Page: " + webpage)

    #nothing to concatenate when the archive reports zero pages
    if not frames:
        return pd.DataFrame()

    #stack all pages with a fresh index and return dataframe
    return pd.concat(frames, ignore_index=True)

In [25]:
#test scrape: from the archive start ("start" keyword) through 07/30/2010
df = scrape(end="07/30/2010", start="start")

In [24]:
#display the scraped incidents (the notebook renders a truncated preview of the frame)
df

Unnamed: 0,Incident,Location,Reported,Occurred,Comments / Nature of Fire,Disposition,UCPDI#
0,Lost Property,5810 S. University (Quad),7/1/10 12:42 PM,6/28/10 2:45 PM,Woman reports losing a Canon Power Shot digita...,Closed,W0731
1,Criminal Damage to Vehicle,61st & Drexel,7/1/10 2:01 PM,6/29/10 to 7/1/10 3:00 PM to 1:50 PM,Unknown person smashed windshield of a rental ...,Open,W0732
2,Theft from Motor Vehicle,60th between Ingleside & Ellis,7/1/10 2:48 PM,6/29/10 to 7/1/10 5:30 PM to 12:30 PM,Unknown person smashed rear passenger's side w...,Open,W0733
3,Theft,60th and Ellis,7/1/10 3:13 PM,6/23/10 to 6/29/10 9:00 AM to 5:00 PM,Bicycle taken from bike rack,Open,W0734
4,Chemical Spill,924 E. 57th St. (BSLC),7/1/10 6:15 PM,7/1/10 6:15 PM,"Bottle slipped from man's hands, broke causing...",Closed,W0735
...,...,...,...,...,...,...,...
156,Theft,55th between Cornell and Hyde Park,7/30/10 12:13 PM,7/30/10 12:10 PM,Unknown male customer grabbed money from cash ...,Open,W0887
157,VOID,Void,Void,Void,Void,Void,W0888
158,Theft,1115 E. 58th St. (Walker),7/30/10 3:05 PM,7/30/10 2:00 PM to 3:00 PM,Two bicycles chained together were taken from ...,Open,W0890
159,Theft,5812 S. Ellis (Brain Research),7/30/10 4:05 PM,7/30/10 9:00 AM to 3:45 PM,Unknown person took secured bicycle from bike ...,Open,W0889


In [None]:
#save the scraped incidents to newoutput.csv (path is relative to the working directory)
#note: to_csv's default index=True also writes the row index as the first column
df.to_csv("newoutput.csv")