# Web scraper for UM academic calendar data

@Author: [Jeff Lockhart](http://www-personal.umich.edu/~jwlock/)

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import time
import re

Example URLs (a term code — `ss`, `fa`, or `wn` — followed by the two-digit year):

- http://ro.umich.edu/calendar/ss17.php
- http://ro.umich.edu/calendar/fa18.php
- http://ro.umich.edu/calendar/wn10.php

In [None]:
def exam(txt):
    """Flag whether an event description mentions exams.

    The input is coerced to ``str`` and lower-cased before matching, so
    non-string values (``None``, NaN, ...) are accepted.

    Returns:
        1 if the substring 'exam' occurs in the text, otherwise 0.
    """
    return int('exam' in str(txt).lower())

def class_start(txt):
    """Flag whether an event description marks classes starting.

    The input is coerced to ``str`` and lower-cased. The text must contain
    'classes' together with either 'begin' or 'resume'.

    Returns:
        1 when both conditions hold, otherwise 0.
    """
    lowered = str(txt).lower()
    if 'classes' not in lowered:
        return 0
    return int('begin' in lowered or 'resume' in lowered)

def class_stop(txt):
    """Flag whether an event description marks classes stopping.

    The input is coerced to ``str`` and lower-cased, then checked in this
    order of precedence:

    * text containing 'classes' counts only if it also contains 'end';
    * otherwise, any text containing 'recess' counts;
    * otherwise, text containing 'vacation' counts only if it also
      contains 'begin'.

    Returns:
        1 when the event is a stop, otherwise 0.
    """
    lowered = str(txt).lower()
    #'classes' takes precedence: the recess/vacation checks are skipped
    if 'classes' in lowered:
        return int('end' in lowered)
    if 'recess' in lowered:
        return 1
    if 'vacation' in lowered:
        return int('begin' in lowered)
    return 0

def get_dates(txt, y):
    """Parse the leading "<word> <digits>" of a calendar cell into a date.

    Args:
        txt: Cell text; coerced to ``str``. Only a leading run of word
            characters, one whitespace character and a run of digits
            (e.g. "May 2") is used; anything after it is ignored.
        y: Year appended to the matched text before parsing.

    Returns:
        The result of ``pd.to_datetime`` on "<match>, <y>", or ``None`` when
        the text does not start with the expected pattern.
    """
    #raw string: \w, \s and \d are regex classes, not string escapes
    #(a non-raw literal triggers an invalid-escape warning on modern Python)
    m = re.search(r'^(\w+\s\d+)', str(txt))
    if m is None:
        return None

    return pd.to_datetime(m.group(1) + ', ' + str(y))

def get_table(page, y):
    """Extract the calendar table of a term page into a DataFrame.

    Args:
        page: HTML of the calendar page (handed to BeautifulSoup).
        y: Year used to complete the dates parsed from the 'times' column.

    Returns:
        DataFrame with the columns 'event', 'times', 'date', 'exams',
        'class_start' and 'class_stop'. Rows containing any null value
        (e.g. rows without <td> cells, or without a parseable date) are
        dropped. The frame is empty when the page contains no <table>.
    """
    labels = ['event', 'times']

    #parse page with bs4 and select just the first table on it
    soup = BeautifulSoup(page, 'html.parser')
    table = soup.find('table')

    data = []
    if table is not None:
        #each row is one calendar event
        for r in table.find_all('tr'):
            #zip stops after len(labels) cells, so a row with extra <td>
            #cells is truncated instead of raising IndexError
            data.append({label: cell.text
                         for label, cell in zip(labels, r.find_all('td'))})

    #convert our findings to a dataframe; naming the columns explicitly
    #keeps both present even when no row supplied them
    df = pd.DataFrame(data, columns=labels)

    df['date'] = df.times.apply(get_dates, y=y)
    df['exams'] = df.event.apply(exam)
    df['class_start'] = df.event.apply(class_start)
    df['class_stop'] = df.event.apply(class_stop)

    #drop the pesky null rows
    df = df.dropna(axis=0, how='any')

    return df

In [None]:
#scrape the fall and winter calendar pages for every year into `df`
terms = ['fa', 'wn']
years = range(2003, 2019)
base_url = 'http://ro.umich.edu/calendar/'
end_url = '.php'

#collect one frame per page and concatenate once at the end, rather than
#re-copying the accumulated data on every iteration
frames = []

for y in years:
    for t in terms:
        #get the page for this term (term code + two-digit year)
        url = base_url+t+str(y)[2:]+end_url
        try:
            #timeout so a stalled connection cannot hang the whole crawl
            r = requests.get(url, timeout=30)
        except requests.RequestException as err:
            #network failure: report it and keep going with the next page
            print('Error with', url, err)
        else:
            #if the page exists
            if r.status_code == 200:
                print('Processing', url)
                frames.append(get_table(r.content, y))
            else:
                #some years don't have data. Ignore them and move on.
                print('Error with', url)

        #wait to be a polite lil spider
        time.sleep(2)

#pd.concat raises on an empty list, so fall back to an empty frame
df = pd.concat(frames) if frames else pd.DataFrame()
df.shape

In [None]:
#preview the first rows of the scraped calendar
df.head()

In [None]:
#sort our data chronologically and peek at it.
df = df.sort_values(by='date')
df.head()

In [None]:
#write the calendar gathered so far to a tab-separated file, without the index
calendar_path = '../data/UM_academic_calendar_no_summer.tsv'
df.to_csv(calendar_path, sep='\t', index=False)

In [None]:
#walk the (date-sorted) calendar and pair every class start with the last
#class stop seen before the next start
data = []
tmp = {}

for _, row in df.iterrows():
    if row.class_start == 1:
        #a new period begins: store the one we were building (if any)
        if tmp:
            data.append(tmp)
        tmp = {'class_start': row.date}
    elif row.class_stop == 1:
        #a later stop overwrites an earlier one within the same period
        tmp['class_end'] = row.date

#the period still being built when the rows run out must be stored too,
#otherwise the final class period is silently lost
if tmp:
    data.append(tmp)

#naming the columns keeps both present even if nothing was collected;
#periods missing either a start or an end are dropped
classes = pd.DataFrame(data, columns=['class_start', 'class_end'])
classes = classes.dropna(axis=0)
classes

In [None]:
#write the class periods to a tab-separated file, without the index
periods_path = '../data/UM_class_periods_no_summer.tsv'
classes.to_csv(periods_path, sep='\t', index=False)

In [None]:
#scrape the 'ss' term pages for every year and add them to the existing `df`
terms = ['ss']
years = range(2003, 2019)
base_url = 'http://ro.umich.edu/calendar/'
end_url = '.php'

#start from the rows already in `df`; new pages are collected here and
#concatenated once at the end instead of once per iteration
frames = [df]

for y in years:
    for t in terms:
        #get the page for this term (term code + two-digit year)
        url = base_url+t+str(y)[2:]+end_url
        try:
            #timeout so a stalled connection cannot hang the whole crawl
            r = requests.get(url, timeout=30)
        except requests.RequestException as err:
            #network failure: report it and keep going with the next page
            print('Error with', url, err)
        else:
            #if the page exists
            if r.status_code == 200:
                print('Processing', url)
                frames.append(get_table(r.content, y))
            else:
                #some years don't have data. Ignore them and move on.
                print('Error with', url)

        #wait to be a polite lil spider
        time.sleep(2)

df = pd.concat(frames)
df.shape

In [None]:
#sort our data chronologically and peek at it.
df = df.sort_values(by='date')
df.head()

In [None]:
#write the full calendar to a tab-separated file, without the index
calendar_path = '../data/UM_academic_calendar.tsv'
df.to_csv(calendar_path, sep='\t', index=False)

In [None]:
#walk the (date-sorted) calendar and pair every class start with the last
#class stop seen before the next start
data = []
tmp = {}

for _, row in df.iterrows():
    if row.class_start == 1:
        #a new period begins: store the one we were building (if any)
        if tmp:
            data.append(tmp)
        tmp = {'class_start': row.date}
    elif row.class_stop == 1:
        #a later stop overwrites an earlier one within the same period
        tmp['class_end'] = row.date

#the period still being built when the rows run out must be stored too,
#otherwise the final class period is silently lost
if tmp:
    data.append(tmp)

#naming the columns keeps both present even if nothing was collected;
#periods missing either a start or an end are dropped
classes = pd.DataFrame(data, columns=['class_start', 'class_end'])
classes = classes.dropna(axis=0)
classes

In [None]:
#write the class periods to a tab-separated file, without the index
periods_path = '../data/UM_class_periods.tsv'
classes.to_csv(periods_path, sep='\t', index=False)

In [None]:
#spot check: show every calendar row dated in 2015
in_2015 = df['date'].dt.year == 2015
df.loc[in_2015]

In [None]:
#spot check: show the class periods that start in 2014
starts_in_2014 = classes['class_start'].dt.year == 2014
classes.loc[starts_in_2014]