In [1]:
import requests
from datetime import datetime
import pandas as pd
from bs4 import BeautifulSoup
from math import isnan
import re
from os import listdir

## Cornell Data Collection

In [2]:
def clean_dates(dates, year1, year2):
    '''
    converts the start/end date strings of one calendar row into epochs,
    assigning each date to the matching calendar year of the school year

    dates: sequence of soup tags whose .text holds a date such as 'Aug 27';
           dates[0] is the start date, dates[1] (if present) is the end date
    year1: int of first possible school year
    year2: int of second possible school year

    returns (start_date, start_date_epoch, end_date, end_date_epoch)
    start_date / end_date are the original strings (they carry no year)
    the epochs are timestamps of those days with the school year applied
    end_date and end_date_epoch are None when there is no usable end date
    '''
    def to_datetime(text):
        # Parse against a fixed leap year: strptime's implicit default year
        # (1900) is not a leap year, so a bare 'Feb 29' would be rejected.
        parsed = datetime.strptime('{} 2000'.format(text), '%b %d %Y')
        return parsed.replace(year=get_current_school_year(parsed, year1, year2))

    end_date = None
    end_date_epoch = None
    # A missing second tag, or one whose text is not a 'Mon DD' date,
    # means the row has no end date.
    if len(dates) > 1 and re.match(r'[A-Za-z]+ \d\d$', dates[1].text):
        end_date = dates[1].text
        end_date_epoch = to_datetime(end_date).timestamp()

    start_date = dates[0].text
    start_date_epoch = to_datetime(start_date).timestamp()
    return start_date, start_date_epoch, end_date, end_date_epoch

In [3]:
def check_if_class_related(event_name, remove_instructions=False):
    '''
    filters out events not closure-related, specific to Cornell academic Calendar

    event_name: str title of the calendar event (matched case-insensitively)
    remove_instructions: if True, events mentioning 'instruction' are rejected too

    returns True if the event mentions one of the class-related keywords and
    none of the excluded keywords, False otherwise
    '''
    keywords = ('no classes', 'recess', 'break', 'classes', 'instruction begins')
    excluded = ('week', 'instruction') if remove_instructions else ('week',)
    name = event_name.lower()

    # Exclusions are a veto: test every one of them before accepting a
    # keyword, so the result does not depend on the order of either tuple.
    if any(bad_word in name for bad_word in excluded):
        return False
    return any(word in name for word in keywords)

In [4]:
def get_school_year(date, return_both_years=False):
    '''
    returns the (first year, second year) pair of the school year containing date

    date: object with .month and .year attributes (e.g. a datetime)
    return_both_years: accepted but not used; both years are always returned
    '''
    # Jan-Jun belongs to the school year that began the previous calendar year;
    # Jul-Dec belongs to the school year that begins in the current one.
    first_year = date.year - 1 if 1 <= date.month <= 6 else date.year
    return (first_year, first_year + 1)

In [5]:
def get_current_school_year(date, year1, year2):
    '''
    picks which of the two school years the given date belongs to

    date: object with a .month attribute (e.g. a datetime)
    year1: the earlier year of the school year
    year2: the later year of the school year

    returns year2 for January through June, year1 for July through December
    '''
    if 1 <= date.month <= 6:
        # spring half of the school year
        return year2
    # fall half of the school year
    return year1

In [6]:
def get_cornell_calendar(year1, year2, timeout=30):
    '''
    Scrapes the Cornell registrar calendar for key academic dates
    returns DataFrame of each school-closure related date
    DataFrame includes cols Start Date, Start Date Epoch, End Date, End Date Epoch, and the Event Name

    year1: int, first year of the school year (used in the calendar URL)
    year2: int, second year of the school year (used in the calendar URL)
    timeout: seconds to wait for the registrar website before giving up

    raises RuntimeError if the website answers with an error status

    earliest school year you can use is 2016-2017
    '''
    url = 'https://registrar.cornell.edu/calendar/{}-{}'.format(year1, year2)
    # Without a timeout, requests can wait indefinitely on an unresponsive server.
    r = requests.get(url, timeout=timeout)
    if not r.ok:
        raise RuntimeError('Error connecting to registrar website')

    # Name the parser explicitly so the result does not depend on which
    # optional parser packages happen to be installed.
    soup = BeautifulSoup(r.text, 'html.parser')

    rows = []
    for row in soup.find_all('div', {'class': 'calendar-row'}):
        title_tag = row.find('div', {'class': 'calendar-title'})
        if title_tag is None:
            # row without a title cannot be classified
            continue
        event_title = title_tag.text

        # only add school-closure related classes
        if not check_if_class_related(event_title):
            continue

        dates = row.find_all('time')
        if not dates:
            # row without any date cannot be placed on the calendar
            continue
        start_date, start_date_epoch, end_date, end_date_epoch = clean_dates(dates, year1, year2)
        rows.append([start_date, start_date_epoch, end_date, end_date_epoch, event_title])
    return pd.DataFrame(rows, columns=['Start Date', 'Start Date Epoch', 'End Date', 'End Date Epoch', 'Event Name'])

## Main Function

In [7]:
def check_if_in_calendar(df, date):
    '''
    checks whether a date falls on any event of the calendar dataframe

    df: dataframe with 'Start Date Epoch' and 'End Date Epoch' columns;
        a missing End Date Epoch (NaN or None) marks a single-day event
    date: datetime object of date you want to check if it is in calendar

    returns True if date is the day of a single-day event or lies inside a
    ranged event (both end days inclusive), False otherwise
    '''
    # compare calendar days only, ignoring the time of day
    day = date.date()
    for _, row in df.iterrows():
        start_day = datetime.fromtimestamp(row['Start Date Epoch']).date()
        end_epoch = row['End Date Epoch']

        # pd.isna accepts both NaN and None; math.isnan raises TypeError on None.
        if pd.isna(end_epoch):
            # not a ranged date, only check if the day is the same
            end_day = start_day
        else:
            end_day = datetime.fromtimestamp(end_epoch).date()

        if start_day <= day <= end_day:
            return True
    return False

In [8]:
def pop_school_year_range(df):
    '''
    Returns two tuples with the start and end epochs of the fall semester and
    of the spring semester, i.e. (start_date, end_date) each, plus a dataframe
    with the semester start/end rows removed

    df: dataframe of dates scraped from academic calendar; needs the columns
        'Event Name' and 'Start Date Epoch'

    The first '... instruction begins' / 'last day of classes' pair found is
    reported as fall and the second as spring, so df is expected to be in
    calendar order.

    raises RuntimeError if fewer than two semester starts or ends are found
    '''
    first_days = []
    last_days = []
    keep = []

    for event, start_epoch in zip(df['Event Name'], df['Start Date Epoch']):
        event_name = event.lower()
        is_last_day = 'last day of classes' in event_name
        # semester boundary rows are not closures, so drop them from the result
        keep.append(not (is_last_day or 'instruction' in event_name))
        if re.match(r'(fall|spring) \d\d instruction begins', event_name):
            first_days.append(start_epoch)
        if is_last_day:
            last_days.append(start_epoch)

    # Both semesters are indexed below, so one match of each kind is not enough.
    if len(first_days) < 2 or len(last_days) < 2:
        raise RuntimeError('Could not find a start or end date')

    # Positional boolean mask: keeps the original columns even if no row survives.
    mask = pd.Series(keep, dtype=bool).values
    cleaned_df = df.loc[mask]
    return (first_days[0], last_days[0]), (first_days[1], last_days[1]), cleaned_df

In [9]:
def check_if_class(timestamp, data_path='./data/academic-calendar-csvs'):
    '''
    checks if classes are in session on the day of the given timestamp
    does not include summer or winter sessions (classes during break)

    timestamp: epoch seconds of the moment to check
    data_path: directory where the scraped calendars are cached as
               '<year1>-<year2>.csv'; created if it does not exist

    returns True on a weekday inside the fall or spring semester that is not
    a closure in the calendar, False otherwise

    raises RuntimeError for school years before 2016-2017

    earliest school year you can use is 2016-2017
    '''
    # function-scope import: the file only imports `listdir` from os
    import os

    date = datetime.fromtimestamp(timestamp)

    # Saturday (6) and Sunday (7) never have classes
    if date.isoweekday() > 5:
        return False

    # get Cornell academic calendar
    year1, year2 = get_school_year(date)
    if year1 <= 2015:
        raise RuntimeError('No holiday data available. The earliest school year available is 2016-2017.')

    csv_path = os.path.join(data_path, '{}-{}.csv'.format(year1, year2))
    if os.path.isfile(csv_path):
        df = pd.read_csv(csv_path)
    else:
        df = get_cornell_calendar(year1, year2)
        os.makedirs(data_path, exist_ok=True)
        # index=False keeps the row index out of the cache, so reading it
        # back does not add an 'Unnamed: 0' column
        df.to_csv(csv_path, index=False)

    # check if in the school year range
    fall_range, spring_range, df = pop_school_year_range(df)
    # Compare calendar days, not datetimes: the stored epochs have no time
    # of day, so any later time on the last day of classes would otherwise
    # fall outside the semester.
    day = date.date()
    in_semester = any(
        datetime.fromtimestamp(first).date() <= day <= datetime.fromtimestamp(last).date()
        for first, last in (fall_range, spring_range)
    )
    if not in_semester:
        return False

    # check if holiday
    return not check_if_in_calendar(df, date)

In [10]:
# Example: ask whether classes are in session on May 13, 2020 (00:00 local time).
# This reads the cached 2019-2020 calendar CSV, or scrapes the registrar site if it is not cached yet.
check_if_class(datetime(2020, 5, 13).timestamp())

False