## Prologue

In [1]:
%%capture
!pip install camelot-py
!apt install ghostscript python3-tk
!pip install ghostscript

In [2]:
from google.colab import drive
import os
import camelot.io as camelot
import pandas as pd
from datetime import datetime

# Attach Google Drive; force_remount=True re-mounts even when a mount already exists.
drive.mount('/content/drive/', force_remount=True)
# Limit rendered DataFrames to six columns.
pd.options.display.max_columns = 6

Mounted at /content/drive/


## Parsing into Tables

In [3]:
column_names = 'Mon', 'Tues', 'Wed', 'Thurs', 'Fri'
dict_of_schedules = {}  # leader name -> weekly schedule DataFrame (one column per weekday)
aaiv_leaders = []       # leader names, in the order their files are processed

schedules_path = '/content/drive/MyDrive/UTK_Schedule/schedules'
# os.listdir returns entries in arbitrary order; sorting keeps re-runs reproducible.
for file in sorted(os.listdir(schedules_path)):
  # Only PDF schedules are parsed; anything else in the folder is skipped.
  if not file.lower().endswith('.pdf'):
    continue
  schedule_path = os.path.join(schedules_path, file)
  tables = camelot.read_pdf(schedule_path)  # TABULATE
  df = tables[0].df  # only the first detected table is used

  # 'first-last.pdf' -> 'First Last'
  name = str(file).split(sep='.')[0].replace('-', ' ').title()
  aaiv_leaders.append(name)

  # Row 0 and column 0 are skipped; columns 1-5 are taken as Mon..Fri.
  # Selecting by explicit label list raises KeyError if a weekday column is missing.
  schedule = df.loc[1:, [1, 2, 3, 4, 5]].reset_index(drop=True)
  schedule.columns = list(column_names)

  # CLEANING: the patterns must be given as a dict (pattern -> replacement).
  # A set literal such as {'SHOWN', ''} is not a mapping, so pandas never
  # performs the substitution.
  # 'ONLINE-AS' is intentionally left in place: the processing cell reads the
  # third whitespace-separated token of each entry as the location, and
  # stripping it would shift the class name into that position.
  schedule = schedule.replace({'\n': '', 'SHOWN': ''}, regex=True)

  dict_of_schedules[name] = schedule  # APPENDING DATAFRAME VALUE TO NAME KEY
dict_of_schedules['Samuel Sui'].head()

Unnamed: 0,Mon,Tues,Wed,Thurs,Fri
0,8:00A- 8:50A MKB-421 Computer Science:302 Lab,9:45A- 11:00A MKB-404 Computer Science:302 Lec...,9:10A- 10:00A DOU-416 Electrical and Computer ...,9:45A- 11:00A MKB-404 Computer Science:302 Lec...,9:10A- 10:00A DOU-416 Electrical and Computer ...
1,9:10A- 10:00A DOU-416 Electrical and Computer ...,2:30P- 3:45P MKB-524 Computer Science:311 Class,10:20A- 11:10A AYR-124 Mathematics:251 Class,2:30P- 3:45P MKB-524 Computer Science:311 Class,10:20A- 11:10A AYR-124 Mathematics:251 Class
2,10:20A- 11:10A AYR-124 Mathematics:251 Class,,,,


## Cleaning Data

### Parsing Time

In [4]:
def time_parse(data):
  """Parse the start and end time tokens of one schedule entry.

  Args:
    data: Sequence of string tokens. data[0] is the start time with a
      trailing dash (e.g. '8:00A-') and data[1] is the end time
      (e.g. '8:50A'). Further tokens are ignored. The sequence is not
      modified.

  Returns:
    A two-element list [start_time, end_time] of datetime.time objects.

  Raises:
    IndexError: If data holds fewer than two tokens.
    ValueError: If a token does not match '%I:%M%p' once 'M' is appended.
  """
  # Build new strings instead of writing back into data, so the caller's
  # token list stays intact.
  start_token = data[0].replace('-', '') + 'M'  # '8:00A-' -> '8:00AM'
  end_token = data[1] + 'M'                     # '8:50A'  -> '8:50AM'

  # %I accepts one- and two-digit hours, so no zero-padding is required.
  return [
    datetime.strptime(token, "%I:%M%p").time()
    for token in (start_token, end_token)
  ]

### Processing into DataFrame

In [5]:
# Flatten every leader's weekly grid into one row per class meeting.
# Rows are collected in a plain list and the DataFrame is built once at the
# end; appending with .loc[len(df)] re-allocates the frame on every insert.
records = []

for leader, schedule in dict_of_schedules.items():  # Dict of Schedules (DataFrames)
  for day, entries in schedule.items():  # one Series per weekday column
    for entry in entries:  # one cell = one class meeting (or an empty string)
      data = entry.split()
      if not data:
        continue  # blank cell: nothing scheduled in this slot
      if 'SHOWN' in data:
        data.remove('SHOWN')  # drop the stray token so positions below line up
      start_time, end_time = time_parse(data)
      # Token layout: [start, end, location, class name words...]
      records.append([day, leader, data[2], ' '.join(data[3:]), start_time, end_time])

times_df = pd.DataFrame(
  records,
  columns=['Day of Week', 'Name', 'Location', 'Class', 'Start Time', 'End Time'],
)
times_df.to_csv('times.csv')
times_df.head()

Unnamed: 0,Day of Week,Name,Location,Class,Start Time,End Time
0,Mon,Samuel Sui,MKB-421,Computer Science:302 Lab,08:00:00,08:50:00
1,Mon,Samuel Sui,DOU-416,Electrical and Computer Engr:313 Class,09:10:00,10:00:00
2,Mon,Samuel Sui,AYR-124,Mathematics:251 Class,10:20:00,11:10:00
3,Tues,Samuel Sui,MKB-404,Computer Science:302 Lecture,09:45:00,11:00:00
4,Tues,Samuel Sui,MKB-524,Computer Science:311 Class,14:30:00,15:45:00
