In [1]:
import csv
import json
import pandas as pd
import re
import string

from nltk import word_tokenize
from nltk.corpus import stopwords
stops = set(stopwords.words('english'))

import pprint 
pp = pprint.PrettyPrinter(indent=2)

from faculty import scrapeFacultyData

In [2]:
# Scrape the faculty records, then annotate each one with a first and last
# name derived from its whitespace-trimmed display name.
faculty_data = scrapeFacultyData()

for person in faculty_data:
    full_name = person['name'].strip()
    name_parts = full_name.split(' ')

    # first token is the first name
    person['first_name'] = name_parts[0]

    # last token is the last name, except for Sarah Van Wart, whose
    # surname is two words and is therefore spelled out explicitly
    person['last_name'] = (
        'Van Wart' if full_name == 'Sarah Van Wart' else name_parts[-1]
    )

In [3]:
# load data
data = pd.read_csv('../data/cs-courses_2019-2020.csv')

# rename columns and remove any unnecessary ones
data.columns = [name.strip().lower().replace(' ', '_') for name in list(data)]
# .copy() makes the selection an independent frame, so the column assignments
# below never write into (or warn about) a slice of the raw frame
data = data[['qtr', 'course', 'title', 'instructor', 'course_topic_area', 'days', 'start', 'end', 'duration']].copy()

# convert quarter to full name
# (assign the result back: an inplace call on `data['qtr']` only updates a
# temporary Series under pandas Copy-on-Write and would leave `data` unchanged)
data['qtr'] = data['qtr'].replace({'F': 'Fall', 'W': 'Winter', 'S': 'Spring'})

# add a parsed day column
# findall returns one tuple per match because the pattern has groups; element 0
# is the outer group, i.e. the full day token. Blank (non-string) cells become
# an empty list instead of raising inside re.findall.
regex = r"([MWF]|(Tu)|(Th))"
data['days'] = data['days'].apply(
    lambda x: [match[0] for match in re.findall(regex, x)] if isinstance(x, str) else []
)

# parse instructors
# 'STAFF' and blank (non-string) cells both mean "no named instructor"
data['instructor'] = data['instructor'].apply(
    lambda x: x.split('/') if isinstance(x, str) and x != 'STAFF' else []
)

# replace blank course_topic_area with empty string
data['course_topic_area'] = data['course_topic_area'].fillna('')

# show data
data.head()

Unnamed: 0,qtr,course,title,instructor,course_topic_area,days,start,end,duration
0,Fall,101,"Computer Science: Concepts, Philosophy, and Co...",[Birnbaum],,"[M, W, F]",14:00,14:50,0:50
1,Fall,110,Intro Programming for non-majors,[Van Wart],,"[M, W, F]",10:00,10:50,0:50
2,Spring,110,Intro Programming for non-majors,[Van Wart],,"[M, W, F]",11:00,11:50,0:50
3,Winter,110,Intro Programming for non-majors,[Van Wart],,"[Tu, Th]",16:00,17:20,1:20
4,Fall,111,Fundamentals of Computer Programming 1,"[Horswill, Wilson]",,"[M, W, F]",13:00,13:50,0:50


In [4]:
def format_name(name_str):
    """Collapse a human-readable title into a single identifier-like string.

    Steps, in order: trim surrounding whitespace, title-case the text, delete
    every character in ``string.punctuation``, tokenize with NLTK's
    ``word_tokenize``, drop tokens whose lowercase form is in the module-level
    ``stops`` set, and concatenate the survivors with no separator.

    Example: 'Computer Science: Concepts, Philosophy, and Connections'
    becomes 'ComputerScienceConceptsPhilosophyConnections'.

    Parameters
    ----------
    name_str : str
        The title to format.

    Returns
    -------
    str
        The concatenated, stopword-free tokens.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    title_cased = name_str.strip().title()
    without_punctuation = title_cased.translate(punctuation_table)

    kept_tokens = []
    for token in word_tokenize(without_punctuation):
        if token.lower() in stops:
            continue
        kept_tokens.append(token)

    return ''.join(kept_tokens)

def instructor_names(instructors):
    """Resolve instructor last names against the module-level ``faculty_data``.

    Each entry in ``instructors`` is compared, case-insensitively and with
    surrounding whitespace ignored, to every faculty record's ``last_name``.
    The first matching record wins; entries with no match are skipped.

    Parameters
    ----------
    instructors : iterable of str
        Last names to look up.

    Returns
    -------
    tuple of (list, list)
        The matched records' ``name`` values and their ``ID`` values, in the
        order the instructors were given.
    """
    display_names, ontology_ids = [], []

    for last_name in instructors:
        # first faculty record whose last name matches, or None
        matched = next(
            (
                person
                for person in faculty_data
                if last_name.lower().strip() == person['last_name'].lower().strip()
            ),
            None,
        )
        if matched is None:
            continue

        display_names.append(matched['name'])
        ontology_ids.append(matched['ID'])

    return display_names, ontology_ids

# Build one plain dict per course row, resolving the instructor last names to
# full names / ontology IDs and attaching the identifier-style course title.
course_list = []

for _, row in data.iterrows():
    instructor_display, instructor_ids = instructor_names(row['instructor'])
    title = row['title']

    # keyword order fixes the key order of each record
    course_list.append(dict(
        pretty_name=title,
        formatted_name=format_name(title),
        course_number=row['course'],
        formatted_instructors=instructor_display,
        ontology_instructors=instructor_ids,
        course_topic_area=row['course_topic_area'],
        class_days=row['days'],
        start_time=row['start'],
        end_time=row['end'],
        class_duration=row['duration'],
    ))

In [5]:
# Spot-check the conversion: pretty-print the first course record.
pp.pprint(course_list[0])

{ 'class_days': ['M', 'W', 'F'],
  'class_duration': '0:50',
  'course_number': '101',
  'course_topic_area': '',
  'end_time': '14:50',
  'formatted_instructors': ['Larry Birnbaum'],
  'formatted_name': 'ComputerScienceConceptsPhilosophyConnections',
  'ontology_instructors': ['LarryBirnbaum'],
  'pretty_name': 'Computer Science: Concepts, Philosophy, and Connections',
  'start_time': '14:00'}
