In [1]:
import csv
import json
import pandas as pd
import re
import string

from nltk import word_tokenize
from nltk.corpus import stopwords
# English stop-word set; format_name() uses it to drop filler words from titles.
# NOTE(review): stopwords.words() presumably needs the NLTK 'stopwords' corpus
# to be downloaded beforehand (nltk.download('stopwords')) — confirm on a
# fresh environment.
stops = set(stopwords.words('english'))

import pprint 
# Shared pretty-printer (2-space indent) used to inspect records further down.
pp = pprint.PrettyPrinter(indent=2)

# Presumably a project-local module; it supplies the raw faculty records.
from faculty import scrapeFacultyData

In [2]:
def fetch_formatted_faculty_data():
    """Fetch faculty records and annotate each one with first and last name.

    Calls ``scrapeFacultyData()`` and, for every record it yields, derives
    ``first_name`` and ``last_name`` from the record's ``name`` field and
    stores them on the record itself (the records are mutated in place; the
    ``name`` field is left untouched).

    The name is split on any run of whitespace (spaces, tabs, newlines,
    non-breaking spaces), so irregular spacing in scraped text does not end
    up inside the first or last name.

    Returns:
        The object returned by ``scrapeFacultyData()`` (presumably a list of
        dicts — confirm against the scraper), with the two extra keys added
        to every record.

    Raises:
        KeyError: if a record has no ``name`` key.
    """
    faculty_data = scrapeFacultyData()
    for faculty in faculty_data:
        # split() with no separator handles any whitespace and never yields
        # empty tokens, unlike split(' ').
        name_parts = faculty['name'].split()
        normalized_name = ' '.join(name_parts)

        # An empty/blank name yields empty strings rather than an IndexError.
        faculty_first_name = name_parts[0] if name_parts else ''

        # special case for sarah van wart: her last name is two words
        if normalized_name == 'Sarah Van Wart':
            faculty_last_name = 'Van Wart'
        else:
            faculty_last_name = name_parts[-1] if name_parts else ''

        # add name to dict
        faculty['first_name'] = faculty_first_name
        faculty['last_name'] = faculty_last_name

    return faculty_data

In [3]:
def fetch_and_parse_course_data(filepath):
    """Load the course CSV at ``filepath`` and normalise it for later steps.

    Processing steps:
      * column names are stripped, lower-cased and spaces become underscores;
      * only the columns ``qtr``, ``course``, ``title``, ``instructor``,
        ``course_topic_area``, ``days``, ``start``, ``end`` and ``duration``
        are kept;
      * quarter codes ``F``/``W``/``S`` are expanded to
        ``Fall``/``Winter``/``Spring`` (other values are left as they are);
      * ``days`` becomes a list of day tokens (``M``, ``Tu``, ``W``, ``Th``,
        ``F``) in the order they appear; a blank cell becomes ``[]``;
      * ``instructor`` becomes a list obtained by splitting on ``/``;
        ``STAFF`` or a blank cell becomes ``[]``;
      * missing ``course_topic_area`` values become the empty string.

    Args:
        filepath: path (or anything ``pandas.read_csv`` accepts) of the CSV.

    Returns:
        A new ``pandas.DataFrame`` with the nine columns listed above.

    Raises:
        KeyError: if one of the expected columns is missing from the file.
    """
    # load data
    data = pd.read_csv(filepath)

    # rename columns and remove any unnecessary ones
    data.columns = [name.strip().lower().replace(' ', '_') for name in data.columns]
    # .copy() makes the selection an independent frame, so the column
    # assignments below never write into a view of the raw data.
    data = data[['qtr', 'course', 'title', 'instructor', 'course_topic_area',
                 'days', 'start', 'end', 'duration']].copy()

    # convert quarter to full name
    # Assign the result back: in-place methods on a selected column are
    # chained assignment and do not update the frame under Copy-on-Write.
    data['qtr'] = data['qtr'].replace({'F': 'Fall', 'W': 'Winter', 'S': 'Spring'})

    # add a parsed day column
    # No capturing groups, so findall() returns the matched tokens directly.
    day_pattern = re.compile(r"[MWF]|Tu|Th")
    data['days'] = data['days'].apply(
        lambda value: day_pattern.findall(value) if isinstance(value, str) else []
    )

    # parse instructors (blank cells are read as NaN, which has no .split)
    data['instructor'] = data['instructor'].apply(
        lambda value: value.split('/') if isinstance(value, str) and value != 'STAFF' else []
    )

    # replace blank course_topic_area with empty string
    data['course_topic_area'] = data['course_topic_area'].fillna('')

    return data

In [4]:
def format_name(name_str):
    """Turn a human-readable title into a compact, punctuation-free identifier.

    The input is stripped and title-cased, every character in
    ``string.punctuation`` is removed, the text is tokenised with
    ``word_tokenize`` and tokens whose lower-case form is in the module-level
    ``stops`` set are dropped. The remaining tokens are concatenated with no
    separator.

    Args:
        name_str: the title to convert.

    Returns:
        The concatenated identifier string (empty if nothing remains).
    """
    # Translation table that deletes every ASCII punctuation character.
    punctuation_remover = str.maketrans('', '', string.punctuation)
    normalized_title = name_str.strip().title().translate(punctuation_remover)

    kept_tokens = []
    for token in word_tokenize(normalized_title):
        # Stop words are matched case-insensitively (tokens are title-cased).
        if token.lower() in stops:
            continue
        kept_tokens.append(token)

    return ''.join(kept_tokens)

def instructor_names(instructors, faculty_list):
    """Resolve instructor last names to faculty display names and IDs.

    Each entry of ``instructors`` is compared, case-insensitively and ignoring
    surrounding whitespace, with the ``last_name`` of every record in
    ``faculty_list``. The first matching record (in ``faculty_list`` order)
    wins; instructors with no match are skipped.

    Args:
        instructors: iterable of instructor last-name strings.
        faculty_list: iterable of records with ``last_name``, ``name`` and
            ``ID`` keys.

    Returns:
        A ``(names, ids)`` tuple of two parallel lists holding the ``name``
        and ``ID`` values of the matched records, in instructor order.
    """
    display_names, ontology_ids = [], []

    for raw_instructor in instructors:
        # next() stops at the first hit, mirroring a loop with break.
        matched = next(
            (
                member
                for member in faculty_list
                if raw_instructor.lower().strip() == member['last_name'].lower().strip()
            ),
            None,
        )
        if matched is None:
            continue
        display_names.append(matched['name'])
        ontology_ids.append(matched['ID'])

    return display_names, ontology_ids

def create_course_list(data, faculty_list):
    """Convert every row of the course frame into a plain course dict.

    For each row, ``instructor_names`` resolves the row's ``instructor`` value
    against ``faculty_list`` and ``format_name`` builds the compact form of
    the title; the remaining fields are copied from the row unchanged.

    Args:
        data: DataFrame with ``title``, ``course``, ``instructor``,
            ``course_topic_area``, ``days``, ``start``, ``end`` and
            ``duration`` columns.
        faculty_list: faculty records, passed through to ``instructor_names``.

    Returns:
        A list with one dict per row, in row order.
    """
    def build_record(row):
        # One course row -> one output dict.
        display_names, ontology_ids = instructor_names(row['instructor'], faculty_list)
        return {
            'pretty_name': row['title'],
            'formatted_name': format_name(row['title']),
            'course_number': row['course'],
            'formatted_instructors': display_names,
            'ontology_instructors': ontology_ids,
            'course_topic_area': row['course_topic_area'],
            'class_days': row['days'],
            'start_time': row['start'],
            'end_time': row['end'],
            'class_duration': row['duration'],
        }

    # The row index is not needed, only the row contents.
    return [build_record(row) for _, row in data.iterrows()]

In [5]:
def convert_to_krf(course_list):
    """Placeholder for turning the course list into KRF output.

    Not implemented yet: ``course_list`` is ignored and ``None`` is returned.
    """
    return None

In [6]:
def main(filepath='../data/cs-courses_2019-2020.csv'):
    """Run the pipeline: faculty records + course CSV -> list of course dicts.

    Args:
        filepath: location of the course CSV passed to
            ``fetch_and_parse_course_data``. Defaults to the 2019-2020 file
            used so far, so calling ``main()`` with no argument behaves as
            before.

    Returns:
        The value returned by ``create_course_list`` for the parsed course
        data and the formatted faculty list.
    """
    faculty_list = fetch_formatted_faculty_data()
    course_df = fetch_and_parse_course_data(filepath)

    return create_course_list(course_df, faculty_list)

In [7]:
# Run the pipeline defined in main() and keep the resulting list of course
# dicts; the bare name on the last line echoes it as the cell output.
course_list = main()
course_list

[{'pretty_name': 'Computer Science: Concepts, Philosophy, and Connections',
  'formatted_name': 'ComputerScienceConceptsPhilosophyConnections',
  'course_number': '101',
  'formatted_instructors': ['Larry Birnbaum'],
  'ontology_instructors': ['LarryBirnbaum'],
  'course_topic_area': '',
  'class_days': ['M', 'W', 'F'],
  'start_time': '14:00',
  'end_time': '14:50',
  'class_duration': '0:50'},
 {'pretty_name': 'Intro Programming for non-majors',
  'formatted_name': 'IntroProgrammingNonMajors',
  'course_number': '110',
  'formatted_instructors': ['Sarah Van Wart'],
  'ontology_instructors': ['SarahVanWart'],
  'course_topic_area': '',
  'class_days': ['M', 'W', 'F'],
  'start_time': '10:00',
  'end_time': '10:50',
  'class_duration': '0:50'},
 {'pretty_name': 'Intro Programming for non-majors',
  'formatted_name': 'IntroProgrammingNonMajors',
  'course_number': '110',
  'formatted_instructors': ['Sarah Van Wart'],
  'ontology_instructors': ['SarahVanWart'],
  'course_topic_area': '',

In [8]:
# Inspect a single record with the shared PrettyPrinter (indent=2); the
# printed keys appear in sorted order rather than insertion order.
pp.pprint(course_list[0])

{ 'class_days': ['M', 'W', 'F'],
  'class_duration': '0:50',
  'course_number': '101',
  'course_topic_area': '',
  'end_time': '14:50',
  'formatted_instructors': ['Larry Birnbaum'],
  'formatted_name': 'ComputerScienceConceptsPhilosophyConnections',
  'ontology_instructors': ['LarryBirnbaum'],
  'pretty_name': 'Computer Science: Concepts, Philosophy, and Connections',
  'start_time': '14:00'}
