# Syllabi Parsing

By author: Mingfei Ma<br>
Date: 1/31/2021

In [1]:
import pandas as pd
import re
import PyPDF2
import os

## Define a function to catch all features

In [19]:
# Placeholder stored for a text feature whose pattern does not occur.
_NOT_FOUND = "no searches"

# Text features, in output-column order: result key -> compiled pattern.
# For each one, the first match found in the text is reported verbatim.
_TEXT_PATTERNS = {
    # First two consecutive alphabetic words in the text.
    "name": re.compile(r"[A-Za-z]+\s+[A-Za-z]+"),
    # Dots are allowed in the local part and the domain may have several
    # labels, so "j.doe@cs.uni.edu" is captured whole.
    "Emails": re.compile(r"[a-zA-Z0-9_.-]+@[a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)+"),
    # Ten digits, either contiguous or grouped as 3-3-4 with dashes.
    "phone number": re.compile(r"[1-9]\d{9}|[1-9]\d{2}-\d{3}-\d{4}"),
    # First weekday name mentioned anywhere in the text.
    "office_hour": re.compile(
        r"Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday"
    ),
    # Time range such as "6:00-8:00 pm"; whitespace (including a line
    # break) is tolerated on either side of the dash.
    "Schedule": re.compile(
        r"[0-1]?[0-9]:[0-5][0-9]\s*-\s*[0-1]?[0-9]:[0-5][0-9]\s+[a-z]m"
    ),
    # A number followed by the word "assignments".
    "assignments number": re.compile(r"[0-9]+\s+assignments"),
}

# Yes/No features: each keyword is also the result key.
# NOTE: this is a plain substring test, so "exam" also matches "example".
_KEYWORD_FLAGS = ("exam", "prerequisite", "textbook", "online")


def syllabus_characteristics(content):
    """Extract a fixed set of features from the text of one syllabus.

    Parameters
    ----------
    content : str
        Full text of the syllabus.

    Returns
    -------
    dict
        Keys, in order: "name", "Emails", "phone number", "office_hour",
        "Schedule", "assignments number", "exam", "prerequisite",
        "textbook", "online". The first six hold the first matching
        substring, or "no searches" when nothing matches. The last four
        hold "Yes" or "No" depending on whether the keyword occurs in the
        text, ignoring case.
    """
    feature = dict()

    for key, pattern in _TEXT_PATTERNS.items():
        match = pattern.search(content)
        feature[key] = match.group() if match else _NOT_FOUND

    # Lower-case once; the keywords themselves are already lower-case.
    lowered = content.lower()
    for keyword in _KEYWORD_FLAGS:
        feature[keyword] = "Yes" if keyword in lowered else "No"

    return feature

## Create an empty DataFrame to receive the features

In [20]:
# Accumulator table: starts empty and receives the extracted features of each syllabus.
features_retrieved = pd.DataFrame()

## Read files and extract features

In [21]:
# Names of all entries in the "syllabi/" directory (bare names, without the directory prefix).
files= os.listdir("syllabi/")

In [22]:
# Keep only the PDF files. The extension is compared case-insensitively so
# that names such as "S1.PDF" are not silently skipped.
flst = [
    file for file in files
    if os.path.splitext(file)[1].lower() == ".pdf"
]

In [23]:
# Prefix every file name with its directory so the path can be opened directly.
flst = [f"syllabi/{name}" for name in flst]

In [24]:
# Echo the list of PDF paths that will be parsed.
flst

['syllabi/s2.pdf', 'syllabi/s3.pdf', 'syllabi/s1.pdf']

In [25]:
# Parse every PDF and collect one dict of extracted features per syllabus.
rows = []
for pf in flst:
    # The context manager closes the file even if PDF parsing raises.
    with open(pf, 'rb') as pdf:
        # NOTE(review): PdfFileReader / numPages / getPage / extractText are
        # the legacy PyPDF2 names; newer releases rename them (PdfReader,
        # pages, extract_text) - confirm against the installed version.
        pdfReader = PyPDF2.PdfFileReader(pdf)
        # Join the text of all pages into one string for pattern matching.
        text = "".join(
            pdfReader.getPage(number).extractText()
            for number in range(pdfReader.numPages)
        )
    # Extract the features once per file.
    rows.append(syllabus_characteristics(text))

# DataFrame.append was removed in pandas 2.0; concatenate all new rows in a
# single step, keeping whatever the accumulator already holds.
features_retrieved = pd.concat(
    [features_retrieved, pd.DataFrame(rows)], ignore_index=True
)

In [26]:
# Echo the table of extracted features, one row per syllabus.
features_retrieved

Unnamed: 0,name,Emails,phone number,office_hour,Schedule,assignments number,exam,prerequisite,textbook,online
0,Derick Jones,das@safh.com,209-946-3221,Sunday,6:00\n-8:00 pm,5 assignments,No,No,No,Yes
1,Lance Jones,das@safh.com,no searches,Sunday,6:00\n-8:00 pm,55 assignments,No,No,No,No
2,David Jones,das@safh.com,2099463221,Sunday,6:00\n-8:00 pm,5 assignments,No,Yes,Yes,Yes


## Save dataframe as .csv file

In [10]:
# Write the feature table to "features-retrieved.csv" in the working directory.
# With the default arguments the row index is written as the first column.
features_retrieved.to_csv("features-retrieved.csv")