In [1]:
import os
import pandas as pd
from PyPDF2 import PdfReader

In [2]:
def extract_text_from_pdfs(directory_path):
    data = {'filename': [], 'text': []}
    
    for filename in os.listdir(directory_path):
        if filename.endswith(".pdf"):
            file_path = os.path.join(directory_path, filename)
            
            # Extract text from PDF
            text = ""
            with open(file_path, "rb") as pdf_file:
                pdf_reader = PdfReader(pdf_file)
                for page in pdf_reader.pages:
                    text += page.extract_text() or ""
                    
            # Append filename and text to data
            data['filename'].append(filename)
            data['text'].append(text)
    
    # Create DataFrame
    df = pd.DataFrame(data)
    return df

In [3]:
# Usage
directory_path = "all_syllabi/"
df = extract_text_from_pdfs(directory_path)

In [4]:
import re
df['cleaned_text'] = df['text'].apply(lambda x: re.sub(r'\s+', ' ', x))
df['course name'] = df['filename'].str.split(' - ').str[0].str.split('_').str[0]
df['course department'] = df['course name'].str.split(' ').str[0]

In [5]:
df['course department'].value_counts()[:30]

course department
ECON       190
MATH       127
HEB         75
STAT        64
OEB         60
MCB         42
APMTH       33
GOV         29
FRSEMR      24
PSY         23
COMPSCI     22
GENED       20
PHYSICS     19
E-PSCI      19
ENG-SCI     18
NEURO       18
SCRB        16
BIOSTAT     16
APCOMP      10
HISTSCI     10
SOCIOL       9
ASTRON       8
API          7
HDS          6
FYSEMR       6
PHIL         6
CHEM         6
LING         5
CELTIC       5
HIST         5
Name: count, dtype: int64

In [19]:
ndf = pd.read_csv('name_mapping/ALL.csv')
ndf = ndf.drop_duplicates('code')
print(ndf['code'].nunique())
ndf[ndf['code'].duplicated()].sort_values('code')
ndf['code2'] = ndf['code'].str.split('/').str[0]
ndf

1109


Unnamed: 0,Name,code,code2
0,MATH 168: Making Math Material,MATH 168,MATH 168
1,MATH 22A: Vector Calculus and Linear Algebra I,MATH 22A,MATH 22A
2,MATH ESPB: Emerging Scholars: Problem Solving ...,MATH ESPB,MATH ESPB
3,MATH ESPA: Emerging Scholars: Problem Solving ...,MATH ESPA,MATH ESPA
4,MATH 293X: Topological modular forms,MATH 293X,MATH 293X
...,...,...,...
1199,APCOMP 221: Critical Thinking in Data Science,APCOMP 221,APCOMP 221
1200,APCOMP 207/COMPSCI 107/COMPSCI 207: Systems De...,APCOMP 207/COMPSCI 107/COMPSCI 207,APCOMP 207
1201,APCOMP 297R: Computational Science and Enginee...,APCOMP 297R,APCOMP 297R
1202,APCOMP 298R: Interdisciplinary Seminar in Appl...,APCOMP 298R,APCOMP 298R


In [20]:
df = df.merge(ndf[['code2', 'Name']], left_on = 'course name', right_on = 'code2')
df = df[['filename',  'course department', 'course name','Name', 'text', 'cleaned_text']]
df

Unnamed: 0,filename,course department,course name,Name,text,cleaned_text
0,MATH 152 - 2024 Fall (116191).pdf,MATH,MATH 152,MATH 152: Discrete Mathematics,\t\nCourse\tgoals:\nBasic\tknowledge\tof\tmany...,Course goals: Basic knowledge of many areas o...
1,API 302_ECON 1415 - 2024 Fall (170053) (1).pdf,API,API 302,API 302/ECON 1415: Analytic Frameworks for Policy,\n8/27/24 1 API-302 \nFall 20 24 \...,8/27/24 1 API-302 Fall 20 24 A-1 API-302 ANAL...
2,MATH 256X - 2016 Fall (203030) (1).pdf,MATH,MATH 256X,MATH 256X: Heisenberg Calculus in Quantum Topo...,HEISENBERG CALCULUS IN QUANTUM TOPOLOGY\nMATH2...,HEISENBERG CALCULUS IN QUANTUM TOPOLOGY MATH25...
3,ECON 980BB - 2022 Spring (156369).pdf,ECON,ECON 980BB,ECON 980BB: Behavioral Economics,"Junior Seminar, Econ 980bb\nBehavioral Economi...","Junior Seminar, Econ 980bb Behavioral Economic..."
4,MATH 18B_19B - 2023 Spring (213535) (1).pdf,MATH,MATH 18B,"MATH 18B/19B: Linear Algebra, Probability, and...","Math\t18B/19B:\tLinear\tAlgebra,\tProbability,...","Math 18B/19B: Linear Algebra, Probability, and..."
...,...,...,...,...,...,...
1021,ECON 2329 - 2024 Spring (220377).pdf,ECON,ECON 2329,ECON 2329: Topics in Political Economy,Political Economy of Electoral Democracies\nPr...,Political Economy of Electoral Democracies Pro...
1022,SYSBIO 220QC - 2020 Spring 1 (204480).pdf,SYSBIO,SYSBIO 220QC,SYSBIO 220QC: Analysis foundations for quantit...,SB220/SB221 Syllabus \n \nSB220 \n12 classes...,SB220/SB221 Syllabus SB220 12 classes Week Dat...
1023,MATH 269Z - 2022 Spring (218254).pdf,MATH,MATH 269Z,MATH 269Z: Topics in Hodge Theory,Instructor:\n\tSalim\tTayou\nE-mail:\n\t\ntayo...,Instructor: Salim Tayou E-mail: tayou@math.har...
1024,MCB 188 - 2024 Spring (114864).pdf,MCB,MCB 188,MCB 188: Chromosomes,"MCB188 Chromosomes\nSpring, 2024\nClass will...","MCB188 Chromosomes Spring, 2024 Class will mee..."


In [21]:
df1 = df[df['course department'].isin(['MATH','STAT','APCOMP','APMTH','BIOSTAT'])]
df1

Unnamed: 0,filename,course department,course name,Name,text,cleaned_text
0,MATH 152 - 2024 Fall (116191).pdf,MATH,MATH 152,MATH 152: Discrete Mathematics,\t\nCourse\tgoals:\nBasic\tknowledge\tof\tmany...,Course goals: Basic knowledge of many areas o...
2,MATH 256X - 2016 Fall (203030) (1).pdf,MATH,MATH 256X,MATH 256X: Heisenberg Calculus in Quantum Topo...,HEISENBERG CALCULUS IN QUANTUM TOPOLOGY\nMATH2...,HEISENBERG CALCULUS IN QUANTUM TOPOLOGY MATH25...
4,MATH 18B_19B - 2023 Spring (213535) (1).pdf,MATH,MATH 18B,"MATH 18B/19B: Linear Algebra, Probability, and...","Math\t18B/19B:\tLinear\tAlgebra,\tProbability,...","Math 18B/19B: Linear Algebra, Probability, and..."
8,MATH MB - 2024 Spring (113464).pdf,MATH,MATH MB,MATH MB: Introduction to Functions and Calculu...,SyllabusSpring 2024Welcome (back) to Math Mb! ...,SyllabusSpring 2024Welcome (back) to Math Mb! ...
9,MATH 283Z - 2023 Spring (221658).pdf,MATH,MATH 283Z,MATH 283Z: Foundations of non-abelian Chabauty,Math 283Z Foundations of non-abelian Chabauty ...,Math 283Z Foundations of non-abelian Chabauty ...
...,...,...,...,...,...,...
1011,STAT 221 - 2020 Fall (115077).pdf,STAT,STAT 221,STAT 221: Computational Tools for Statistical ...,Fall 2020\nSTAT 221: Computational Tools for S...,Fall 2020 STAT 221: Computational Tools for St...
1015,MATH 156 - 2021 Fall (156000).pdf,MATH,MATH 156,MATH 156: Mathematical Foundations of Statisti...,Office\tHours:\nChris\tSunday\t7-9\tpm\tEliot\...,Office Hours: Chris Sunday 7-9 pm Eliot Dhall ...
1016,MATH 141A - 2024 Fall (207500).pdf,MATH,MATH 141A,MATH 141A: Mathematical Logic I,Course\tassistant:\n\tAva\tZinman\t\nPreston\t...,Course assistant: Ava Zinman Preston Bushnell ...
1018,APMTH 104 - 2016 Fall (122094).pdf,APMTH,APMTH 104,APMTH 104: Complex and Fourier Analysis with A...,APMTH 104: Series Expansions and Complex Analy...,APMTH 104: Series Expansions and Complex Analy...


In [22]:
df2 = df[~df['course department'].isin(['MATH','STAT','APCOMP','APMTH','BIOSTAT'])&(df['Name'].str.lower().str.contains('stat')|df['Name'].str.lower().str.contains('math')|df['Name'].str.lower().str.contains('quant') )]
df2

Unnamed: 0,filename,course department,course name,Name,text,cleaned_text
32,FRSEMR 40P_FYSEMR 40P - 2023 Fall (109469).pdf,FRSEMR,FRSEMR 40P,FRSEMR 40P/FYSEMR 40P: Appraising and Reimagin...,`\n\tCourse\t\nsyllabus\n.\tThere\tis\tsome\tf...,` Course syllabus . There is some flexibility ...
33,FRSEMR 40P_FYSEMR 40P - 2023 Fall (109469).pdf,FRSEMR,FRSEMR 40P,FRSEMR 40P: Making the Grade? Middle and High ...,`\n\tCourse\t\nsyllabus\n.\tThere\tis\tsome\tf...,` Course syllabus . There is some flexibility ...
45,PHYSCI 12A - 2024 Spring (109274) (2).pdf,PHYSCI,PHYSCI 12A,PHYSCI 12A: Mechanics and Statistical Physics ...,Spring 2024\nPhysical Sciences 12a\nInstructor...,Spring 2024 Physical Sciences 12a Instructors:...
52,SOCIOL 1156 - 2024 Fall (145331).pdf,SOCIOL,SOCIOL 1156,SOCIOL 1156: Statistics for Social Sciences,SOC\t1156\tSyllabus\tFall\t2024\t8-27-2024.pdf...,SOC 1156 Syllabus Fall 2024 8-27-2024.pdf SOCI...
95,COMPSCI 20 - 2024 Fall (128073) (1).pdf,COMPSCI,COMPSCI 20,COMPSCI 20: Discrete Mathematics for Computer ...,Harvard\tCollege/Graduate\tSchool\tof\tArts\ta...,Harvard College/Graduate School of Arts and Sc...
100,OEB 137 - 2023 Fall (216474).pdf,OEB,OEB 137,OEB 137: Experimental Design and Statistics fo...,OEB 137: Experimental Design & Statistics for ...,OEB 137: Experimental Design & Statistics for ...
136,GOV 2001 - 2024 Spring (124780) (1).pdf,GOV,GOV 2001,GOV 2001: Quantitative Social Science Methods I,\t\nPlease\tsee\tthe\tsyllabus\t\nhere\n.\nImp...,Please see the syllabus here . Important note...
156,GOV 2001 - 2019 Spring (124780).pdf,GOV,GOV 2001,GOV 2001: Quantitative Social Science Methods I,"Syllabus: Quantitative Social Science Methods,...","Syllabus: Quantitative Social Science Methods,..."
174,ECON 1126 - 2023 Fall (113637) (1).pdf,ECON,ECON 1126,ECON 1126: Quantitative Methods in Economics,Final\tResults\t(100\tPoints):\nMean:\t49\nMed...,Final Results (100 Points): Mean: 49 Median: 4...
195,CHEM 161 - 2024 Spring (113217).pdf,CHEM,CHEM 161,CHEM 161: Statistical Thermodynamics,Chemistry 161: Statistical Thermodynamics Ins...,Chemistry 161: Statistical Thermodynamics Inst...


In [23]:
df3 = pd.concat([df1, df2])
df3

Unnamed: 0,filename,course department,course name,Name,text,cleaned_text
0,MATH 152 - 2024 Fall (116191).pdf,MATH,MATH 152,MATH 152: Discrete Mathematics,\t\nCourse\tgoals:\nBasic\tknowledge\tof\tmany...,Course goals: Basic knowledge of many areas o...
2,MATH 256X - 2016 Fall (203030) (1).pdf,MATH,MATH 256X,MATH 256X: Heisenberg Calculus in Quantum Topo...,HEISENBERG CALCULUS IN QUANTUM TOPOLOGY\nMATH2...,HEISENBERG CALCULUS IN QUANTUM TOPOLOGY MATH25...
4,MATH 18B_19B - 2023 Spring (213535) (1).pdf,MATH,MATH 18B,"MATH 18B/19B: Linear Algebra, Probability, and...","Math\t18B/19B:\tLinear\tAlgebra,\tProbability,...","Math 18B/19B: Linear Algebra, Probability, and..."
8,MATH MB - 2024 Spring (113464).pdf,MATH,MATH MB,MATH MB: Introduction to Functions and Calculu...,SyllabusSpring 2024Welcome (back) to Math Mb! ...,SyllabusSpring 2024Welcome (back) to Math Mb! ...
9,MATH 283Z - 2023 Spring (221658).pdf,MATH,MATH 283Z,MATH 283Z: Foundations of non-abelian Chabauty,Math 283Z Foundations of non-abelian Chabauty ...,Math 283Z Foundations of non-abelian Chabauty ...
...,...,...,...,...,...,...
967,SYSBIO 220 - 2022 Spring (204480).pdf,SYSBIO,SYSBIO 220,SYSBIO 220/SYSBIO 320/SYSBIO 320QC: Quantitati...,SB220 Syllabus for 2021/2022 Spring Term\nSumm...,SB220 Syllabus for 2021/2022 Spring Term Summa...
970,FRSEMR 23P - 2022 Fall (109319).pdf,FRSEMR,FRSEMR 23P,"FRSEMR 23P: Physics, Math and Puzzles","\t\nPHYSICS,\tMATH\tAND\tPUZZLES\nCumrun\tVafa...","PHYSICS, MATH AND PUZZLES Cumrun Vafa (Depart..."
982,PHYSICS 15B - 2021 Fall (111896).pdf,PHYSICS,PHYSICS 15B,PHYSICS 15B: Introductory Electromagnetism and...,Course\tdescription:\nThis\tcourse\tis\tan\t\n...,Course description: This course is an undergra...
1002,ECON 2340 - 2020 Fall (215901).pdf,ECON,ECON 2340,ECON 2340: Transportation and Development: Evi...,Economics 2340: \n​\nTransportation and Develo...,Economics 2340: ​ Transportation and Developme...


In [24]:
df3.to_csv('processed_data/computation_courses.csv', index = False)