In [66]:
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import json
import re
from io import StringIO
import pickle

### Metadata

In [4]:
## Scrape the metadata block of every course page (code generated with GPT-4).
## Result: md_final = {page url: {'title': <h2 text>, <dt text>: <dd text>, ...}}

md_final = {}

with open('course_pages.jsonl', 'r') as pages:
    for raw in tqdm(pages):
        page = json.loads(raw)
        # Every page gets an entry, even when no metadata is found on it.
        fields = md_final[page['url']] = {}
        markup = BeautifulSoup(page['text'], 'html.parser')

        heading = markup.select_one("div.metadata > h2")
        if heading:
            fields['title'] = heading.get_text(strip=True)

        # Each <dt> label is paired with the <dd> sibling that follows it;
        # labels without a following <dd> are skipped.
        for label in markup.select("div.metadata dt"):
            definition = label.find_next_sibling("dd")
            if not definition:
                continue
            fields[label.get_text(strip=True)] = definition.get_text(strip=True)

17it [00:00, 61.10it/s]


In [7]:
## Derive each course's term ("<Season> <YYYY>") from its metadata (code generated with GPT-4).
## The 'title' field is searched first, then 'Project Title'; a course with no
## match in either field is recorded as None.

term_pattern = re.compile(r'\b(Winter|Spring|Summer|Autumn|Fall)\s+\d{4}\b')

season_year_dict = {}

for url, fields in md_final.items():
    found = None
    for field in ('title', 'Project Title'):
        if field in fields:
            found = term_pattern.search(fields[field])
            if found:
                break  # first field that yields a term wins

    season_year_dict[url] = found.group(0) if found else None

In [67]:
# Pickle the scraped metadata and the derived terms to disk.
for path, payload in (
    ('data/intermediate/metadata.pkl', md_final),
    ('data/intermediate/term.pkl', season_year_dict),
):
    with open(path, 'wb') as out:
        pickle.dump(payload, out)

### General Statistics

In [14]:
# Reference: https://stackoverflow.com/questions/71025130/how-to-extract-a-table-from-a-website-using-beautifulsoup
# Parse every <table> element of every course page with pandas.
# stats = {page url: [pd.read_html(...) result for each table]}; pd.read_html
# returns a list of DataFrames, so each entry of the per-page list is itself a list.
stats = {}

with open('course_pages.jsonl', 'r') as pages:
    for raw in tqdm(pages):
        page = json.loads(raw)
        markup = BeautifulSoup(page['text'], 'html.parser')

        parsed = []
        for table in markup.find_all('table'):
            # read_html wants a file-like object, hence the StringIO wrapper.
            parsed.append(pd.read_html(StringIO(str(table))))

        stats[page['url']] = parsed

17it [00:00, 19.43it/s]


In [68]:
# Pickle the parsed per-page tables to disk.
with open('data/intermediate/stats.pkl', 'wb') as out:
    pickle.dump(stats, out)

### Interest

In [None]:
## Parse the OCR'd answer charts of the two "interest" questions into
## interest = {course_url: {question: {label: count}}}.
## (json, re and tqdm are already imported in the first cell of the notebook.)

# Question marker -> (regex capturing "<label> (<count>)", canonical label spellings).
# The "Now that" pattern tolerates spaces inside the parentheses and also accepts
# full-width parentheses (U+FF08 / U+FF09) in addition to ASCII ones.
INTEREST_QUESTIONS = {
    'Now that': (
        r'(Diminished|Satisfied|Heightened|Total)\s*[(\uFF08]\s*(\d+)\s*[)\uFF09]',
        ('Diminished', 'Satisfied', 'Heightened', 'Total'),
    ),
    'Prior to': (
        r'(Very High|Very Low|Neutral|High|Low|Total)\s*\((\d+)\)',
        ('Very High', 'Very Low', 'Neutral', 'High', 'Low', 'Total'),
    ),
}


def ocr_text(ocr_res):
    """Return the full recognised text of one OCR result, or None if it has none.

    None is returned when 'responses' is missing or empty, or when the first
    response carries no 'textAnnotations' (previously the latter raised).
    """
    if 'responses' not in ocr_res or not ocr_res['responses']:
        return None
    annotations = ocr_res['responses'][0].get('textAnnotations')
    if not annotations:
        return None
    return annotations[0]['description']


def parse_counts(text, pattern, labels):
    """Return {label: count} for every "<label> (<n>)" occurrence found in text.

    Matching is case-insensitive, so the captured label is mapped back to its
    canonical spelling from `labels`; otherwise an OCR'd "TOTAL" would be stored
    under a different key than "Total".
    """
    canonical = {label.lower(): label for label in labels}
    return {
        canonical.get(label.lower(), label): int(value)
        for label, value in re.findall(pattern, text, re.IGNORECASE)
    }


def check_total(url, result):
    """Print a warning when the OCR'd 'Total' is missing or differs from the sum of the other counts."""
    parts = sum(count for tag, count in result.items() if tag != 'Total')
    if 'Total' not in result:
        # OCR missed the Total line: report it instead of raising KeyError.
        print(f"Total missing for {url}: other counts sum to {parts}")
    elif result['Total'] != parts:
        print(f"Total mismatch for {url}: {result['Total']} != {parts}")


interest = {}

with open('raw_data/ocr_results.jsonl', 'r') as f:
    for line in tqdm(f):
        record = json.loads(line)

        url = record['course_url']
        per_course = interest.setdefault(url, {})

        for entry in record['questions']:
            q = entry['question']

            text = ocr_text(entry['ocr'])
            if text is None:
                # No recognised text for this image, so there is nothing to parse.
                continue

            # A question is parsed with every marker it contains, in the order
            # above; if both markers occur, the later result overwrites the earlier.
            for marker, (pattern, labels) in INTEREST_QUESTIONS.items():
                if marker not in q:
                    continue
                result = parse_counts(text, pattern, labels)
                per_course[q] = result
                check_total(url, result)
            
        

16it [00:02,  5.88it/s]


In [69]:
# Pickle the parsed interest counts to disk.
with open('data/intermediate/interest.pkl', 'wb') as out:
    pickle.dump(interest, out)

### Professor Ratings

In [58]:
def select_prof_q(stats):
    """Pick, per course, the table that holds the overall instructor rating.

    Args:
        stats: mapping of course key -> list of entries, where the first item
            of each entry is expected to be a DataFrame (entries whose first
            item is not a DataFrame are ignored).

    Returns:
        dict mapping course key -> DataFrame. A table qualifies when its first
        column is not named 'Comments' and any string in that column starts
        (case-insensitively) with 'overall, the instructor' or
        'overall, this instructor'. If several tables of a course qualify,
        the last one wins. Courses without a qualifying table are absent.
    """
    prefixes = ('overall, the instructor', 'overall, this instructor')
    selected = {}

    for course, entries in tqdm(stats.items()):
        for entry in entries:
            table = entry[0]
            if not isinstance(table, pd.DataFrame) or table.columns[0] == 'Comments':
                continue

            first_column = table.iloc[:, 0].values
            # Non-string cells (e.g. NaN) are skipped.
            if any(isinstance(cell, str) and cell.lower().startswith(prefixes)
                   for cell in first_column):
                selected[course] = table

    return selected

In [60]:
# Stack the selected instructor-rating tables into one frame, tagging every row
# with the key of the course it came from, and export the result as CSV.
prof_q = select_prof_q(stats)

prof_frames = []
for course, table in prof_q.items():
    prof_frames.append(table.assign(source=course))

prof_q_df = pd.concat(prof_frames, ignore_index=True)
prof_q_df.to_csv('data/intermediate/prof_q.csv')

100%|██████████| 17/17 [00:00<00:00, 2901.45it/s]


### Challenge of the Course

In [61]:
def select_challenge_q(stats):
    """Pick, per course, the table that holds the intellectual-challenge question.

    Args:
        stats: mapping of course key -> list of entries, where the first item
            of each entry is expected to be a DataFrame (entries whose first
            item is not a DataFrame are ignored).

    Returns:
        dict mapping course key -> DataFrame. A table qualifies when its first
        column is not named 'Comments' and any string in that column starts
        (case-insensitively) with 'this course challenged me intellectually'.
        If several tables of a course qualify, the last one wins. Courses
        without a qualifying table are absent.
    """
    prefix = 'this course challenged me intellectually'
    selected = {}

    for course, entries in tqdm(stats.items()):
        for entry in entries:
            table = entry[0]
            if not isinstance(table, pd.DataFrame) or table.columns[0] == 'Comments':
                continue

            first_column = table.iloc[:, 0].values
            # Non-string cells (e.g. NaN) are skipped.
            if any(isinstance(cell, str) and cell.lower().startswith(prefix)
                   for cell in first_column):
                selected[course] = table

    return selected

In [62]:
# Stack the selected challenge-question tables into one frame, tagging every row
# with the key of the course it came from; the 'Unnamed: 0' column is renamed
# to 'Question' before the CSV export.
challenge_q = select_challenge_q(stats)

challenge_frames = []
for course, table in challenge_q.items():
    challenge_frames.append(table.assign(source=course))

challenge_q_df = (
    pd.concat(challenge_frames, ignore_index=True)
    .rename(columns={'Unnamed: 0': 'Question'})
)
challenge_q_df.to_csv('data/intermediate/challenge_q.csv')

100%|██████████| 17/17 [00:00<00:00, 761.38it/s]
