In [None]:
import queue
import util
import json
import sys
import csv
import re
import bs4
from collections import deque

In [None]:
def get_redirected_url(url):
    """
    Resolve `url` to the address the server finally answers from.

    The URL is fetched with `util.get_request`; the address reported by
    `util.get_request_url` for that response can differ from `url` when the
    original address redirects elsewhere.

    Returns None when the response object is falsy or when its status code
    is anything other than 200.
    """
    response = util.get_request(url)
    if response and response.status_code == 200:
        return util.get_request_url(response)
    return None

In [None]:
def request_and_parse_page(url):
    """
    Fetch `url` and parse the response body into a BeautifulSoup tree.

    Returns:
        A `(soup, final_url)` pair, where `final_url` is the address reported
        by `util.get_request_url` for the response (it can differ from `url`
        after a redirect). `(None, None)` is returned when the page cannot be
        used: the response object is falsy, the status code is not 200, the
        body is empty, or the page contains no `a` tags. Callers can
        therefore always unpack the result into two names.
    """
    failure = (None, None)

    request_object = util.get_request(url)
    # The two checks must be joined with `or`: with `and`, a missing response
    # would still reach `.status_code` and raise AttributeError.
    if not request_object or request_object.status_code != 200:
        return failure

    request_text = util.read_request(request_object)
    if not request_text:
        return failure

    soup = bs4.BeautifulSoup(request_text, 'html5lib')
    # Pages without a single `a` tag are treated as unusable.
    if not soup.find_all('a'):
        return failure
    return soup, util.get_request_url(request_object)

def get_complete_url(parent_url, suburl):
    """
    Turn a link found on `parent_url` into a complete URL.

    Absolute links (per `util.is_absolute_url`) are returned unchanged.
    Anything else has its `#fragment` removed with `util.remove_fragment` and
    is then resolved against `parent_url` with
    `util.convert_if_relative_url`.

    Returns None when `suburl` is None (an `a` tag without an `href`), so the
    util helpers are never handed a non-string; otherwise returns whatever
    the helpers produce.
    """
    if suburl is None:
        return None
    if util.is_absolute_url(suburl):
        return suburl
    without_fragment = util.remove_fragment(suburl)
    return util.convert_if_relative_url(parent_url, without_fragment)

In [None]:
def check_url_ok_to_follow(redirected_url, q, limiting_domain):
    '''
    Append `redirected_url` to the queue `q` if it may be crawled.

    The URL is added only when `util.is_url_ok_to_follow` accepts it for
    `limiting_domain` and it is not already present in `q`. `q` is modified
    in place and nothing is returned; a progress line is printed either way.
    '''
    followable = util.is_url_ok_to_follow(redirected_url, limiting_domain)
    if not followable or redirected_url in q:
        print('redirected-sublink is NOT ok to follow:', redirected_url)
        return

    q.append(redirected_url)
    print('queue size:', len(q))

In [None]:
def go(num_pages_to_crawl, course_map_filename, index_filename):
    '''
    Crawl the college catalog breadth-first and collect the URLs discovered.

    This prototype only builds the list of pages; it does not read the
    course map or write the index.

    Inputs:
        num_pages_to_crawl: the crawl stops as soon as this many URLs have
          been collected or this many candidate links have been requested
        course_map_filename: the name of a JSON file that contains the mapping
          course codes to course identifiers (not used here)
        index_filename: the name for the CSV of the index (not used here)

    Outputs:
        The list of collected URLs in discovery order, starting with the
        catalog index page.
    '''

    starting_url = ("http://www.classes.cs.uchicago.edu/archive/2015/winter"
                    "/12200-1/new.collegecatalog.uchicago.edu/index.html")
    limiting_domain = "classes.cs.uchicago.edu"

    q = [starting_url]
    visit_count = 0
    limit_reached = False
    position = 0
    # `q` doubles as the work list: it grows while it is being walked, so the
    # walk is index-based and ends as soon as the limit is hit. (A plain
    # `break` in the inner loop would only end the current page.)
    while position < len(q) and not limit_reached:
        link = q[position]
        print('-----------------------------------------------------')
        print('now scraping link:', link)
        print('It is the {}th link in the queue'.format(position))
        print('queue size:', len(q))
        print('queue:', q)
        position += 1

        # A failed fetch may be reported as None or as a pair with no soup.
        parsed = request_and_parse_page(link)
        soup, real_link = parsed if parsed else (None, None)
        print('real_link:', real_link)
        if not soup:
            continue

        for a_tag in soup.find_all('a'):
            sublink = a_tag.get('href')
            # Anchors without an href have nothing to follow.
            if sublink is None:
                continue
            # Resolve relative links against the address the page was
            # actually served from, which may differ after a redirect.
            complete_sublink = get_complete_url(real_link or link, sublink)
            if complete_sublink is None:
                continue
            if (not util.is_url_ok_to_follow(complete_sublink, limiting_domain)
                    or complete_sublink in q):
                continue

            redirected_url = get_redirected_url(complete_sublink)
            visit_count += 1
            if redirected_url is not None:
                # Treat the http and https forms of a URL as the same page;
                # only the scheme is rewritten, not "https" elsewhere.
                http_url = re.sub(r'^https://', 'http://', redirected_url)
                if redirected_url not in q and http_url not in q:
                    q.append(redirected_url)
            if len(q) >= num_pages_to_crawl or visit_count >= num_pages_to_crawl:
                limit_reached = True
                break

    return q

# Run the prototype crawl and keep the list of URLs it returns in `q`.
q = go(num_pages_to_crawl=100,
   course_map_filename='course_map.json',
   index_filename='catalog_index.csv')

In [None]:
def find_links(soup, parent_url):
    '''
    Collect the complete URL of every link on a parsed page.

    Inputs:
        soup: parsed page; every `a` tag in it is examined
        parent_url: address of the page, passed to `get_complete_url` to
          resolve relative links

    Returns a list of complete URLs in document order. `a` tags without an
    `href` attribute, and links for which `get_complete_url` returns None,
    are left out. No domain filtering or de-duplication happens here.
    '''
    complete_links = []
    for a_tag in soup.find_all('a'):
        sublink = a_tag.get('href')
        # Anchors such as <a name="top"> carry no href: nothing to follow.
        if sublink is None:
            continue
        complete_sublink = get_complete_url(parent_url, sublink)
        if complete_sublink is None:
            continue
        complete_links.append(complete_sublink)
    return complete_links

In [None]:
def filter_link(links, limiting_domain, visited_queue, to_be_crawled_queue):
    """
    Keep only the links that are new and allowed to be crawled.

    Inputs:
        links: candidate complete URLs, in the order they were found
        limiting_domain: domain passed to `util.is_url_ok_to_follow`
        visited_queue: URLs that have already been requested
        to_be_crawled_queue: URLs already waiting to be requested

    Returns the list of links (original spelling, original order) that pass
    `util.is_url_ok_to_follow` and are not yet known. The http and https
    forms of a URL count as the same page, and a link that appears several
    times in `links` is returned only once. Neither queue is modified.
    """
    def scheme_key(url):
        # Only the leading scheme is rewritten; "https" elsewhere in the URL
        # is left alone.
        return re.sub(r"^https://", "http://", url)

    known = {scheme_key(url) for url in visited_queue}
    known.update(scheme_key(url) for url in to_be_crawled_queue)

    filtered_links = []
    for external_link in links:
        if not util.is_url_ok_to_follow(external_link, limiting_domain):
            continue
        key = scheme_key(external_link)
        if key in known:
            continue
        # Remember it so a repeat later in the same page is dropped too.
        known.add(key)
        filtered_links.append(external_link)
    return filtered_links



In [242]:
from collections import defaultdict, deque

def go(num_pages_to_crawl, course_map_filename, index_filename):
    '''
    Crawl the college catalog and build a course-id -> words index.

    Relies on `request_and_parse_page`, `find_links`, `filter_link`,
    `scrape_course_content` and `add_page_content_to_final_dict`, which must
    already be defined when this function is called.

    Inputs:
        num_pages_to_crawl: the number of pages to process during the crawl
        course_map_filename: the name of a JSON file that contains the mapping
          course codes to course identifiers (not used here)
        index_filename: the name for the CSV of the index (not used here; no
          file is written by this function)

    Outputs:
        (result, visited_queue): `result` maps each course identifier to the
        set of words collected for it; `visited_queue` is a deque of the URLs
        that were requested, in crawl order.
    '''

    starting_url = ("http://www.classes.cs.uchicago.edu/archive/2015/winter"
                    "/12200-1/new.collegecatalog.uchicago.edu/index.html")
    limiting_domain = "classes.cs.uchicago.edu"

    result = defaultdict(set)
    to_be_crawled_queue = deque([starting_url])
    visited_queue = deque()

    while to_be_crawled_queue and len(visited_queue) < num_pages_to_crawl:
        link = to_be_crawled_queue.popleft()
        print('now scraping link:', link)
        parsed = request_and_parse_page(link)
        # Record the attempt even when it failed so the URL is not queued again.
        visited_queue.append(link)

        # A failed fetch may be reported as None or as a pair with no soup.
        soup, real_link = parsed if parsed else (None, None)
        if not soup:
            continue

        external_complete_links = find_links(soup, real_link)
        filtered_links = filter_link(external_complete_links, limiting_domain,
                                     visited_queue, to_be_crawled_queue)
        to_be_crawled_queue.extend(filtered_links)

        # Scrape course info.
        page_course_info_dict = scrape_course_content(soup)
        if not page_course_info_dict:
            continue
        result = add_page_content_to_final_dict(page_course_info_dict, result)

    print('visited_queue:', visited_queue)
    print('to_be_crawled_queue size:', len(to_be_crawled_queue))
    return result, visited_queue

# Run the full crawl: `result` is the course-id -> words index returned by
# `go`, and `visited_queue` holds the URLs that were requested.
result, visited_queue = go(num_pages_to_crawl=1000,
   course_map_filename='course_map.json',
   index_filename='catalog_index.csv')

now scraping link: http://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/index.html
now scraping link: https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/azindex/index.html
now scraping link: https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/introduction/index.html
now scraping link: https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/programsofstudy.1.html
now scraping link: https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/thecurriculum.1.html
now scraping link: https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/academicregulationsprocedures/index.html
now scraping link: https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/ex

In [243]:
len(visited_queue)

98

In [244]:
# Write one "course_id,word" line per (course, word) pair of the crawl index.
# The index returned by `go` is named `result`; no `final_result` is defined
# anywhere in this notebook.
with open('course_id_words.csv', 'w') as f:
    for course_id, words in result.items():
        for word in words:
            f.write('{},{}\n'.format(course_id, word))

In [245]:
# Load the written index back into pandas, ordered by word and then by
# course id, and show the first rows.
import pandas as pd
result_df = (
    pd.read_csv('course_id_words.csv', header=None, names=['course_id', 'word'])
    .sort_values(by=['word', 'course_id'])
)
result_df.head(20)

Unnamed: 0,course_id,word
85971,1866,aanl
85992,1867,aanl
86013,1868,aanl
86039,1869,aanl
86047,1870,aanl
90240,1985,abandon
99265,2194,abandon
22396,484,abandoned
105238,2336,abandoned
55371,1175,abandonment


In [None]:
len(visited_queue)

In [None]:
set(visited_queue) - (set(all_links))

In [None]:
all_links = '''http://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/azindex/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/introduction/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/programsofstudy.1.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/thecurriculum.1.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/academicregulationsprocedures/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/examinationcreditandtransfercredit.1.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/interdisciplinaryopportunities/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/jointdegreeprograms/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/offcampusstudyprograms.1.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/preparationforprofessionalstudy/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/researchopportunities/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/contacts/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/academicadvising/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/academicintegrity/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/anthropology/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/arthistory/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/astronomyastrophysics/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/bigproblems/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/biologicalchemistry/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/biologicalsciences.1.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/chemistry.1.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/chicagostudies/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/cinemamediastudies/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/civilizationstudies/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/classicalstudies/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/comparativehumandevelopment/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/comparativeliterature/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/comparativeraceethnicstudies/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/caam/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/computationalneuroscience/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/computerscience/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/creativewriting/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/degreeprogramworksheet/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/earningadegree/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/eastasianlanguagescivilizations/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/economics/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/education/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/englishlanguageliterature.1.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/environmentalscience.1.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/environmentalstudies/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/fundamentalsissuesandtexts/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/genderstudies/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/geographicalstudies/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/geophysicalsciences.1.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/germanicstudies/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/gradingandacademicstatus.1.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/history/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/scienceandmedicinehips/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/humanities/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/wiztest/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/humanrights/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/interdisciplinarystudieshumanities.1.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/internationalrelations.1.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/internationalstudies/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/jewishstudies/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/jointdegreessa.1.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/jointdegreehumanities.1.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/jointdegreeurbanteaching.1.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/jointdegreecomsci.1.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/jointdegreemath.1.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/jointdegreestat.1.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/jointdegreechem.1.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/latinamericanstudies/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/lawlettersandsociety/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/liberaleducationatchicago/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/linguistics.1.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/mathematics/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/medievalstudies/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/molecularengineering/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/music/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/naturalsciences/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/neareasternlanguagescivilizations/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/newcollegiatedivision/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/philosophy/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/physicalsciences/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/physics/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/politicalscience/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/professionaloptionmedicine.1.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/professionaloptionpublicpolicystudies.1.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/psychology/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/publicpolicystudies/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/registration/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/religiousstudies/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/romancelanguagesliteratures/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/russianstudies/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/search/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/slaviclanguagesliteratures/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/socialsciences/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/sociology/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/southasianlanguagescivilizations/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/statistics/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/takingcourses/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/theaterperformancestudies/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/tutorialstudies/index.html
https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/visualarts/index.html'''.split()

In [None]:
len(all_links)

# Fetch website content

In [None]:
import unicodedata
import re

# Words that are left out of the index.
INDEX_IGNORE = {
    'a', 'also', 'an', 'and', 'are', 'as', 'at', 'be',
    'but', 'by', 'course', 'for', 'from', 'how', 'i',
    'ii', 'iii', 'in', 'include', 'is', 'not', 'of',
    'on', 'or', 's', 'sequence', 'so', 'social', 'students',
    'such', 'that', 'the', 'their', 'this', 'through', 'to',
    'topics', 'units', 'we', 'were', 'which', 'will', 'with',
    'yet',
}

In [None]:
url = 'https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/visualarts/index.html'
soup, real_url = request_and_parse_page(url)

In [None]:
# Load `course_map` from JSON; the indexing code below looks up course codes
# in it to get course ids.
json_file_path = 'course_map.json'
with open(json_file_path, 'r') as f:
    course_map = json.load(f)

In [None]:
# Every course sits in a <div class="courseblock ..."> ("main" or
# "subsequence"). Its title and description are the
# <p class="courseblocktitle"> and <p class="courseblockdesc"> children.
div_tags = soup.find_all('div', class_='courseblock')

# A word is a string of length at least one that starts with a letter and
# contains only letters, digits, and/or an underscore (_).
word_pattern = re.compile(r'^[a-zA-Z][a-zA-Z0-9_]*$')

final_course_id_words_dict = {}
for div_tag in div_tags:
    title_block_text = unicodedata.normalize(
        "NFKD", div_tag.find('p', class_='courseblocktitle').text)
    desc_block_text = div_tag.find('p', class_='courseblockdesc').text

    # The course code is everything before the first period, e.g. `ARTV 10100`.
    course_code = title_block_text.split('.')[0]
    print('course_code:', course_code)
    course_code_list = split_course_code(course_code)
    course_id_list = [course_map[code] for code in course_code_list]

    # Title and description are indexed together, in lowercase.
    lookup_str = (title_block_text + ' ' + desc_block_text).lower()
    all_words = lookup_str.split()
    matched_words = [token for token in all_words if word_pattern.match(token)]
    filtered_matched_words = [
        token for token in matched_words if token not in INDEX_IGNORE
    ]

    # Every course id named in the title receives all of the block's words;
    # ids seen before have the new words merged into their existing set.
    for course_id in course_id_list:
        final_course_id_words_dict.setdefault(course_id, set()).update(
            filtered_matched_words)

print(final_course_id_words_dict)

In [None]:
def split_course_code(course_code):
    '''
    Expand a hyphenated course code into one code per listed number.

    For example, `ARTV 22000-22002` becomes `['ARTV 22000', 'ARTV 22002']`.
    A code without a hyphen is returned unchanged as a one-element list.
    '''
    if '-' not in course_code:
        return [course_code]

    pieces = course_code.split()
    department, numbers = pieces[0], pieces[1]
    return [department + ' ' + number for number in numbers.split('-')]
# Quick check of the hyphenated case.
course_code_list = split_course_code('ARTV 22000-22002')
print('course_code_list:', course_code_list)

In [None]:

urls = ['https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/visualarts/index.html']


def block_text_to_words(title_block_text, desc_block_text):
    '''
    Turn a course block's title and description into its list of index words.

    The two texts are joined and lowercased, then split on whitespace. Each
    token has trailing `.`, `,`, `;` and `:` stripped and is kept only if it
    starts with a letter, contains nothing but letters, digits and
    underscores, and is not in `INDEX_IGNORE`.

    Returns the kept words in text order; repeats are not removed.
    '''
    word_pattern = re.compile(r'^[a-zA-Z][a-zA-Z0-9_]*$')

    kept_words = []
    for token in (title_block_text + ' ' + desc_block_text).lower().split():
        candidate = token.rstrip('.,;:')
        if word_pattern.match(candidate) and candidate not in INDEX_IGNORE:
            kept_words.append(candidate)
    return kept_words

def extract_title_and_desc(div_tag):
    '''
    Pull the title and description text out of one course block.

    Inputs:
        div_tag: a course block tag; its `<p class="courseblocktitle">` and
          `<p class="courseblockdesc">` children are read

    Returns `(title_text, desc_text)`. The title is NFKD-normalized (which,
    for example, turns non-breaking spaces into ordinary spaces). The
    description is '' when the block has no description paragraph, so such
    a block is still indexed by its title. A block without a title paragraph
    still raises AttributeError.
    '''
    title_tag = div_tag.find('p', class_='courseblocktitle')
    title_block_text = unicodedata.normalize("NFKD", title_tag.text)

    # `find` returns None when no matching tag exists.
    desc_tag = div_tag.find('p', class_='courseblockdesc')
    desc_block_text = desc_tag.text if desc_tag is not None else ''
    return title_block_text, desc_block_text

def extract_course_code_and_id(title_block_text):
    '''
    Derive the course codes named in a title and look up their identifiers.

    The text before the first period of `title_block_text` is taken as the
    course code and expanded with `split_course_code`; each resulting code
    is then looked up in the module-level `course_map`.

    Returns `(course_code_list, course_id_list)`, two lists in matching
    order.
    '''
    leading_text = title_block_text.partition('.')[0]
    codes = split_course_code(leading_text)
    identifiers = [course_map[code] for code in codes]
    return codes, identifiers
    
def scrape_course_content(soup):
    '''
    Build the word index for every course block on one parsed page.

    Each `<div class="courseblock">` contributes the filtered words of its
    title and description to every course id named in its title.

    Returns a dict mapping course id -> set of words; it is empty when the
    page has no course blocks.
    '''
    page_course_id_words_dict = {}
    for div_tag in soup.find_all('div', class_='courseblock'):
        title_block_text, desc_block_text = extract_title_and_desc(div_tag)
        _, course_id_list = extract_course_code_and_id(title_block_text)
        filtered_matched_words = block_text_to_words(title_block_text, desc_block_text)

        # A course id seen in an earlier block keeps its words and gains these.
        for course_id in course_id_list:
            page_course_id_words_dict.setdefault(course_id, set()).update(
                filtered_matched_words)
    return page_course_id_words_dict

soup, real_url = request_and_parse_page(urls[0])
page_course_id_words_dict = scrape_course_content(soup)

In [None]:
# `scrape_course_content` takes a parsed page, not a URL, so fetch the page
# first. A failed fetch (None, or a pair with no soup) yields an empty index.
starting_page = request_and_parse_page('http://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/index.html')
starting_soup = starting_page[0] if starting_page else None
starting_url_course_info = scrape_course_content(starting_soup) if starting_soup else {}

In [None]:

# Add content from a single page to the final course id words dict.
from collections import defaultdict
result = defaultdict(set)
def add_page_content_to_final_dict(page_course_id_words_dict, result):
    '''
    Merge one page's course index into the running index.

    Inputs:
        page_course_id_words_dict: mapping course id -> words for one page;
          an empty or None value leaves `result` untouched
        result: mapping course id -> set of words collected so far

    `result` is updated in place and also returned. Words for a new course
    id are stored in a fresh set, so `result` never shares a set object with
    the page dict.
    '''
    for course_id, words in (page_course_id_words_dict or {}).items():
        result.setdefault(course_id, set()).update(words)
    return result

result = add_page_content_to_final_dict(starting_url_course_info, result)

In [None]:
result

In [None]:
# `scrape_course_content` takes a parsed page, not a URL, so fetch the page
# first. A failed fetch (None, or a pair with no soup) yields an empty index.
anthropology_page = request_and_parse_page('https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/anthropology/index.html')
anthropology_soup = anthropology_page[0] if anthropology_page else None
anthropology_course_info = scrape_course_content(anthropology_soup) if anthropology_soup else {}

In [None]:
result = add_page_content_to_final_dict(anthropology_course_info, result)

In [None]:
result

In [None]:
result == final_course_id_words_dict

In [232]:

urls = ["https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/theaterperformancestudies/index.html",
"https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/tutorialstudies/index.html",
"https://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/thecollege/visualarts/index.html"]
def scrape_many_urls(urls):
    '''
    Scrape several catalog pages and merge their course indexes.

    Inputs:
        urls: iterable of page URLs

    Returns a `defaultdict(set)` mapping course id -> set of words gathered
    from all pages. Pages that cannot be fetched or parsed are skipped.
    '''
    result = defaultdict(set)
    for url in urls:
        # A failed fetch may be reported as None or as a pair with no soup.
        parsed = request_and_parse_page(url)
        soup = parsed[0] if parsed else None
        if not soup:
            print('skipping page that could not be parsed:', url)
            continue

        page_course_id_words_dict = scrape_course_content(soup)
        print('page_course_id_words_dict', page_course_id_words_dict)
        for course_id, words in page_course_id_words_dict.items():
            print('course_id', course_id)
            # defaultdict(set) supplies the empty set on first access.
            result[course_id].update(words)

    return result

In [233]:
urls = ['http://www.classes.cs.uchicago.edu/archive/2015/winter/12200-1/new.collegecatalog.uchicago.edu/index.html']

result = scrape_many_urls(urls)
result

page_course_id_words_dict {}


defaultdict(set, {})

In [222]:
result[484]

{'abandoned',
 'affect',
 'all',
 'anxieties',
 'anxiety',
 'both',
 'centuries',
 'cinema',
 'class',
 'cmst',
 'confirm',
 'conform',
 'construct',
 'contains',
 'contemporary',
 'controversial',
 'countess',
 'critical',
 'culture',
 'daughters',
 'day',
 'dead',
 'defining',
 'directed',
 'discussed',
 'disturbing',
 'east',
 'eastern',
 'eli',
 'enforce',
 'europe',
 'european',
 'family',
 'fascination',
 'film',
 'films',
 'franchise',
 'genre',
 'has',
 'haunt',
 'health',
 'here',
 'highly',
 'horror',
 'hostel',
 'ideation',
 'interrogations',
 'it',
 'its',
 'julie',
 'landscape',
 'look',
 'man',
 'menaced',
 'mind',
 'nacho',
 'necessary',
 'need',
 'negotiating',
 'night',
 'one',
 'only',
 'ought',
 'own',
 'partnership',
 'pavel',
 'phantasies',
 'philosophies',
 'political',
 'popular',
 'produced',
 'range',
 'readings',
 'remains',
 'revenant',
 'rich',
 'scenes',
 'serbian',
 'sex',
 'source',
 'specific',
 'story',
 'symbol',
 'taxidermia',
 'term',
 'times',
 'tim

In [223]:
# Now write the result to a csv file, one comma-separated pair per line:
# course_id1,word1
# course_id1,word2
# course_id1,word3
with open('course_id_words.csv', 'w') as f:
    for course_id, words in result.items():
        for word in words:
            f.write('{},{}\n'.format(course_id, word))

In [224]:
# Read the csv file into a pandas dataframe.
import pandas as pd
result_df = pd.read_csv('course_id_words.csv', header=None, names=['course_id', 'word'])

In [225]:
# first sort by word and then by course_id.
result_df = result_df.sort_values(by=['word', 'course_id'])

In [226]:
result_df.head(20)

Unnamed: 0,course_id,word
85971,1866,aanl
85992,1867,aanl
86013,1868,aanl
86039,1869,aanl
86047,1870,aanl
90240,1985,abandon
99265,2194,abandon
22396,484,abandoned
105238,2336,abandoned
55371,1175,abandonment
