# Demo: Web Scraping and Data Preprocessing with the GT Catalog

### Automate collecting GT course catalog website URLs for all the schools.

In [2]:
# load required packages
import requests # grab the content from the websites
from bs4 import BeautifulSoup #https://www.crummy.com/software/BeautifulSoup/bs4/doc/
import csv
import pandas as pd

In [3]:
url = "https://catalog.gatech.edu/courses-undergrad/" # undergraduate level courses

page = requests.get(url) # fetch the page

soup = BeautifulSoup(page.text, 'html.parser') # parse the HTML content

In [4]:
# Webpages are structured using HTML, and links are typically stored in <a> (anchor) tags.

school_list = []
for item in soup.findAll('a'): # find all anchor (<a>) tags
    try:
        if '/courses-undergrad/' in item['href']: # check "/courses-undergrad/" is inside "href"
            school_list.append(item)
    except KeyError:   # skip any missing 'href' attributes
        pass

school_list

  for item in soup.findAll('a'): # find all anchor (<a>) tags


[<a href="/courses-undergrad/">Undergraduate-​Level Courses</a>,
 <a href="/courses-undergrad/">Undergraduate-​Level Courses</a>,
 <a href="/courses-undergrad/acct/">Accounting (ACCT)</a>,
 <a href="/courses-undergrad/ae/">Aerospace Engineering (AE)</a>,
 <a href="/courses-undergrad/as/">Air Force Aerospace Studies (AS)</a>,
 <a href="/courses-undergrad/apph/">Applied Physiology (APPH)</a>,
 <a href="/courses-undergrad/arbc/">Arabic (ARBC)</a>,
 <a href="/courses-undergrad/arch/">Architecture (ARCH)</a>,
 <a href="/courses-undergrad/bios/">Biological Sciences (BIOS)</a>,
 <a href="/courses-undergrad/biol/">Biology (BIOL)</a>,
 <a href="/courses-undergrad/bmed/">Biomedical Engineering (BMED)</a>,
 <a href="/courses-undergrad/bc/">Building Construction (BC)</a>,
 <a href="/courses-undergrad/cetl/">Center Enhancement-Teach/Learn (CETL)</a>,
 <a href="/courses-undergrad/chbe/">Chemical &amp; Biomolecular Engr (CHBE)</a>,
 <a href="/courses-undergrad/chem/">Chemistry (CHEM)</a>,
 <a href="/

In [5]:
# We just need the department codes from the URLs:

school_code = []
for a in school_list:
    school_code.append(a['href'].split('/')[2]) #extract part after the second /

school_code
# Hint: Information in the below link can be helpful
# https://stackoverflow.com/questions/44790295/how-to-extract-partial-text-from-href-using-beautifulsoup-in-python


['',
 '',
 'acct',
 'ae',
 'as',
 'apph',
 'arbc',
 'arch',
 'bios',
 'biol',
 'bmed',
 'bc',
 'cetl',
 'chbe',
 'chem',
 'chin',
 'cp',
 'cee',
 'coa',
 'coe',
 'cos',
 'cx',
 'cs',
 'coop',
 'ucga',
 'eas',
 'econ',
 'ece',
 'engl',
 'fs',
 'free',
 'fren',
 'gt',
 'gtl',
 'grmn',
 'hs',
 'hebw',
 'hin',
 'hist',
 'hts',
 'hum',
 'id',
 'isye',
 'inta',
 'intn',
 'iac',
 'japn',
 'kor',
 'latn',
 'ls',
 'ling',
 'lmc',
 'mgt',
 'mse',
 'math',
 'me',
 'mp',
 'msl',
 'ml',
 'musi',
 'ns',
 'neur',
 'nre',
 'pers',
 'phil',
 'phys',
 'pol',
 'dopp',
 'psyc',
 'pubp',
 'russ',
 'sci',
 'sls',
 'ss',
 'soc',
 'span',
 'swah',
 'courses-undergrad.pdf']

In [6]:
# after manual check,
# delete NAs and 'courses-undergrad.pdf' from school_code

school_code = [code for code in school_code if code and code != 'courses-undergrad.pdf']
school_code

['acct',
 'ae',
 'as',
 'apph',
 'arbc',
 'arch',
 'bios',
 'biol',
 'bmed',
 'bc',
 'cetl',
 'chbe',
 'chem',
 'chin',
 'cp',
 'cee',
 'coa',
 'coe',
 'cos',
 'cx',
 'cs',
 'coop',
 'ucga',
 'eas',
 'econ',
 'ece',
 'engl',
 'fs',
 'free',
 'fren',
 'gt',
 'gtl',
 'grmn',
 'hs',
 'hebw',
 'hin',
 'hist',
 'hts',
 'hum',
 'id',
 'isye',
 'inta',
 'intn',
 'iac',
 'japn',
 'kor',
 'latn',
 'ls',
 'ling',
 'lmc',
 'mgt',
 'mse',
 'math',
 'me',
 'mp',
 'msl',
 'ml',
 'musi',
 'ns',
 'neur',
 'nre',
 'pers',
 'phil',
 'phys',
 'pol',
 'dopp',
 'psyc',
 'pubp',
 'russ',
 'sci',
 'sls',
 'ss',
 'soc',
 'span',
 'swah']

In [7]:
# Insert code that loops through the school_code  and merge with url = "http://www.catalog.gatech.edu/courses-grad/"

school_urls=[]
for school in school_code:
    # Your code here
    school_urls.append('http://www.catalog.gatech.edu/courses-grad/' + school)

school_urls

['http://www.catalog.gatech.edu/courses-grad/acct',
 'http://www.catalog.gatech.edu/courses-grad/ae',
 'http://www.catalog.gatech.edu/courses-grad/as',
 'http://www.catalog.gatech.edu/courses-grad/apph',
 'http://www.catalog.gatech.edu/courses-grad/arbc',
 'http://www.catalog.gatech.edu/courses-grad/arch',
 'http://www.catalog.gatech.edu/courses-grad/bios',
 'http://www.catalog.gatech.edu/courses-grad/biol',
 'http://www.catalog.gatech.edu/courses-grad/bmed',
 'http://www.catalog.gatech.edu/courses-grad/bc',
 'http://www.catalog.gatech.edu/courses-grad/cetl',
 'http://www.catalog.gatech.edu/courses-grad/chbe',
 'http://www.catalog.gatech.edu/courses-grad/chem',
 'http://www.catalog.gatech.edu/courses-grad/chin',
 'http://www.catalog.gatech.edu/courses-grad/cp',
 'http://www.catalog.gatech.edu/courses-grad/cee',
 'http://www.catalog.gatech.edu/courses-grad/coa',
 'http://www.catalog.gatech.edu/courses-grad/coe',
 'http://www.catalog.gatech.edu/courses-grad/cos',
 'http://www.catalog.gat

In [8]:
data = []
for page in school_urls:
    page = requests.get(page)
    soup = BeautifulSoup(page.text, 'html.parser')
    course_list_codes = soup.find_all('strong')


    for course_list in course_list_codes:
        codes = course_list.contents[0]
        data.append((codes))


    with open('index.csv', 'a') as csv_file:
        writer = csv.writer(csv_file)
        for codes in data:
          writer.writerow([codes])


In [9]:
# define keywords related to data science
sub = ['Data Science', 'Big Data', 'Analytics', 'Data Analysis',
       'Machine Learning', 'Visualization']

# Hint: if you run this cell multiple times, remember to delete the existing Relevant_Courses.csv
# file so that it does not duplicate.
for codes in data:
    contains = any(keyword in codes for keyword in sub)
    if contains:
        print(codes)
        with open('Relevant_Courses.csv', 'a') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow([codes])

BMED 6517.  Machine Learning in Biosciences.  3 Credit Hours.
CHBE 6745.  Data Analytics for Chemical Engineers.  3 Credit Hours.
CP 6006.  Visualization for Planners.  1 Credit Hour.
CP 6543.  Public Health Analytics.  3 Credit Hours.
CP 6545.  Climate Change Analytics.  3 Credit Hours.
CP 6960.  Urban Analytics Capstone Project.  1 Credit Hour.
CP 6962.  Urban Analytics Capstone Project.  5 Credit Hours.
CEE 6327.  Statistical Methods for Environmental Data Analysis and Prediction.  3 Credit Hours.
CS 6220.  Big Data Systems and Analytics.  3 Credit Hours.
CS 6480.  Computer Visualization Techniques.  3 Credit Hours.
CS 6485.  Visualization Methods for Science and Engineering.  3 Credit Hours.
CS 6730.  Data Visualization: Principles and Applications.  3 Credit Hours.
CS 7450.  Information Visualization.  3 Credit Hours.
CS 7451.  Human-Centered Data Analysis.  3 Credit Hours.
CS 7545.  Theoretical Foundations of Machine Learning.  3 Credit Hours.
CS 7641.  Machine Learning.  3 Credi

In [10]:
# read in the dataframe using Pandas
df = pd.read_csv("Relevant_Courses.csv", header = None)

In [11]:
df.head(5)

Unnamed: 0,0
0,BMED 6517. Machine Learning in Biosciences. ...
1,CHBE 6745. Data Analytics for Chemical Engine...
2,CP 6006. Visualization for Planners. 1 Credi...
3,CP 6543. Public Health Analytics. 3 Credit H...
4,CP 6545. Climate Change Analytics. 3 Credit ...


In [12]:
# clean the dataset for analysis
cleaned_df = pd.DataFrame(df[0].str.split(".").tolist())
cleaned_df = cleaned_df.rename(columns ={0: "full_course_number", 1: "course_name", 2: "uncleaned_credit_hours"})
cleaned_df['credit_hours'] = cleaned_df["uncleaned_credit_hours"].str.strip().str.split().str[0].astype(int)
cleaned_df['course_code'] = cleaned_df["full_course_number"].str.strip().str.split().str[0]
cleaned_df.head(3)

Unnamed: 0,full_course_number,course_name,uncleaned_credit_hours,3,4,credit_hours,course_code
0,BMED 6517,Machine Learning in Biosciences,3 Credit Hours,,,3,BMED
1,CHBE 6745,Data Analytics for Chemical Engineers,3 Credit Hours,,,3,CHBE
2,CP 6006,Visualization for Planners,1 Credit Hour,,,1,CP
