# Demo: Web Scraping and Data Preprocessing with the GT Catalog

### Automate collecting GT course catalog website URLs for all the schools.

In [16]:
# load required packages
import requests # grab the content from the websites
from bs4 import BeautifulSoup #https://www.crummy.com/software/BeautifulSoup/bs4/doc/
import csv
import pandas as pd

In [17]:
url = "https://catalog.gatech.edu/courses-grad/" # undergraduate level courses

page = requests.get(url) # fetch the page

soup = BeautifulSoup(page.text, 'html.parser') # parse the HTML content

In [18]:
# Webpages are structured using HTML, and links are typically stored in <a> (anchor) tags.

school_list = []
for item in soup.findAll('a'): # find all anchor (<a>) tags
    try:
        if '/courses-grad/' in item['href']: # check "/courses-undergrad/" is inside "href"
            school_list.append(item)
    except KeyError:   # skip any missing 'href' attributes
        pass

school_list

  for item in soup.findAll('a'): # find all anchor (<a>) tags


[<a href="/courses-grad/">Graduate-​Level Courses</a>,
 <a href="/courses-grad/ae/">Aerospace Engineering (AE)</a>,
 <a href="/courses-grad/apph/">Applied Physiology (APPH)</a>,
 <a href="/courses-grad/ase/">Applied Systems Engineering (ASE)</a>,
 <a href="/courses-grad/arch/">Architecture (ARCH)</a>,
 <a href="/courses-grad/bios/">Biological Sciences (BIOS)</a>,
 <a href="/courses-grad/biol/">Biology (BIOL)</a>,
 <a href="/courses-grad/bmej/">Biomed Engr/Joint Emory PKU (BMEJ)</a>,
 <a href="/courses-grad/bmed/">Biomedical Engineering (BMED)</a>,
 <a href="/courses-grad/bmem/">Biomedical Engr/Joint Emory (BMEM)</a>,
 <a href="/courses-grad/bc/">Building Construction (BC)</a>,
 <a href="/courses-grad/bcp/">Building Construction - Professional (BCP)</a>,
 <a href="/courses-grad/cetl/">Center Enhancement-Teach/Learn (CETL)</a>,
 <a href="/courses-grad/chbe/">Chemical &amp; Biomolecular Engr (CHBE)</a>,
 <a href="/courses-grad/chem/">Chemistry (CHEM)</a>,
 <a href="/courses-grad/chin/">Ch

In [19]:
# We just need the department codes from the URLs:

school_code = []
for a in school_list:
    school_code.append(a['href'].split('/')[2]) #extract part after the second /

school_code
# Hint: Information in the below link can be helpful
# https://stackoverflow.com/questions/44790295/how-to-extract-partial-text-from-href-using-beautifulsoup-in-python


['',
 'ae',
 'apph',
 'ase',
 'arch',
 'bios',
 'biol',
 'bmej',
 'bmed',
 'bmem',
 'bc',
 'bcp',
 'cetl',
 'chbe',
 'chem',
 'chin',
 'cp',
 'cee',
 'coa',
 'cse',
 'cs',
 'coop',
 'ucga',
 'eas',
 'econ',
 'ecep',
 'ece',
 'fs',
 'free',
 'fren',
 'gtl',
 'grmn',
 'hs',
 'hts',
 'id',
 'isye',
 'inta',
 'il',
 'imba',
 'japn',
 'kor',
 'ling',
 'lmc',
 'mgt',
 'mot',
 'mldr',
 'mse',
 'math',
 'me',
 'mp',
 'ml',
 'musi',
 'nre',
 'phil',
 'phys',
 'ptfe',
 'psyc',
 'pubj',
 'pubp',
 'russ',
 'span',
 'courses-grad.pdf']

In [20]:
# after manual check,
# delete NAs and 'courses-undergrad.pdf' from school_code

school_code = [code for code in school_code if code and code != 'courses-grad.pdf']
school_code

['ae',
 'apph',
 'ase',
 'arch',
 'bios',
 'biol',
 'bmej',
 'bmed',
 'bmem',
 'bc',
 'bcp',
 'cetl',
 'chbe',
 'chem',
 'chin',
 'cp',
 'cee',
 'coa',
 'cse',
 'cs',
 'coop',
 'ucga',
 'eas',
 'econ',
 'ecep',
 'ece',
 'fs',
 'free',
 'fren',
 'gtl',
 'grmn',
 'hs',
 'hts',
 'id',
 'isye',
 'inta',
 'il',
 'imba',
 'japn',
 'kor',
 'ling',
 'lmc',
 'mgt',
 'mot',
 'mldr',
 'mse',
 'math',
 'me',
 'mp',
 'ml',
 'musi',
 'nre',
 'phil',
 'phys',
 'ptfe',
 'psyc',
 'pubj',
 'pubp',
 'russ',
 'span']

In [21]:
# Insert code that loops through the school_code  and merge with url = "http://www.catalog.gatech.edu/courses-grad/"

school_urls=[]
for school in school_code:
    # Your code here
    school_urls.append('http://www.catalog.gatech.edu/courses-grad/' + school)

school_urls

['http://www.catalog.gatech.edu/courses-grad/ae',
 'http://www.catalog.gatech.edu/courses-grad/apph',
 'http://www.catalog.gatech.edu/courses-grad/ase',
 'http://www.catalog.gatech.edu/courses-grad/arch',
 'http://www.catalog.gatech.edu/courses-grad/bios',
 'http://www.catalog.gatech.edu/courses-grad/biol',
 'http://www.catalog.gatech.edu/courses-grad/bmej',
 'http://www.catalog.gatech.edu/courses-grad/bmed',
 'http://www.catalog.gatech.edu/courses-grad/bmem',
 'http://www.catalog.gatech.edu/courses-grad/bc',
 'http://www.catalog.gatech.edu/courses-grad/bcp',
 'http://www.catalog.gatech.edu/courses-grad/cetl',
 'http://www.catalog.gatech.edu/courses-grad/chbe',
 'http://www.catalog.gatech.edu/courses-grad/chem',
 'http://www.catalog.gatech.edu/courses-grad/chin',
 'http://www.catalog.gatech.edu/courses-grad/cp',
 'http://www.catalog.gatech.edu/courses-grad/cee',
 'http://www.catalog.gatech.edu/courses-grad/coa',
 'http://www.catalog.gatech.edu/courses-grad/cse',
 'http://www.catalog.ga

In [22]:
data = []
for page in school_urls:
    page = requests.get(page)
    soup = BeautifulSoup(page.text, 'html.parser')
    course_list_codes = soup.find_all('strong')


    for course_list in course_list_codes:
        codes = course_list.contents[0]
        data.append((codes))


    with open('index.csv', 'a') as csv_file:
        writer = csv.writer(csv_file)
        for codes in data:
          writer.writerow([codes])


In [23]:
# define keywords related to data science
sub = ['Data Science', 'Big Data', 'Analytics', 'Data Analysis',
       'Machine Learning', 'Visualization', 'Natural Language Processing', 'Artificial Intelligence']

# Hint: if you run this cell multiple times, remember to delete the existing Relevant_Courses.csv
# file so that it does not duplicate.
for codes in data:
    contains = any(keyword in codes for keyword in sub)
    if contains:
        print(codes)
        with open('Relevant_Courses.csv', 'a') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow([codes])

BMED 6517.  Machine Learning in Biosciences.  3 Credit Hours.
CHBE 6745.  Data Analytics for Chemical Engineers.  3 Credit Hours.
CP 6006.  Visualization for Planners.  1 Credit Hour.
CP 6543.  Public Health Analytics.  3 Credit Hours.
CP 6545.  Climate Change Analytics.  3 Credit Hours.
CP 6960.  Urban Analytics Capstone Project.  1 Credit Hour.
CP 6962.  Urban Analytics Capstone Project.  5 Credit Hours.
CEE 6327.  Statistical Methods for Environmental Data Analysis and Prediction.  3 Credit Hours.
CSE 6040.  Computing for Data Analysis: Methods and Tools.  3 Credit Hours.
CSE 6242.  Data and Visual Analytics.  3 Credit Hours.
CSE 6243.  Advanced Topics in Machine Learning.  3 Credit Hours.
CSE 6250.  Big Data Analytics for Healthcare.  3 Credit Hours.
CSE 6740.  Computational Data Analysis: Learning, Mining, and Computation.  3 Credit Hours.
CSE 6748.  Applied Analytics Practicum.  6 Credit Hours.
CSE 7750.  Mathematical Foundations of Machine Learning.  3 Credit Hours.
CSE 7751.  P

In [24]:
# read in the dataframe using Pandas
df = pd.read_csv("Relevant_Courses.csv", header = None)

In [25]:
df.head(5)

Unnamed: 0,0
0,BMED 6517. Machine Learning in Biosciences. ...
1,CHBE 6745. Data Analytics for Chemical Engine...
2,CP 6006. Visualization for Planners. 1 Credi...
3,CP 6543. Public Health Analytics. 3 Credit H...
4,CP 6545. Climate Change Analytics. 3 Credit ...


In [26]:
# clean the dataset for analysis
cleaned_df = pd.DataFrame(df[0].str.split(".").tolist())
cleaned_df = cleaned_df.rename(columns ={0: "full_course_number", 1: "course_name", 2: "uncleaned_credit_hours"})
cleaned_df['credit_hours'] = cleaned_df["uncleaned_credit_hours"].str.strip().str.split().str[0].astype(int)
cleaned_df['course_code'] = cleaned_df["full_course_number"].str.strip().str.split().str[0]
cleaned_df.head(3)

Unnamed: 0,full_course_number,course_name,uncleaned_credit_hours,3,4,credit_hours,course_code
0,BMED 6517,Machine Learning in Biosciences,3 Credit Hours,,,3,BMED
1,CHBE 6745,Data Analytics for Chemical Engineers,3 Credit Hours,,,3,CHBE
2,CP 6006,Visualization for Planners,1 Credit Hour,,,1,CP


In [27]:
management_classes = cleaned_df[cleaned_df['course_code'] == "MGT"]
intra_classes = cleaned_df[cleaned_df['course_code'] == "INTA"]
econ_classes = cleaned_df[cleaned_df['course_code'] == "ECON"]
pubp_classes = cleaned_df[cleaned_df['course_code'] == "PUBP"]

total_ssc = management_classes.shape[0] + intra_classes.shape[0] + econ_classes.shape[0] + pubp_classes.shape[0]
total = cleaned_df.shape[0]

percent = total_ssc / total * 100

print(f"Of all the classes offered, {percent}% are social sciences.")


Of all the classes offered, 26.277372262773724% are social sciences.
