### Import Libraries

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

### Scrape UCD Professional Academy website

In [None]:
URL = "https://www.ucd.ie/professionalacademy/"
r = requests.get(URL)
soup = BeautifulSoup(r.content, 'html5lib')

# Read website source code
print(soup.prettify())

# Extract website footer information
table = soup.findAll('a', attrs = {'class':"creative_footer__link"})
print(table)

# Extract courses URLs
subject_areas = []
for item in table:
    items = item['href']
    if items.find("findyourcourse") > 0: 
      subject_areas.append(items)

subject_areas = subject_areas[2:]

# Create a list with all courses URLs
subject_areas_urls = []
for sa in subject_areas:
    new_sa = sa.replace("/professionalacademy/", "")
    subject_areas_urls.append(URL+new_sa)

print(subject_areas_urls)

### Scrape courses details

In [5]:
# Create an empty pandas DataFrame
courses_detailed = pd.DataFrame(columns = ["Type","Course", "Info", "Duration", "Fee"])

# Extract courses details
for url in subject_areas_urls:
  URL = url
  r = requests.get(URL)
  soup = BeautifulSoup(r.content, 'html5lib')
  courses_grid = soup.findAll('div', attrs = {"class":"block_category_courses__grid"})

# For each item in the grid, extract main information
  for course in courses_grid:
    # Course Type
    course_type = course.p.string

    # Course Name
    course_name = course.h2.a.string

    # Course Information
    info = []
    for information in course.findAll('span'):
      info.append(information.string)

    # Course Duration
    duration = course.find('p', attrs = {"class":"block_category_courses__duration"}).get_text()

    # Course Fee
    fee_list = course.find('p', attrs = {"class":"block_category_courses__price"}).text.split()
    if fee_list[0] == "from":
      fee = ' '.join(fee_list)
    else:
      fee = fee_list[0]

    # Extracted Course Dictionary and DataFrame
    data = {"Type": course_type,
            "Course": course_name,
            "Info": [info],
            "Duration" :duration,
            "Fee" : fee}

    df = pd.DataFrame(data)

    # Union course detail extract and the main DataFrame
    courses_detailed = pd.concat([courses_detailed, df])

### Remove Duplicates

In [6]:
# Check DataFrame shape before removing duplicates
print(courses_detailed.shape)

# Remove duplicates and check new shape
courses_detailed.drop_duplicates(subset=['Course'],inplace=True)
print(courses_detailed.shape)

(54, 5)
(38, 5)


### Export scrapped data

In [7]:
# Save DataFrame as .csv on same path as the Jupyter Notebook
courses_detailed.to_csv(index=False)

# Visualize the exported DataFrame
print(courses_detailed)

                               Type                                Course  \
0              Professional Diploma               Leadership & Management   
0              Professional Diploma                    Project Management   
0  Professional Academy Certificate              Business Coaching Skills   
0              Professional Diploma      Effective Business Communication   
0              Professional Diploma                         HR Management   
0                Specialist Diploma            HR Management - Specialist   
0              Professional Diploma    Finance for Non-Financial Managers   
0              Professional Diploma               Supply Chain Management   
0              Professional Diploma                Performance Management   
0              Professional Diploma                     Change Management   
0              Professional Diploma     Digital Transformation Management   
0              Professional Diploma                      Sales Management   