In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

In [10]:
def scrape_degree_info(url):
    # Fetch the HTML content of the page
    response = requests.get(url)
    content = response.content
    soup = BeautifulSoup(content, 'html.parser')

    abstract_values = {}


    # Extract and Storing Degree name
    degree_name = soup.find('h1', class_='pageTitle').text.strip()
    abstract_values['Degree Name'] = degree_name


    # Extract and Storing Degree type
    degree_type = soup.find('li', class_='keyDetails__item--grad').text.strip()
    abstract_values['Degree type (undergraduate/postgraduate)'] = degree_type


    # Extract and Storing Academic Year, Application Deadline, Duration of the degree and Overseas student fees
    def extract_abstract_values(soup):
        # Define patterns for each abstract value with reg ex
        patterns = {
            'Academic year': r'Academic year\s*\((\d{4}/\d{2})\)\s*([\w\s]+)',
            'Application deadline': r'Application deadline\s*([\w\s]+\d{4})',
            'Duration of the degree': r'Duration\s*([\w\s-]+)',
            'Overseas student fees (£)': r'Overseas students:\s*The \d{4} tuition fee for international students is £([\d,]+)\.',
        }

        # Selecting all content elements
        content_elements = soup.select('section.accordion div.accordion__content')

        # Stripping text from each content element and storing it in a list
        all_content_texts = []
        for content in content_elements:
            all_content_texts.append(content.text.strip())

        # Converting list to string
        all_content_str = ' '.join(all_content_texts)

        # Extract abstract values with reg ex
        for key, pattern in patterns.items():
            match = re.search(pattern, all_content_str)
            if match:
                abstract_values[key] = match.group(1).strip()
            else:
                abstract_values[key] = None

        return abstract_values

    abstract_values = extract_abstract_values(soup)


    # Function to extract Entry Requirement
    def extract_entry_req(soup):
    # Find tag containing the text "Entry requirements"
      entry_requirements_title = soup.find('h1', class_='accordion__title', string='Entry requirements')
      accordion_contents = []

      # If the entry requirements title is found, find its parent section and then find all accordion__content divs within it
      if entry_requirements_title:
          entry_requirements_section = entry_requirements_title.find_parent('section', class_='accordion')
          if entry_requirements_section:
              accordion_contents = entry_requirements_section.find_all('div', class_='accordion__content')

      # Extract text from each accordion content
      for accordion_content in accordion_contents:
          text = accordion_content.get_text(separator='\n', strip=True)

      return text


    # Function to extract Brief degree description
    def extract_summary(soup):
        course_summary_div = soup.find('div', class_='courseSummary')
        if course_summary_div:
            return course_summary_div.get_text(separator='\n', strip=True)
        else:
            return "No course summary found."

    # Extracting entry requirement and summary text
    entry_req = extract_entry_req(soup)
    summary_text = extract_summary(soup)

    # Storing Degree Description
    abstract_values['Admission eligibility/entry requirements'] = entry_req
    abstract_values['Brief degree description'] = summary_text

    return abstract_values


In [11]:
url = "https://www.lse.ac.uk/study-at-lse/Undergraduate/degree-programmes-2024/BA-History"
degree_details = scrape_degree_info(url)
print(degree_details)

{'Degree Name': 'BA History', 'Degree type (undergraduate/postgraduate)': 'Undergraduate', 'Academic year': '2024/25', 'Application deadline': '31 January 2024', 'Duration of the degree': 'Three years full-time\n\nApplications', 'Overseas student fees (£)': '26,184', 'Admission eligibility/entry requirements': "Below we list our entry requirements in terms of GCSEs, A-Levels (the entry requirements should be read alongside our A-level subject combinations information) and the International Baccalaureate (IB) Diploma. We accept a wide range of other\nqualifications from the UK\nand from\noverseas\n.\nGCSEs\nA strong set of GCSE grades including the majority at A (or 7) and A* (or 8-9)\nGCSE (or equivalent) English Language and Mathematics grades should be no lower than B (or 6)\nWe also consider your overall GCSE subject profile\nA-levels*\nAAA\nWe also consider your AS grades, if available\nContextual admissions A-level grades**\nAAB\nIB Diploma\n38 points overall. 766 at higher level\

In [12]:
df = pd.DataFrame([degree_details])
df.to_excel("output.xlsx")
df

Unnamed: 0,Degree Name,Degree type (undergraduate/postgraduate),Academic year,Application deadline,Duration of the degree,Overseas student fees (£),Admission eligibility/entry requirements,Brief degree description
0,BA History,Undergraduate,2024/25,31 January 2024,Three years full-time\n\nApplications,26184,Below we list our entry requirements in terms ...,BA History at LSE is a modern international hi...


In [13]:
test = "https://www.lse.ac.uk/study-at-lse/Undergraduate/degree-programmes-2024/BSc-Data-Science"
degree_details = scrape_degree_info(test)
df = pd.DataFrame([degree_details])
df

Unnamed: 0,Degree Name,Degree type (undergraduate/postgraduate),Academic year,Application deadline,Duration of the degree,Overseas student fees (£),Admission eligibility/entry requirements,Brief degree description
0,BSc Data Science,Undergraduate,2024/25,31 January 2024,Three years full-time\n\nApplications,27192,Below we list our entry requirements in terms ...,BSc Data Science\nDiscover the BSc Data Scienc...


In [14]:
df = pd.DataFrame([degree_details])
df

Unnamed: 0,Degree Name,Degree type (undergraduate/postgraduate),Academic year,Application deadline,Duration of the degree,Overseas student fees (£),Admission eligibility/entry requirements,Brief degree description
0,BSc Data Science,Undergraduate,2024/25,31 January 2024,Three years full-time\n\nApplications,27192,Below we list our entry requirements in terms ...,BSc Data Science\nDiscover the BSc Data Scienc...


In [None]:
df.to_excel("output.xlsx")