# Link Extraction

In [28]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# URL of the page to scrape
url = 'https://www.mtroyal.ca/Applications/ProgramsOffered/'

# Set up Selenium WebDriver (ensure chromedriver is installed and in PATH)
driver = webdriver.Chrome()

try:
    # Open the webpage with Selenium
    driver.get(url)

    # Block (for at most 20 seconds) until an element with class
    # 'colHeadersRow' is present in the DOM; raises if it never appears.
    wait = WebDriverWait(driver, 20)
    wait.until(
        EC.presence_of_element_located((By.CLASS_NAME, 'colHeadersRow'))
    )
except BaseException:
    # Loading failed, timed out, or was interrupted: close the browser so no
    # Chrome/chromedriver process is left running, then surface the error.
    driver.quit()
    raise

# Function to extract program information from a page
def extract_programs_from_page(driver):
    """Collect program titles and links from the page the driver has loaded.

    Parses ``driver.page_source`` with BeautifulSoup and inspects every
    ``<tr>`` whose class is ``odd`` or ``even``. For each such row, the first
    ``<td>`` supplies the title text and the first ``<a>`` with an ``href``
    inside that cell supplies the link.

    Rows that have no ``<td>``, or whose first cell contains no linked
    ``<a>``, are skipped rather than raising, so a single malformed row
    cannot abort the whole extraction.

    Args:
        driver: Object exposing a ``page_source`` attribute with the HTML to
            parse (a Selenium WebDriver in this notebook).

    Returns:
        list[dict]: One ``{'Program_Title': ..., 'Link': ...}`` dictionary
        per usable row, in document order. The link is the ``href`` value
        exactly as it appears in the page.
    """
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    programs = []

    # Passing a list to class_ matches rows carrying either class.
    for row in soup.find_all('tr', class_=['odd', 'even']):
        title_td = row.find('td')
        if title_td is None:
            # Row without any cell: nothing to extract.
            continue

        # href=True restricts the search to anchors that actually have a link.
        anchor = title_td.find('a', href=True)
        if anchor is None:
            continue

        programs.append({
            'Program_Title': title_td.get_text(strip=True),
            'Link': anchor['href'],
        })

    return programs

# Extract programs from the first page. The browser is closed in `finally`
# so it is shut down even when extraction raises.
try:
    all_programs = extract_programs_from_page(driver)
finally:
    # Close the Selenium WebDriver
    driver.quit()

# Print the extracted program data
for program in all_programs:
    print(f"Program Title: {program['Program_Title']}")
    print(f"Link: {program['Link']}\n")


Program Title: Bachelor of Arts - Anthropology
Link: https://www.mtroyal.ca/ProgramsCourses/FacultiesSchoolsCentres/Arts/Departments/SociologyAnthropology/AnthropologyProgram/index.htm

Program Title: Bachelor of Arts - Criminal Justice
Link: http://www.mtroyal.ca/ProgramsCourses/FacultiesSchoolsCentres/Arts/Departments/EconomicsJusticePolicyStudies/DegreePrograms/BachelorofArts-CriminalJustice/index.htm

Program Title: Bachelor of Arts - English
Link: https://www.mtroyal.ca/ProgramsCourses/FacultiesSchoolsCentres/Arts/Departments/EnglishLanguagesCultures/BachelorOfArtsEnglish/index.htm

Program Title: Bachelor of Arts - History
Link: https://www.mtroyal.ca/ProgramsCourses/FacultiesSchoolsCentres/Arts/Departments/Humanities/program-areas/history/index.htm

Program Title: Bachelor of Arts - Policy Studies
Link: https://www.mtroyal.ca/ProgramsCourses/FacultiesSchoolsCentres/Arts/Departments/EconomicsJusticePolicyStudies/DegreePrograms/BachelorofArts-PolicyStudies/index.htm

Program Title

In [29]:
# Number of program entries collected from the listing page
len(all_programs)

59

In [30]:
import pandas as pd
import csv

# Specify the CSV file name
csv_file = 'Mount_Royal_university_link.csv'

# Column names are spelled out (same keys, same order as the scraped
# dictionaries) so the header is still written when all_programs is empty;
# reading them from all_programs[0] would raise IndexError in that case.
fieldnames = ['Program_Title', 'Link']

# newline='' lets the csv module control line endings itself. The encoding is
# pinned to UTF-8 so non-ASCII titles are written the same way on every
# platform and agree with the default decoding used by pd.read_csv.
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    # Create a CSV writer object
    writer = csv.DictWriter(file, fieldnames=fieldnames)

    # Write the header (column names)
    writer.writeheader()

    # Write the rows (data from the dictionaries)
    writer.writerows(all_programs)


In [31]:
import pandas as pd

# Load the saved program links back from disk into a DataFrame
df=pd.read_csv('Mount_Royal_university_link.csv')

In [32]:
# Non-null entries per column
df.count()

Program_Title    59
Link             59
dtype: int64

In [33]:
# Missing values per column
df.isnull().sum()

Program_Title    0
Link             0
dtype: int64

In [16]:
import requests
from bs4 import BeautifulSoup

# URL of the page
url = 'https://www.mtroyal.ca/ProgramsCourses/FacultiesSchoolsCentres/Arts/Departments/SociologyAnthropology/AnthropologyProgram/index.htm'

# Send a request to fetch the page content. requests has no default timeout,
# so one is set explicitly to keep an unresponsive server from hanging the cell.
response = requests.get(url, timeout=30)

# Stop on 4xx/5xx responses instead of parsing and saving an error page.
response.raise_for_status()

# Parse the page content with BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Find the main content div with class 'wrapper'
main_div = soup.find('div', {'class': 'wrapper'})

if main_div:
    # Remove any header or footer elements inside the main div (if they exist)
    for tag in main_div.find_all(['header', 'footer']):
        tag.decompose()  # Remove these tags and their content

    # Extract the text from the cleaned main div, one text fragment per line
    text = main_div.get_text(separator='\n', strip=True)
    print(text)

    # Save the extracted text to a file
    with open("testing_file.txt", "w", encoding="utf-8") as file:
        file.write(text)

else:
    print("Div with class 'wrapper' not found.")


Toggle navigation
Programs
& Courses
Programs & Courses Home
Programs Offered
Collaborative Degrees
International Designated Programs
Continuing Education
The Conservatory
Faculties/Schools/Centres
General Education
Community Service Learning
Course Listings
About
Fast Facts
Equity, Diversity and Inclusion
Indigenization and Decolonization
Institutional Research & Planning
Teaching & Learning
Giving to Mount Royal
Alumni
Media Room
Marketing & Communications
Governance & Leadership
Explore campus
Parking & Transportation
Sustainable MRU
Apply
Admission
Admission Home
Apply
Admission Requirements
Transcripts for Admission
Viewbook
Dates & Deadlines
Financing Your Education
Guidance Counsellors
Campus Tours
Open House
Admission Information Sessions
Contact Us
Academic
Support
Academic Support Home
Academic Calendar
Advising
Resources & Services
Office of the Registrar
Campus
Services
Campus Services Home
A-Z Services
Getting Involved
Recreation
Cougar Athletics
Campus Resources
Facilitie

In [21]:

# Load the entire saved page text into memory
with open('testing_file.txt', 'r', encoding='utf-8') as file:
    file_content = file.read()

# Phrase to look for (exact, case-sensitive substring match)
search_phrase = "recruitment specialist"

# Report whether the phrase occurs anywhere in the saved text
if search_phrase not in file_content:
    print(f"The phrase '{search_phrase}' was NOT found in the file.")
else:
    print(f"The phrase '{search_phrase}' was found in the file.")



The phrase 'recruitment specialist' was found in the file.


In [34]:
import pandas as pd

# Re-read the saved links and show the value at row position 1, column position 1
# (the 'Link' column, given the header order written to the CSV).
df=pd.read_csv('Mount_Royal_university_link.csv')
df.iloc[1,1]

'http://www.mtroyal.ca/ProgramsCourses/FacultiesSchoolsCentres/Arts/Departments/EconomicsJusticePolicyStudies/DegreePrograms/BachelorofArts-CriminalJustice/index.htm'

In [36]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os

# Create a directory to store the text files
os.makedirs("Mount_Royal_University/", exist_ok=True)

# Characters replaced by '_' when building a file name: space and '/' (as
# before) plus the remaining characters Windows does not allow in file names.
unsafe_filename_chars = set(' /\\:*?"<>|')

# Iterate through each row in the DataFrame
for index, row in df.iterrows():
    degree_name = row['Program_Title']
    url = row['Link']

    # Send a GET request to fetch the HTML content of the webpage. The timeout
    # keeps one unresponsive server from hanging the loop, and network errors
    # are reported and skipped so the remaining programs are still downloaded.
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to retrieve the webpage for {degree_name}. Error: {exc}")
        continue

    if response.status_code != 200:
        print(f"Failed to retrieve the webpage for {degree_name}. Status code: {response.status_code}")
        continue

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the div with class 'wrapper'
    body_container = soup.find('div', {'class': 'wrapper'})
    if body_container is None:
        print(f"Main content not found for {degree_name}.")
        continue

    # Remove any header or footer elements inside the main div (if they exist)
    for tag in body_container.find_all(['header', 'footer']):
        tag.decompose()  # Remove these tags and their content

    # Extract all text from the main content, one text fragment per line
    text_content = body_container.get_text(separator='\n', strip=True)

    # Create a valid filename by replacing invalid characters
    safe_name = ''.join(
        '_' if ch in unsafe_filename_chars else ch for ch in degree_name
    )
    filename = f"Mount_Royal_University/{safe_name}.txt"

    # Save the link and text content to the file
    with open(filename, "w", encoding="utf-8") as file:
        # Write the link at the top
        file.write(f"Link: {url}\n\n")
        # Write the extracted content below the link
        file.write(text_content)

    print(f"Content successfully saved to '{filename}'.")


Content successfully saved to 'Mount_Royal_University/Bachelor_of_Arts_-_Anthropology.txt'.
Content successfully saved to 'Mount_Royal_University/Bachelor_of_Arts_-_Criminal_Justice.txt'.
Content successfully saved to 'Mount_Royal_University/Bachelor_of_Arts_-_English.txt'.
Content successfully saved to 'Mount_Royal_University/Bachelor_of_Arts_-_History.txt'.
Content successfully saved to 'Mount_Royal_University/Bachelor_of_Arts_-_Policy_Studies.txt'.
Content successfully saved to 'Mount_Royal_University/Bachelor_of_Arts_-_Psychology.txt'.
Content successfully saved to 'Mount_Royal_University/Bachelor_of_Arts_-_Sociology.txt'.
Content successfully saved to 'Mount_Royal_University/Bachelor_of_Arts_-_Undeclared.txt'.
Content successfully saved to 'Mount_Royal_University/Bachelor_of_Aviation_Management.txt'.
Content successfully saved to 'Mount_Royal_University/Bachelor_of_Business_Administration_-_Accounting.txt'.
Content successfully saved to 'Mount_Royal_University/Bachelor_of_Business