Web Scraping of University Reviews from Open Online Sources

This project demonstrates how to scrape user-generated reviews from publicly available web platforms using Selenium and BeautifulSoup.

**Key features:**
- Automatically loads and reveals additional content via browser automation
- Parses structured review data including rating, date, and full text
- Saves extracted data to CSV and text files for further processing or analysis

 Required Libraries

Below are the necessary Python libraries for web automation, HTML parsing, and data processing.

In [None]:
# Selenium and WebDriver components for browser automation
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Common exceptions to handle during web scraping
from selenium.common.exceptions import (
    StaleElementReferenceException, NoSuchElementException,
    ElementClickInterceptedException, TimeoutException
)

# Time module to add delays between browser actions
import time

# Pandas for working with tabular data (DataFrames)
import pandas as pd

# Regular expressions for text pattern matching and cleaning
import re

# BeautifulSoup for parsing and navigating HTML pages
from bs4 import BeautifulSoup

In [None]:
# Open the university website and hide distracting advertisements
def open_site_hide_ads(uni_url):
    """Launch Chrome, load ``uni_url`` and hide ad elements on the page.

    Args:
        uni_url: Address of the page to open.

    Returns:
        The ``webdriver.Chrome`` instance with the page loaded. The caller
        is responsible for calling ``quit()`` on it.

    Raises:
        Exception: Whatever ``driver.get`` or ``driver.execute_script``
            raised. The browser is closed before the error propagates.
    """
    # Hide ads (iframes, Google ads, or dynamically loaded ad containers)
    hide_ads_js = """
        document.querySelectorAll('iframe, ins.adsbygoogle, div[id^="aswift_"]').forEach(el => el.style.display = 'none');
    """

    # Launch a new Chrome browser instance
    driver = webdriver.Chrome()
    try:
        # Navigate to the provided university URL, then hide the ads
        driver.get(uni_url)
        driver.execute_script(hide_ads_js)
    except Exception:
        # On failure the caller never receives the driver and so cannot
        # close it; shut the browser down here instead of orphaning it.
        driver.quit()
        raise

    return driver

In [None]:
# --- Click the "Show more" button until no further reviews can be loaded ---
def click_show_all_reviews(driver):
    """Keep clicking the ``a.btn.blue`` button until it is no longer clickable.

    Before every attempt the ad elements are hidden again. Each attempt
    waits up to 5 seconds for the button to become clickable, clicks it and
    then sleeps 2 seconds so the new content can load.

    The loop ends the first time the wait times out or the click fails with
    one of the handled Selenium exceptions. That is treated as "everything
    is loaded", reported with a printed message, and the function returns.

    Args:
        driver: Selenium WebDriver with the reviews page already open.
    """
    while True:
        try:
            # Hide ads again in case new ones loaded dynamically
            driver.execute_script("""
                document.querySelectorAll('iframe, ins.adsbygoogle, div[id^="aswift_"]').forEach(el => el.style.display = 'none');
            """)

            # Wait until the "Show more" button is clickable
            element = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "a.btn.blue"))
            )

            # Click the button and wait for the page to load new content
            element.click()
            time.sleep(2)

        except (StaleElementReferenceException, NoSuchElementException,
                ElementClickInterceptedException, TimeoutException):
            print("No more 'Show more' button found or an issue occurred. Ending scroll.")
            # Without this break the loop would retry forever once the
            # button is gone, and the function would never return.
            break

In [None]:
# --- Expand the full text of every review on the page ---
def read_all_reviews(driver):
    """Click every 'Читати відгук' ("Read review") link via JavaScript.

    This is best-effort: a failure to locate the links or to click an
    individual link is printed and otherwise ignored. Nothing is returned.

    Args:
        driver: Selenium WebDriver with the reviews page already open.
    """
    def _expand(link):
        # A JavaScript click works even when the element is not directly
        # clickable; the short sleep gives the content time to load.
        try:
            driver.execute_script("arguments[0].click();", link)
            time.sleep(0.5)
        except Exception as err:
            print("Failed to click button:", err)

    try:
        for link in driver.find_elements(By.XPATH, "//a[contains(text(), 'Читати відгук')]"):
            _expand(link)
    except Exception as err:
        print("Failed to find review buttons:", err)

In [None]:
def parse_reviews_full(driver):
    """Parse the loaded page into review records and save the texts to a file.

    Args:
        driver: Selenium WebDriver whose ``page_source`` holds the reviews page.

    Returns:
        A ``(review_lst, filename)`` tuple. ``review_lst`` is a list of dicts
        with the keys 'University Name', 'Total Reviews', 'Average Rating',
        'Review Date', 'Review Rating' and 'Review Text'. ``filename`` is the
        UTF-8 text file the review texts were written to, named
        ``<university name>_reviews.txt`` in the current working directory.

    Raises:
        ValueError: If the review count or average rating element is present
            but its text is not numeric.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # Extract university name (placeholder when the heading is missing)
    name_tag = soup.find('h1', {'class': 'element_name'})
    uni_name = name_tag.text if name_tag is not None else "No data"

    # Extract total number of reviews
    reviews_count_tag = soup.find('div', {'class': 'rtngdescr'})
    count_span = reviews_count_tag.find('span') if reviews_count_tag is not None else None
    reviews_count = int(count_span.text) if count_span is not None else "No data"

    # Extract average rating
    count_stars_tag = soup.find('span', {'class': 'average'})
    count_stars = float(count_stars_tag.text) if count_stars_tag is not None else "No data"

    # Width in pixels that corresponds to one star (e.g. width: 39px → 3 stars)
    px_per_star = 13

    # Prepare to collect individual reviews
    review_lst = []

    # Locate all review blocks on the page
    for review in soup.find_all("div", class_="comment_row"):
        # Extract review date
        date_tag = review.find('span', {'class': 'value-title'})
        date = date_tag.get('title', "No date") if date_tag is not None else "No date"

        # Try to get full review text, fall back to the snippet if not available
        full_text_tag = review.find("span", class_="review-full-text")
        full_text = full_text_tag.get_text(strip=True) if full_text_tag is not None else ""
        if not full_text:
            snippet_tag = review.find("span", class_="review-snippet")
            full_text = snippet_tag.get_text(strip=True) if snippet_tag is not None else "No text"

        # Extract star rating from the inline style of the nested span.
        # Only the first run of digits is used, so further numbers in the
        # style string cannot be glued onto the width.
        stars_tag = review.find('span', {'class': 'star_ring'})
        width_tag = stars_tag.find('span') if stars_tag is not None else None
        width_match = re.search(r'\d+', width_tag.get('style', '')) if width_tag is not None else None
        if width_match:
            stars_value = int(width_match.group()) // px_per_star
        else:
            stars_value = "No rating"

        # Combine all extracted data into a dictionary
        review_lst.append({
            'University Name': uni_name,
            'Total Reviews': reviews_count,
            'Average Rating': count_stars,
            'Review Date': date,
            'Review Rating': stars_value,
            'Review Text': full_text
        })

    # Save review texts to a .txt file for additional reference or inspection;
    # characters that are invalid in file names are replaced with "_".
    safe_uni_name = re.sub(r'[\\/*?:"<>|]', "_", uni_name)
    filename = f"{safe_uni_name}_reviews.txt"
    with open(filename, "w", encoding="utf-8") as f:
        f.writelines(r["Review Text"] + "\n\n" for r in review_lst)

    # NOTE(review): fixed pause kept from the original; its purpose is not
    # evident from this function — confirm before removing.
    time.sleep(3)
    return review_lst, filename


In [None]:
# Empty placeholder table for the collected reviews. `final_df` is rebound
# to the scraped data after the scraping loop further down has run.
final_df = pd.DataFrame(
    columns=[
        'university_name',
        'reviews_count',
        'average_rating',
        'review_date',
        'review_rating',
        'review_text',
    ],
)

In [None]:
# List of university URLs to scrape reviews from.
# As written the list is empty (every entry below is commented out), so the
# `for uni_url in links` loop does nothing until real URLs are added here.
links = [
    # "https://example-university1.edu/reviews",
    # "https://example-university2.edu/reviews",
    # Add more university review page URLs here
]

In [None]:
# --- Loop through all university URLs to collect reviews ---
all_reviews = []             # Review dicts gathered from every page
saved_files_rewiev = []      # Paths of the saved review text files (name left unchanged; it may be referenced elsewhere)

for uni_url in links:
    # Open the university page and hide intrusive ads
    driver = open_site_hide_ads(uni_url)
    try:
        # Load all reviews by clicking "Show More"
        click_show_all_reviews(driver)

        # Expand full review content
        read_all_reviews(driver)

        # Parse all reviews from the current university page
        reviews, file_path = parse_reviews_full(driver)
    finally:
        # Close the browser even when a step above raises, so a failing
        # page does not leave a Chrome instance running.
        driver.quit()

    # Add parsed reviews and the saved file path to the running lists
    all_reviews.extend(reviews)
    saved_files_rewiev.append(file_path)

# After gathering all reviews, convert to a pandas DataFrame
final_df = pd.DataFrame(all_reviews)

# NOTE(review): fixed pause kept from the original; it has no visible
# effect on the result — confirm before removing.
time.sleep(3)

# Preview the first 30 rows of collected reviews
final_df.head(30)


In [None]:
# Export the collected reviews to 'reviews.csv' without the index column.
# The 'utf-8-sig' encoding writes a BOM, for better compatibility with Excel.
final_df.to_csv(
    'reviews.csv',
    index=False,
    encoding='utf-8-sig',
)