In [9]:
import json
import requests
from bs4 import BeautifulSoup

# Pages to scrape: student-life and student-support pages on wlv.ac.uk.
# The scraping loop further down fetches each entry once, in the order listed.
urls = [
    "https://www.wlv.ac.uk/university-life/student-life/",
    "https://www.wlv.ac.uk/current-students/student-support/student-support-and-wellbeing-ssw/advice-for-students-with-disabilities-and-specific-learning-disabilities/i-am-a-current-student/",
    "https://www.wlv.ac.uk/current-students/student-support/mental-health-and-wellbeing-advice/",
    "https://www.wlv.ac.uk/current-students/student-support/support-to-study-/",
    "https://www.wlv.ac.uk/current-students/student-support/mental-health-and-wellbeing-advice/i-need-help-now/",
]

# Function to scrape a single URL
def scrape_url(url, session, timeout=10):
    """Download ``url`` and pair each ``<h2>`` heading with the text below it.

    The page's ``<h2>`` and ``<p>`` tags are walked in the order
    ``find_all`` returns them. Each heading becomes a "question"; the
    stripped text of the ``<p>`` tags that follow it, up to the next
    ``<h2>``, is joined with single spaces to form its "answer".
    Paragraphs that come before the first non-empty heading, and headings
    that have no paragraphs, are dropped.

    Args:
        url: Address of the page to download.
        session: A ``requests.Session`` (or any object with a compatible
            ``get(url, headers=..., timeout=...)`` method).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``{"question": str, "answer": str}`` dicts. The list is
        empty when the request fails or the server does not answer with
        HTTP 200; in both cases a message is printed.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    try:
        # requests waits indefinitely unless a timeout is given, so one
        # stalled server would otherwise hang the whole scraping run.
        response = session.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported the same way as a bad
        # status code so that one unreachable page does not abort the run.
        print(f"Failed to retrieve {url}. Error: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to retrieve {url}. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    qa_pairs = []
    current_heading = None
    current_paragraphs = []

    def save_current_section():
        """Record the section collected so far if it has a heading and text."""
        if current_heading and current_paragraphs:
            qa_pairs.append({
                "question": current_heading,
                "answer": " ".join(current_paragraphs).strip()
            })

    for element in soup.find_all(['h2', 'p']):
        if element.name == 'h2':
            # A new heading closes the previous section.
            save_current_section()
            current_heading = element.text.strip()
            current_paragraphs = []
        elif element.name == 'p' and current_heading:
            current_paragraphs.append(element.text.strip())

    # The last section has no following <h2> to close it.
    save_current_section()
    return qa_pairs

# Load pairs saved by a previous run (if any) so results accumulate across runs.
try:
    with open("scraped_data.json", "r", encoding="utf-8") as file:
        all_scraped_data = json.load(file)
except FileNotFoundError:
    # First run: nothing has been saved yet.
    all_scraped_data = []

# Scrape every URL over one shared session. The ``with`` block closes the
# session (and its pooled connections) once scraping is finished.
with requests.Session() as session:
    for url in urls:
        print(f"Scraping {url}...")
        scraped_data = scrape_url(url, session)
        all_scraped_data.extend(scraped_data)

# Filter out irrelevant pairs: keep only answers that are non-empty, have
# more than 5 words, and contain none of the boilerplate keywords.
irrelevant_keywords = ["Read More", "WLV News", "Click here", "Read more"]
filtered_pairs = [
    pair
    for pair in all_scraped_data
    if pair['answer']
    and len(pair['answer'].split()) > 5
    and not any(keyword in pair['answer'] for keyword in irrelevant_keywords)
]

# Remove duplicate Q&A pairs, keeping the first occurrence of each
# (question, answer) combination and preserving the original order.
unique_pairs = []
seen = set()
for pair in filtered_pairs:
    pair_key = (pair['question'], pair['answer'])
    if pair_key not in seen:
        unique_pairs.append(pair)
        seen.add(pair_key)

# Save unique pairs to a JSON file (overwrites the file loaded above).
with open("scraped_data.json", "w", encoding="utf-8") as file:
    json.dump(unique_pairs, file, indent=4)

print("Scraped data saved to scraped_data.json.")

Scraping https://www.wlv.ac.uk/university-life/student-life/...
Scraping https://www.wlv.ac.uk/current-students/student-support/student-support-and-wellbeing-ssw/advice-for-students-with-disabilities-and-specific-learning-disabilities/i-am-a-current-student/...
Scraping https://www.wlv.ac.uk/current-students/student-support/mental-health-and-wellbeing-advice/...
Scraping https://www.wlv.ac.uk/current-students/student-support/support-to-study-/...
Scraping https://www.wlv.ac.uk/current-students/student-support/mental-health-and-wellbeing-advice/i-need-help-now/...
Scraped data saved to scraped_data.json.
