# Scraping the text

In [42]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time

visitedLinks = set()

In [43]:
def CrawlWebsite(url, baseUrl, maxDepth=1, currentDepth=0):
    if currentDepth > maxDepth or url in visitedLinks:
        return

    visitedLinks.add(url)

    # Get the webpage content
    try:
        response = requests.get(url)
        content_type = response.headers.get('content-type')

        if 'application/pdf' not in content_type:
            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract text from the page
            pageText = soup.get_text(separator=' ', strip=True) #could use (separator=' ', strip=True) to have already parsed text
            print(f"URL: {url}")
            #print(f"Text:\n{pageText[:500]}...")  # Display first 500 characters
            with open('Dumps/restaurant.txt', 'a') as f:
              f.write(pageText)
              f.close()
            print("\n\n")

            # Find all internal links
            for link in soup.find_all('a', href=True):
                href = link['href']
                parsedHref = urlparse(href)

                # Check if the link is an internal link
                if not parsedHref.netloc or parsedHref.netloc == urlparse(baseUrl).netloc:
                    newUrl = urljoin(baseUrl, href)
                    CrawlWebsite(newUrl, baseUrl, maxDepth, currentDepth + 1)

    except requests.exceptions.RequestException as e:
        print(f"Failed to retrieve {url}: {e}")

    # Pause to avoid overwhelming the server
    time.sleep(0.1)

In [44]:
#startUrl = "https://www.pacificpinestavern.com.au/"
startUrl = "https://clubhelensvale.com.au/"
baseUrl = startUrl

CrawlWebsite(startUrl, baseUrl)

URL: https://clubhelensvale.com.au/



URL: https://clubhelensvale.com.au



Failed to retrieve tel:0755731491: No connection adapters were found for 'tel:0755731491'
URL: https://clubhelensvale.com.au/eat-drink/



URL: https://clubhelensvale.com.au/play/



URL: https://clubhelensvale.com.au/promotions/category/promotional-events



URL: https://clubhelensvale.com.au/live-entertainment-page/



URL: https://clubhelensvale.com.au/events/



URL: https://clubhelensvale.com.au/functions-and-events/



URL: https://clubhelensvale.com.au/bowls/



URL: https://clubhelensvale.com.au/bowls/calendar/



URL: https://clubhelensvale.com.au/social-bowls/



URL: https://clubhelensvale.com.au/mens-championships/



URL: https://clubhelensvale.com.au/womens-championships/



URL: https://clubhelensvale.com.au/club-championships-mixed/



URL: https://clubhelensvale.com.au/bowls/barefoot-bowls/



URL: https://clubhelensvale.com.au/?page_id=321150



URL: https://clubhelensvale.com.au/bowls-galler

# Generating Word Embedding and Topic Modelling

In [45]:
import spacy
nlp = spacy.load('en_core_web_md')

In [46]:
with open('Dumps/restaurant.txt', "rt") as f:
    text = f.read()

print(text)

Our location 07 5573 1491 Home Eat + Drink Play What’s On Live Entertainment Events Functions Bowls Bowls Info Calendar Social Bowls & Events Club Championships – Men Club Championships – Women Club Championships – Mixed Barefoot Bowls Jack Attack Bowls Gallery About Membership Our Community Our Partners Advertising Careers Club Services & Facilities Entry Requirements Future Building Projects Our Board of Directors Contact Us Home Eat + Drink Play What’s On What’s On Live Entertainment Events Bowls Bowls Calendar Social Bowls & Events Bowls Info Barefoot Bowls Jack Attack Bowls Gallery About About Membership Our Community Our Partners Advertising Careers Club Services & Facilities Entry Requirements Future Building Projects Our Board of Directors Contact Us 07 5573 1491 Video slide your place to eat. drink. play The award winning Club Helensvale is your place to be on the Gold Coast for dining, free live music, gaming, competitive lawn bowls & barefoot bowls. Make Club Helensvale your

In [47]:
doc = nlp(text)

print(len(doc.ents))
for ent in doc.ents:
    print(ent.text, ent.label_)

4000
07 5573 DATE
Club Helensvale ORG
the Gold Coast LOC
Blu Bistro ORG
Café Discovery PRODUCT
Bowls EVENT
September 10, 2024 DATE
Parmy Night ORG
5 CARDINAL
just $17.90 MONEY
Wednesday DATE
Wednesday DATE
September 11, 2024 DATE
mid week DATE
Blu Bistro ORG
each week DATE
Wednesday DATE
night TIME
Variety ORG
Steak Night Steak Night WORK_OF_ART
September 12, 2024 DATE
Steak Night WORK_OF_ART
most popular night of the week TIME
Thursday DATE
as little as $18 MONEY
250g QUANTITY
18 MONEY
20 MONEY
300 CARDINAL
25 MONEY
Parmy Night Parmy Night ORG
September 17, 2024 DATE
Parmy Night ORG
5 CARDINAL
just $17.90 MONEY
Wednesday DATE
Wednesday DATE
September 18, 2024 DATE
mid week DATE
Blu Bistro ORG
each week DATE
Wednesday DATE
night TIME
Variety ORG
Steak Night Steak Night WORK_OF_ART
September 19, 2024 DATE
Steak Night WORK_OF_ART
most popular night of the week TIME
Thursday DATE
as little as $18 MONEY
250g QUANTITY
18 MONEY
20 MONEY
300 CARDINAL
25 MONEY
Parmy Night Parmy Night ORG
Septe

In [48]:
from spacy import displacy

displacy.render(doc, style="ent")

In [53]:
# Remove the duplicates (significantly reduces number of tokens to process)
entities = set((ent.text, ent.label_) for ent in doc.ents)

print(len(entities))
for entity in entities:
    print(entity[0], entity[1])
    #print(entity)

803
1pm TIME
5.00 MONEY
Commonwealth Games EVENT
World Champions EVENT
September 7 DATE
9:00 am Vet’s Singles TIME
Event Series Barefoot Bowls EVENT
6 1 CARDINAL
the Punt Sports Bar ORG
Social Membership Fees ORG
October 6 October 6 DATE
26 9:00 am - 9:00 pm TIME
second ORDINAL
Wayne Moffatt PERSON
4 9:00 am Event Series Social Bowls TIME
Green Availability ORG
September 7 September 7 DATE
Ann Holmes PERSON
22 2:00 pm - 4:00 pm TIME
5573 CARDINAL
November 13 DATE
7:00 pm - 9:30 pm TIME
14 1:00 pm TIME
6:30 pm - 11:00 pm TIME
6:00 pm TIME
1 12:00 pm - 3:00 pm TIME
16 1 CARDINAL
Village Way LOC
the past couple of months DATE
9:30 am - 11:00 TIME
November 21, 2024 DATE
Drakes Pride Australia CAPTIVATE & OVERCOME ORG
September 9 September 9 DATE
First ORDINAL
January 2 DATE
February 7, 2024 DATE
Club-Helensvale-Cafe-Shoot-14 ORG
September 25 September 25 DATE
night TIME
September 26, 2024 DATE
two CARDINAL
September 28 DATE
1:00 pm - 2:30 pm TIME
Sunday DATE
8pm TIME
World Rankings ORG
Sep

In [26]:
with open("NER/ner_results.txt", 'a') as f:
    for ent in doc.ents:
        f.write(f"{ent.text},{ent.label_}\n")
    f.close()

# ChatGPT Generation of Questions

In [29]:
import os
from openai import OpenAI
from dotenv import load_dotenv, find_dotenv