# Scraping the text

In [22]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time

visitedLinks = set()

In [23]:
def CrawlWebsite(url, baseUrl, maxDepth=1, currentDepth=0):
    if currentDepth > maxDepth or url in visitedLinks:
        return

    visitedLinks.add(url)

    # Get the webpage content
    try:
        response = requests.get(url)
        content_type = response.headers.get('content-type')

        if 'application/pdf' not in content_type:
            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract text from the page
            pageText = soup.get_text(separator=' ', strip=True) #could use (separator=' ', strip=True) to have already parsed text
            print(f"URL: {url}")
            #print(f"Text:\n{pageText[:500]}...")  # Display first 500 characters
            with open('Dumps/restaurant.txt', 'a') as f:
              f.write(pageText)
              f.close()
            print("\n\n")

            # Find all internal links
            for link in soup.find_all('a', href=True):
                href = link['href']
                parsedHref = urlparse(href)

                # Check if the link is an internal link
                if not parsedHref.netloc or parsedHref.netloc == urlparse(baseUrl).netloc:
                    newUrl = urljoin(baseUrl, href)
                    CrawlWebsite(newUrl, baseUrl, maxDepth, currentDepth + 1)

    except requests.exceptions.RequestException as e:
        print(f"Failed to retrieve {url}: {e}")

    # Pause to avoid overwhelming the server
    time.sleep(0.1)

In [24]:
startUrl = "https://www.pacificpinestavern.com.au/"
#startUrl = "https://clubhelensvale.com.au/"
#startUrl = "https://www.thaikorner.com.au"
baseUrl = startUrl

CrawlWebsite(startUrl, baseUrl)

URL: https://www.pacificpinestavern.com.au/



URL: https://www.pacificpinestavern.com.au/#home-banner-section



URL: https://www.pacificpinestavern.com.au/#home-quick-links-section



URL: https://www.pacificpinestavern.com.au/#home-featured-content-block-section



URL: https://www.pacificpinestavern.com.au/#home-core-content-section



URL: https://www.pacificpinestavern.com.au/#home-whats-on-section



URL: https://www.pacificpinestavern.com.au/#family-bistro-section



URL: https://www.pacificpinestavern.com.au/#new-page-section



URL: https://www.pacificpinestavern.com.au/#home-dining-2-section



URL: https://www.pacificpinestavern.com.au/#home-functions-section



URL: https://www.pacificpinestavern.com.au/#home-gallery-section



URL: https://www.pacificpinestavern.com.au/#home-sfmc-popup-section-section



URL: https://www.pacificpinestavern.com.au/dining#dining-room-1-section



URL: https://www.pacificpinestavern.com.au/dining#dining-room-2-section



URL: https://www.pac

# Generating NERs

In [9]:
import spacy
nlp = spacy.load('en_core_web_md')

In [20]:
with open('Dumps/restaurant.txt', "rt") as f:
    text = f.read()

print(text)

Our location 07 5573 1491 Home Eat + Drink Play What’s On Live Entertainment Events Functions Bowls Bowls Info Calendar Social Bowls & Events Club Championships – Men Club Championships – Women Club Championships – Mixed Barefoot Bowls Jack Attack Bowls Gallery About Membership Our Community Our Partners Advertising Careers Club Services & Facilities Entry Requirements Future Building Projects Our Board of Directors Contact Us Home Eat + Drink Play What’s On What’s On Live Entertainment Events Bowls Bowls Calendar Social Bowls & Events Bowls Info Barefoot Bowls Jack Attack Bowls Gallery About About Membership Our Community Our Partners Advertising Careers Club Services & Facilities Entry Requirements Future Building Projects Our Board of Directors Contact Us 07 5573 1491 Page not found Sorry, that page does not exist or has been moved. Click here to go to our home page . Club information Latest News Functions Careers Premium Rewards Club Services & Facilities Entry Requirements Find us

In [11]:
doc = nlp(text)

print(len(doc.ents))
for ent in doc.ents:
    print(ent.text, ent.label_)

278
Thai NORP
Home About Us Menu Contact Us ORG
07 5529 DATE
PIMPAMA ORG
6796 CARDINAL
Authentic Thai Cuisine Order Online View ORG
Taste of Thai Food ORG
Thai Korner ORG
Thai NORP
date nights DATE
Bring Thai Korner ORG
two CARDINAL
today DATE
Coomera Shop ORG
07 CARDINAL
4:30PM - 9:00PM TIME
11:00AM - 9:00PM TIME
2:30PM - 3:30PM TIME
Thai NORP
Mon                  4:30PM - 9:00PM TIME
11:00AM - 9:00PM TIME
2:30PM - 3:30PM TIME
07 CARDINAL
6796 CARDINAL
Location Phone © ORG
2024 DATE
Thai Korner ORG
U PLUS ORG
Home About Us Menu Contact Us ORG
07 5529 DATE
PIMPAMA ORG
6796 CARDINAL
US GPE
Thai NORP
Asian NORP
Thai Korner ORG
Thailand GPE
CURRY Choice of Proteins Veggies & Tofu Chicken / Beef/Pork Prawns or ORG
4 CARDINAL
Jasmine Rice Selection of PERSON
Curry (Chicken / Beef / Pork ORG
Thai NORP
sultanas ORG
Coomera Shop ORG
07 CARDINAL
4:30PM - 9:00PM TIME
11:00AM - 9:00PM TIME
2:30PM - 3:30PM TIME
Thai NORP
Mon                  4:30PM - 9:00PM TIME
11:00AM - 9:00PM TIME
2:30PM - 3:30

In [12]:
from spacy import displacy

displacy.render(doc, style="ent")

In [13]:
# Remove the duplicates (significantly reduces number of tokens to process)
entities = set((ent.text, ent.label_) for ent in doc.ents)

print(len(entities))
for entity in entities:
    print(entity[0], entity[1])
    #print(entity)

134
Calamari PERSON
20.5 MONEY
Spicy Hokkien Hokkien NORP
07 CARDINAL
two CARDINAL
Asian NORP
19.5 MONEY
Oyster Sauce Mushroom PRODUCT
23.7 MONEY
Seasonal DATE
Lotus Root Chips ORG
Coomera Shop ORG
Mon                  4:30PM - 9:00PM TIME
Pad Kee Mow Thick PERSON
4 CARDINAL
$13.5 Chicken MONEY
Tom Yum PERSON
21.5 MONEY
Orange Juice ORG
11:00AM - 9:00PM TIME
7.8 MONEY
25.7 MONEY
Laksa NORP
12 MONEY
Lamb/Duck/Crispy Pork ORG
PIMPAMA ORG
22.5 CARDINAL
Chilli & Garlic Mushroom ORG
Tom Kha PERSON
9.5 MONEY
26.8 CARDINAL
Tues - Sun EVENT
mushroom(no coconut cream ORG
Veggies & Tofu  |   Large $ ORG
US GPE
Crispy Chicken ORG
Crispy Chicken WORK_OF_ART
Pad Prik King PERSON
Authentic Thai Cuisine Order Online View ORG
Bring Thai Korner ORG
Pineapple Fried Rice Special PERSON
19.9 MONEY
13.5 MONEY
Jasmine Rice Small PERSON
6 CARDINAL
Location Visit Us Location Thai Korner HomeCo ORG
Tom Kha Mild PERSON
Lamb/Duck/Crispy Pork/Barramundi Fillet ORG
14.7 MONEY
Rice PERSON
Beef Salad Chargrilled WOR

In [14]:
with open("NER/ner_results.txt", 'a') as f:
    for ent in doc.ents:
        f.write(f"{ent.text},{ent.label_}\n")
    f.close()

# ChatGPT Generation of Questions

In [9]:
import os
from openai import OpenAI
from dotenv import load_dotenv, find_dotenv

_ = load_dotenv(find_dotenv())
client = OpenAI(
    api_key = os.environ.get('OPENAI_API_KEY'),
)

In [10]:
class FactPromptGenerator:
    def __init__(self, text, no_facts):
        self.text = text
        self.no_facts = no_facts
        self.system_message = """
            You are tasked with generating a list of facts from text.
            These facts must be truthful and accurate and reflect the information given in the text.
            The facts should also be quite short, they must be less than 7 words long each.
        """
    def GeneratePrompt(self):
        prompt = f"""
            I am seeking your expertise in distilling key information about text.
            Below I have given text that I have extracted from a certain restaurants website,
            could you please list {self.no_facts} facts about this restaurant.

            {self.text}
        """
        return prompt

In [13]:
def ProcessPrompt(promptGen):
    completion = client.chat.completions.create(
        model = "gpt-4o-mini",
        messages = [
            {"role": "system", "content": promptGen.system_message},
            {"role": "user", "content": promptGen.GeneratePrompt()}
        ],
        temperature = 0.3,
        max_tokens = 500,
    )
    return completion.choices[0].message.content

In [14]:
promptGen = FactPromptGenerator(text, 10)
facts = ProcessPrompt(promptGen)
print(facts)

1. Club Helensvale is located in Helensvale.
2. Open Sunday to Thursday, 8:30am-1am.
3. Open Friday and Saturday, 8:30am-2am.
4. Offers live entertainment on weekends.
5. Hosts social and competitive lawn bowls.
6. Features a café and bistro dining.
7. Offers barefoot bowls for casual players.
8. Provides a community benefit funding program.
9. Has a premium rewards membership program.
10. Recently renovated with modern facilities.


In [15]:
class QuestionPromptGenerator:
    def __init__(self, facts, no_questions):
        self.facts = facts
        self.no_questions = no_questions
        self.system_message = """
            You are tasked with generating quiz questions about a certain restaurant from a list of facts about that restaurant.
            The questions should have multiple choice answers with one of them marked as correct.
            The other answers should be similar but slightly different from the correct answer to
            mislead a person trying to complete the quiz.
            These questions should be engaging for a customer of the restaurant to answer.
        """
    def GeneratePrompt(self):
        prompt = f"""
            I am seeking your expertise in generation of quiz questions about a certain restaurant.
            Below, I have given a list of facts about a certain restaurant that I would like you to use in order to generate the quiz questions.

            ---

            Instructions:

            - You must generate {self.no_questions} quiz questions
            - They must be multiple choice
            - They must have the correct answer highlighted
            - Incorrect answers must mislead the quiz participant

            ---

            Facts:

            {self.facts}
        """
        return prompt

In [16]:
promptGen = QuestionPromptGenerator(facts, 5)
questions = ProcessPrompt(promptGen)
print(questions)

Sure! Here are five engaging quiz questions based on the facts about Club Helensvale:

---

**Question 1:** Where is Club Helensvale located?  
A) Helensvale  
B) Helensville  
C) Helensvale Bay  
D) Helensvale Park  

**Correct Answer:** A) **Helensvale**

---

**Question 2:** What are the opening hours for Club Helensvale on Fridays and Saturdays?  
A) 8:30am-1am  
B) 8:30am-12am  
C) 8:30am-2am  
D) 9:00am-2am  

**Correct Answer:** C) **8:30am-2am**

---

**Question 3:** What type of entertainment does Club Helensvale offer on weekends?  
A) Movie screenings  
B) Live entertainment  
C) Trivia nights  
D) Karaoke sessions  

**Correct Answer:** B) **Live entertainment**

---

**Question 4:** What recreational activity can you participate in at Club Helensvale?  
A) Indoor bowling  
B) Lawn bowls  
C) Mini-golf  
D) Croquet  

**Correct Answer:** B) **Lawn bowls**

---

**Question 5:** What program does Club Helensvale offer that benefits the community?  
A) Community outreach progr

# Demo

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time

visitedLinks = set()

In [None]:
def CrawlWebsite(url, baseUrl, maxDepth=1, currentDepth=0):
    if currentDepth > maxDepth or url in visitedLinks:
        return

    visitedLinks.add(url)

    # Get the webpage content
    try:
        response = requests.get(url)
        content_type = response.headers.get('content-type')

        if 'application/pdf' not in content_type:
            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract text from the page
            pageText = soup.get_text(separator=' ', strip=True) #could use (separator=' ', strip=True) to have already parsed text
            print(f"URL: {url}")
            #print(f"Text:\n{pageText[:500]}...")  # Display first 500 characters
            with open('Dumps/restaurant.txt', 'a') as f:
              f.write(pageText)
              f.close()
            print("\n\n")

            # Find all internal links
            for link in soup.find_all('a', href=True):
                href = link['href']
                parsedHref = urlparse(href)

                # Check if the link is an internal link
                if not parsedHref.netloc or parsedHref.netloc == urlparse(baseUrl).netloc:
                    newUrl = urljoin(baseUrl, href)
                    CrawlWebsite(newUrl, baseUrl, maxDepth, currentDepth + 1)

    except requests.exceptions.RequestException as e:
        print(f"Failed to retrieve {url}: {e}")

    # Pause to avoid overwhelming the server
    time.sleep(0.1)

In [None]:
#startUrl = "https://www.pacificpinestavern.com.au/"
startUrl = "https://clubhelensvale.com.au/"
#startUrl = "https://www.thaikorner.com.au"
baseUrl = startUrl

CrawlWebsite(startUrl, baseUrl)

In [None]:
with open('Dumps/restaurant.txt', "rt") as f:
    text = f.read()

print(text)

In [None]:
import os
from openai import OpenAI
from dotenv import load_dotenv, find_dotenv

_ = load_dotenv(find_dotenv())
client = OpenAI(
    api_key = os.environ.get('OPENAI_API_KEY'),
)

In [None]:
class FactPromptGenerator:
    def __init__(self, text, no_facts):
        self.text = text
        self.no_facts = no_facts
        self.system_message = """
            You are tasked with generating a list of facts from text.
            These facts must be truthful and accurate and reflect the information given in the text.
            The facts should also be quite short, they must be less than 7 words long each.
        """
    def GeneratePrompt(self):
        prompt = f"""
            I am seeking your expertise in distilling key information about text.
            Below I have given text that I have extracted from a certain restaurants website,
            could you please list {self.no_facts} facts about this restaurant.

            {self.text}
        """
        return prompt

In [None]:
def ProcessPrompt(promptGen):
    completion = client.chat.completions.create(
        model = "gpt-4o-mini",
        messages = [
            {"role": "system", "content": promptGen.system_message},
            {"role": "user", "content": promptGen.GeneratePrompt()}
        ],
        temperature = 0.3,
        max_tokens = 500,
    )
    return completion.choices[0].message.content

In [None]:
promptGen = FactPromptGenerator(text, 10)
facts = ProcessPrompt(promptGen)
print(facts)

In [None]:
class QuestionPromptGenerator:
    def __init__(self, facts, no_questions):
        self.facts = facts
        self.no_questions = no_questions
        self.system_message = """
            You are tasked with generating quiz questions about a certain restaurant from a list of facts about that restaurant.
            The questions should have multiple choice answers with one of them marked as correct.
            The other answers should be similar but slightly different from the correct answer to
            mislead a person trying to complete the quiz.
            These questions should be engaging for a customer of the restaurant to answer.
        """
    def GeneratePrompt(self):
        prompt = f"""
            I am seeking your expertise in generation of quiz questions about a certain restaurant.
            Below, I have given a list of facts about a certain restaurant that I would like you to use in order to generate the quiz questions.

            ---

            Instructions:

            - You must generate {self.no_questions} quiz questions
            - They must be multiple choice
            - They must have the correct answer highlighted
            - Incorrect answers must mislead the quiz participant

            ---

            Facts:

            {self.facts}
        """
        return prompt

In [None]:
promptGen = QuestionPromptGenerator(facts, 5)
questions = ProcessPrompt(promptGen)
print(questions)