# Scraping the text

In [6]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time

visitedLinks = set()

In [7]:
def CrawlWebsite(url, baseUrl, maxDepth=1, currentDepth=0):
    if currentDepth > maxDepth or url in visitedLinks:
        return

    visitedLinks.add(url)

    # Get the webpage content
    try:
        response = requests.get(url)
        content_type = response.headers.get('content-type')

        if 'application/pdf' not in content_type:
            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract text from the page
            pageText = soup.get_text(separator=' ', strip=True) #could use (separator=' ', strip=True) to have already parsed text
            print(f"URL: {url}")
            #print(f"Text:\n{pageText[:500]}...")  # Display first 500 characters
            with open('Dumps/restaurant.txt', 'a') as f:
              f.write(pageText)
              f.close()
            print("\n\n")

            # Find all internal links
            for link in soup.find_all('a', href=True):
                href = link['href']
                parsedHref = urlparse(href)

                # Check if the link is an internal link
                if not parsedHref.netloc or parsedHref.netloc == urlparse(baseUrl).netloc:
                    newUrl = urljoin(baseUrl, href)
                    CrawlWebsite(newUrl, baseUrl, maxDepth, currentDepth + 1)

    except requests.exceptions.RequestException as e:
        print(f"Failed to retrieve {url}: {e}")

    # Pause to avoid overwhelming the server
    time.sleep(0.1)

In [8]:
#startUrl = "https://www.pacificpinestavern.com.au/"
#startUrl = "https://clubhelensvale.com.au/"
startUrl = "https://www.thaikorner.com.au"
baseUrl = startUrl

CrawlWebsite(startUrl, baseUrl)

URL: https://www.thaikorner.com.au



URL: https://www.thaikorner.com.au/about-us



URL: https://www.thaikorner.com.au/menu



URL: https://www.thaikorner.com.au/contact-us



Failed to retrieve tel:+61-7-55294573: No connection adapters were found for 'tel:+61-7-55294573'
Failed to retrieve tel:+61-7-56706796: No connection adapters were found for 'tel:+61-7-56706796'


# Generating NERs

In [9]:
import spacy
nlp = spacy.load('en_core_web_md')

In [10]:
with open('Dumps/restaurant.txt', "rt") as f:
    text = f.read()

print(text)

ThaiKorner | Real Thai Cuisine Gold Coast top of page Home About Us Menu Contact Us More Use tab to navigate through the menu items. Phone COOMERA 07 5529 4573 PIMPAMA 07 5670 6796 Order Online Thai Korner Welcome to Authentic Thai Cuisine Order Online View Our Menu A LITTLE BACKGROUND Genuine Taste of Thai Food With a new level of complex flavours , our flavor comes from pairing bold aromatics and breaking the mould of the ordinary. Make a reservation at Thai Korner and enjoy our authentic Thai cuisine in a cozy and welcoming atmosphere. Our restaurant is perfect for family dinners, date nights, or any special occasion. Book now to secure your table! Bring Thai Korner to You & Your Loved Ones Our menu features a variety of delicious dishes made with fresh, high-quality ingredients. you're looking for a romantic dinner for two or a family gathering, we have something for everyone. Come and enjoy a memorable meal with your loved ones today. Contact Us locations Visit Us Location Thai Ko

In [11]:
doc = nlp(text)

print(len(doc.ents))
for ent in doc.ents:
    print(ent.text, ent.label_)

278
Thai NORP
Home About Us Menu Contact Us ORG
07 5529 DATE
PIMPAMA ORG
6796 CARDINAL
Authentic Thai Cuisine Order Online View ORG
Taste of Thai Food ORG
Thai Korner ORG
Thai NORP
date nights DATE
Bring Thai Korner ORG
two CARDINAL
today DATE
Coomera Shop ORG
07 CARDINAL
4:30PM - 9:00PM TIME
11:00AM - 9:00PM TIME
2:30PM - 3:30PM TIME
Thai NORP
Mon                  4:30PM - 9:00PM TIME
11:00AM - 9:00PM TIME
2:30PM - 3:30PM TIME
07 CARDINAL
6796 CARDINAL
Location Phone © ORG
2024 DATE
Thai Korner ORG
U PLUS ORG
Home About Us Menu Contact Us ORG
07 5529 DATE
PIMPAMA ORG
6796 CARDINAL
US GPE
Thai NORP
Asian NORP
Thai Korner ORG
Thailand GPE
CURRY Choice of Proteins Veggies & Tofu Chicken / Beef/Pork Prawns or ORG
4 CARDINAL
Jasmine Rice Selection of PERSON
Curry (Chicken / Beef / Pork ORG
Thai NORP
sultanas ORG
Coomera Shop ORG
07 CARDINAL
4:30PM - 9:00PM TIME
11:00AM - 9:00PM TIME
2:30PM - 3:30PM TIME
Thai NORP
Mon                  4:30PM - 9:00PM TIME
11:00AM - 9:00PM TIME
2:30PM - 3:30

In [12]:
from spacy import displacy

displacy.render(doc, style="ent")

In [13]:
# Remove the duplicates (significantly reduces number of tokens to process)
entities = set((ent.text, ent.label_) for ent in doc.ents)

print(len(entities))
for entity in entities:
    print(entity[0], entity[1])
    #print(entity)

134
Calamari PERSON
20.5 MONEY
Spicy Hokkien Hokkien NORP
07 CARDINAL
two CARDINAL
Asian NORP
19.5 MONEY
Oyster Sauce Mushroom PRODUCT
23.7 MONEY
Seasonal DATE
Lotus Root Chips ORG
Coomera Shop ORG
Mon                  4:30PM - 9:00PM TIME
Pad Kee Mow Thick PERSON
4 CARDINAL
$13.5 Chicken MONEY
Tom Yum PERSON
21.5 MONEY
Orange Juice ORG
11:00AM - 9:00PM TIME
7.8 MONEY
25.7 MONEY
Laksa NORP
12 MONEY
Lamb/Duck/Crispy Pork ORG
PIMPAMA ORG
22.5 CARDINAL
Chilli & Garlic Mushroom ORG
Tom Kha PERSON
9.5 MONEY
26.8 CARDINAL
Tues - Sun EVENT
mushroom(no coconut cream ORG
Veggies & Tofu  |   Large $ ORG
US GPE
Crispy Chicken ORG
Crispy Chicken WORK_OF_ART
Pad Prik King PERSON
Authentic Thai Cuisine Order Online View ORG
Bring Thai Korner ORG
Pineapple Fried Rice Special PERSON
19.9 MONEY
13.5 MONEY
Jasmine Rice Small PERSON
6 CARDINAL
Location Visit Us Location Thai Korner HomeCo ORG
Tom Kha Mild PERSON
Lamb/Duck/Crispy Pork/Barramundi Fillet ORG
14.7 MONEY
Rice PERSON
Beef Salad Chargrilled WOR

In [14]:
with open("NER/ner_results.txt", 'a') as f:
    for ent in doc.ents:
        f.write(f"{ent.text},{ent.label_}\n")
    f.close()

# ChatGPT Generation of Questions

In [22]:
import os
from openai import OpenAI
from dotenv import load_dotenv, find_dotenv

_ = load_dotenv(find_dotenv())
client = OpenAI(
    api_key = os.environ.get('OPENAI_API_KEY'),
)

In [26]:
class FactPromptGenerator:
    def __init__(self, text, no_facts):
        self.text = text
        self.no_facts = no_facts
        self.system_message = """
            You are tasked with generating a list of facts from text.
            These facts must be truthful and accurate and reflect the information given in the text.
            The facts should also be quite short, they must be less than 7 words long each.
        """
    def GeneratePrompt(self):
        prompt = f"""
            I am seeking your expertise in distilling key information about text.
            Below I have given text that I have extracted from a certain restaurants website,
            could you please list {self.no_facts} facts about this restaurant.

            {self.text}
        """
        return prompt

In [38]:
def ProcessPrompt(promptGen):
    completion = client.chat.completions.create(
        model = "gpt-3.5-turbo",
        messages = [
            {"role": "system", "content": promptGen.system_message},
            {"role": "user", "content": promptGen.GeneratePrompt()}
        ],
        temperature = 0.3,
        max_tokens = 500,
    )
    return completion.choices[0].message.content

In [39]:
promptGen = FactPromptGenerator(text, 10)
facts = ProcessPrompt(promptGen)

In [36]:
class QuestionPromptGenerator:
    def __init__(self, facts, no_questions):
        self.facts = facts
        self.no_questions = no_questions
        self.system_message = """
            You are tasked with generating quiz questions about a certain restaurant from a list of facts about that restaurant.
            The questions should have multiple choice answers with one of them marked as correct.
            The other answers should be similar but slightly different from the correct answer to
            mislead a person trying to complete the quiz.
            These questions should be engaging for a customer of the restaurant to answer.
        """
    def GeneratePrompt(self):
        prompt = f"""
            I am seeking your expertise in generation of quiz questions about a certain restaurant.
            Below, I have given a list of facts about a certain restaurant that I would like you to use in order to generate the quiz questions.

            ---

            Instructions:

            - You must generate {self.no_questions} quiz questions
            - They must be multiple choice
            - They must have the correct answer highlighted
            - Incorrect answers must mislead the quiz participant

            ---

            Facts:

            {self.facts}
        """
        return prompt

In [41]:
promptGen = QuestionPromptGenerator(facts, 5)
questions = ProcessPrompt(promptGen)
print(questions)

Great! Based on the facts provided about ThaiKorner restaurant, here are 5 engaging quiz questions for you:

1. What type of cuisine does ThaiKorner offer on the Gold Coast?
   A) Italian
   B) Thai
   C) Mexican
   D) Japanese
   **Correct Answer: B) Thai**

2. In which Australian state can you find ThaiKorner's two locations?
   A) New South Wales
   B) Victoria
   C) Queensland
   D) Western Australia
   **Correct Answer: C) Queensland**

3. What kind of dining atmosphere does ThaiKorner provide?
   A) Formal and elegant
   B) Casual and relaxed
   C) Fast-paced and modern
   D) Upscale and luxurious
   **Correct Answer: B) Casual and relaxed**

4. What types of dishes can you find on ThaiKorner's menu?
   A) Italian and Chinese
   B) Mexican and Indian
   C) Thai and Japanese
   D) French and Greek
   **Correct Answer: C) Thai and Japanese**

5. When can you visit ThaiKorner for a meal?
   A) Breakfast and lunch
   B) Lunch and afternoon tea
   C) Lunch and dinner
   D) Dinner and 