In [1]:
# install libraries
!pip install gradio langchain-community langchain-openai langchain-chroma

Collecting gradio
  Downloading gradio-5.12.0-py3-none-any.whl.metadata (16 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.15-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-openai
  Downloading langchain_openai-0.3.1-py3-none-any.whl.metadata (2.7 kB)
Collecting langchain-chroma
  Downloading langchain_chroma-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.5.4 (from gradio)
  Downloading gradio_client-1.5.4-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Do

In [2]:
# imports libraries
import os
import requests
import json
from bs4 import BeautifulSoup
from openai import OpenAI
from google.colab import userdata
from tqdm import tqdm
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings
import glob
import gradio as gr

In [3]:
# chat model used for the OpenAI calls in this notebook; feel free to change it to other models
MODEL = "gpt-4o"

# read the API key from Google Colab's Secrets and build the OpenAI client with it
openai_api_key = userdata.get('OPENAI_API_KEY')
openai = OpenAI(api_key=openai_api_key)

In [4]:
# HTTP headers sent with every page request: they carry a desktop-browser User-Agent string
_BROWSER_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/117.0.0.0 Safari/537.36"
)
headers = {"User-Agent": _BROWSER_USER_AGENT}

class Website:
    """
    a class to fetch and process content from a specified webpage URL.

    url (str): the URL of the webpage.
    body (bytes): raw HTML content of the webpage.
    title (str): the title of the webpage, or "No title found" when the page has no usable title.
    text (str): cleaned and visible text content of the webpage.
    links (list): the hrefs found on the page that start with LINK_PREFIX (duplicates kept).
    """

    # only links that point into the Octopath Traveler wiki are kept
    LINK_PREFIX = "https://octopathtraveler.fandom.com/wiki/"

    def __init__(self, url, timeout=30):
        """
        Initializes the Website instance, fetches the webpage content, and processes it.

        url (str): The URL of the webpage to process.
        timeout (float): seconds to wait for the server before giving up; without a timeout
            a stalled connection would block the notebook indefinitely.
        """
        self.url = url
        response = requests.get(url, headers=headers, timeout=timeout)

        # store the raw HTML content of the response
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')

        # extract the webpage title; a missing <title> tag, or one without a single text
        # node (its `.string` is then None), falls back to the default message
        title_tag = soup.title
        if title_tag is not None and title_tag.string:
            self.title = title_tag.string
        else:
            self.title = "No title found"

        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose() # remove unnecessary tags
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""

        # keep only anchors that have an href pointing into the wiki
        hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
        self.links = [href for href in hrefs if href and href.startswith(self.LINK_PREFIX)]

    def get_links(self):
        """
        Returns a set of unique links found on the webpage.

        Returns:
            set: A set of unique links.
        """
        return set(self.links)

    def get_contents(self):
        """
        Returns the title and text content of the webpage.

        Returns:
            str: The title and text content of the webpage.
        """
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [5]:
# scrape the game's main wiki page and collect the unique wiki links found on it
main_page = Website("https://octopathtraveler.fandom.com/wiki/Octopath_Traveler_II")
all_links = main_page.get_links()
all_links

{'https://octopathtraveler.fandom.com/wiki/Accessories',
 'https://octopathtraveler.fandom.com/wiki/Accessories_(Octopath_Traveler_II)',
 'https://octopathtraveler.fandom.com/wiki/Agnea_Bristarni',
 'https://octopathtraveler.fandom.com/wiki/Alfyn_Greengrass',
 'https://octopathtraveler.fandom.com/wiki/Apothecary',
 'https://octopathtraveler.fandom.com/wiki/Apothecary_(Octopath_Traveler_II)',
 'https://octopathtraveler.fandom.com/wiki/Blog:Recent_posts',
 'https://octopathtraveler.fandom.com/wiki/Body_Armor',
 'https://octopathtraveler.fandom.com/wiki/Body_Armor_(Octopath_Traveler_II)',
 'https://octopathtraveler.fandom.com/wiki/Brightlands',
 'https://octopathtraveler.fandom.com/wiki/Castti_Florenz',
 'https://octopathtraveler.fandom.com/wiki/Category:Bosses',
 'https://octopathtraveler.fandom.com/wiki/Category:Champions_of_the_Continent_Boss_Minions',
 'https://octopathtraveler.fandom.com/wiki/Category:Champions_of_the_Continent_Bosses',
 'https://octopathtraveler.fandom.com/wiki/Cate

In [6]:
# system prompt for the link-selection call: describes the task and shows the JSON shape to return
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in a knowledge base about this game, \
such as links to an Character page, Jobs page, Regions page,  etc.\n"
link_system_prompt += "You should respond in JSON as in this example:"
# the example has to be valid JSON itself (a comma between the "type" and "url" pairs),
# because the model is asked to answer with a JSON object of the same shape
link_system_prompt += """
{
    "links": [
        {"type": "character", "url": "https://full.url/goes/here/character"},
        {"type": "location", "url": "https://another.full.url/location"}
    ]
}
"""


In [7]:
def get_links_user_prompt(website):
    """
    Builds the user prompt that asks the model to pick the relevant links of a webpage.

    Args:
        website: object exposing `url` (str) and `links` (iterable of str), e.g. a Website

    Returns:
        str: the instructions followed by the page's links, one per line. A link that
        occurs several times on the page is listed only once (first occurrence wins).
    """
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for a guide about the game, respond with the full https URL in JSON format. \
Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "Links (some might be relative links):\n"
    # `website.links` keeps duplicates; repeating a link only costs tokens, so drop the
    # repeats while preserving the order in which the links appear on the page
    user_prompt += "\n".join(dict.fromkeys(website.links))
    return user_prompt

In [8]:
def get_links(url):
    """
    Asks the chat model which links of a webpage are worth adding to the knowledge base.

    Args:
        url (str): address of the webpage whose links should be evaluated.

    Returns:
        dict: the model's JSON reply, parsed into Python objects.

    Raises:
        Exception: anything raised while fetching the page, calling the OpenAI API or
        parsing the reply as JSON is propagated unchanged.
    """
    page = Website(url)
    conversation = [
        {"role": "system", "content": link_system_prompt},
        {"role": "user", "content": get_links_user_prompt(page)},
    ]
    completion = openai.chat.completions.create(
        model=MODEL,
        messages=conversation,
        # ask the API for a JSON object so the reply can be parsed directly
        response_format={"type": "json_object"},
    )
    return json.loads(completion.choices[0].message.content)

In [9]:
# let the model pick the knowledge-base-worthy links from the game's category page
category_url = "https://octopathtraveler.fandom.com/wiki/Category:Octopath_Traveler_II"
useful_links = get_links(category_url)
useful_links

{'links': [{'type': 'location',
   'url': 'https://octopathtraveler.fandom.com/wiki/Orsterra'},
  {'type': 'location',
   'url': 'https://octopathtraveler.fandom.com/wiki/Frostlands'},
  {'type': 'location',
   'url': 'https://octopathtraveler.fandom.com/wiki/Flatlands'},
  {'type': 'location',
   'url': 'https://octopathtraveler.fandom.com/wiki/Coastlands'},
  {'type': 'location',
   'url': 'https://octopathtraveler.fandom.com/wiki/Highlands'},
  {'type': 'location',
   'url': 'https://octopathtraveler.fandom.com/wiki/Sunlands'},
  {'type': 'location',
   'url': 'https://octopathtraveler.fandom.com/wiki/Riverlands'},
  {'type': 'location',
   'url': 'https://octopathtraveler.fandom.com/wiki/Cliftlands'},
  {'type': 'location',
   'url': 'https://octopathtraveler.fandom.com/wiki/Woodlands'},
  {'type': 'character',
   'url': 'https://octopathtraveler.fandom.com/wiki/Ophilia_Clement'},
  {'type': 'character',
   'url': 'https://octopathtraveler.fandom.com/wiki/Cyrus_Albright'},
  {'type

In [10]:
def message_for(file_name, content):
  """
  Builds the chat messages that ask the model to extract knowledge about one topic.

  Args:
      file_name (str): name of the topic; it is inserted into the system prompt
      content (str): text that is sent as the user message

  Returns:
      list: a system message followed by a user message, each a dict with "role" and "content"
  """
  instructions = "".join([
    "You are a specialized assistant tasked with gathering relevant and accurate information ",
    f"about {file_name} related to Octopath Traveler (video game). When provided with contents, ",
    f"your role is to extract and present all relevant details about {file_name}, focusing solely ",
    "on gathering information without summarizing, interpreting, or analyzing. Respond in markdown.",
  ])
  system_message = {"role": "system", "content": instructions}
  user_message = {"role": "user", "content": content}
  return [system_message, user_message]


In [11]:
def create_knowledge_base(useful_links, knowledge_base, overwrite=True):
  """
  Scrapes every selected link and stores the knowledge the model extracts from it as markdown.

  One sub-folder per link type is created under `knowledge_base`; each page is written to
  `<knowledge_base>/<type>/<last URL segment>.md`.

  Args:
      useful_links (dict): {"links": [{"type": ..., "url": ...}, ...]}, as returned by get_links
      knowledge_base (str): root folder of the knowledge base; created when it does not exist
      overwrite (bool): when True (default) every page is regenerated; when False, pages whose
          markdown file already exists are skipped, which saves API calls when more URLs are
          scraped later
  """
  from urllib.parse import unquote  # stdlib; imported here so this cell works on its own

  os.makedirs(knowledge_base, exist_ok=True)
  for link in tqdm(useful_links["links"]):
    folder_path = f"{knowledge_base}/{link['type']}"
    # last path segment of the URL, still percent-encoded (e.g. "H%27aanit"); used as file name
    slug = link["url"].rstrip("/").split("/")[-1]
    file_path = os.path.join(folder_path, slug) + ".md"
    # decoded form (e.g. "H'aanit"), so the model and the log get the real page name
    topic = unquote(slug)
    if not os.path.exists(folder_path):
      os.makedirs(folder_path)
      print(f"{folder_path} created.")

    # skip pages that were already generated unless a full rebuild is requested
    if not overwrite and os.path.exists(file_path):
      continue

    print(f"Creating knowledge for {topic}")
    content = Website(link["url"]).get_contents()
    messages = message_for(topic, content)
    response = openai.chat.completions.create(
      model=MODEL,
      messages=messages
    )

    # write knowledge into files; UTF-8 is set explicitly because the files are read back as
    # UTF-8 later, and an empty string is written if the model returned no content
    with open(file_path, "w", encoding="utf-8") as f:
      f.write(response.choices[0].message.content or "")
    print("\nDone!")

In [12]:
# build the markdown knowledge base from the selected links (one model call per page)
create_knowledge_base(useful_links, knowledge_base="octopath-knowledge")

  0%|          | 0/47 [00:00<?, ?it/s]

octopath-knowledge/location created.
Creating knowledge for Orsterra


  2%|▏         | 1/47 [00:14<10:57, 14.28s/it]


Done!
Creating knowledge for Frostlands


  4%|▍         | 2/47 [00:26<09:36, 12.80s/it]


Done!
Creating knowledge for Flatlands


  6%|▋         | 3/47 [00:46<11:56, 16.29s/it]


Done!
Creating knowledge for Coastlands


  9%|▊         | 4/47 [00:57<10:06, 14.10s/it]


Done!
Creating knowledge for Highlands


 11%|█         | 5/47 [01:12<10:13, 14.60s/it]


Done!
Creating knowledge for Sunlands


 13%|█▎        | 6/47 [01:28<10:17, 15.06s/it]


Done!
Creating knowledge for Riverlands


 15%|█▍        | 7/47 [01:52<11:59, 17.99s/it]


Done!
Creating knowledge for Cliftlands


 17%|█▋        | 8/47 [02:41<18:07, 27.87s/it]


Done!
Creating knowledge for Woodlands


 19%|█▉        | 9/47 [02:55<14:50, 23.45s/it]


Done!
octopath-knowledge/character created.
Creating knowledge for Ophilia_Clement


 21%|██▏       | 10/47 [03:17<14:16, 23.15s/it]


Done!
Creating knowledge for Cyrus_Albright


 23%|██▎       | 11/47 [03:39<13:35, 22.66s/it]


Done!
Creating knowledge for Tressa_Colzione


 26%|██▌       | 12/47 [04:03<13:31, 23.17s/it]


Done!
Creating knowledge for Olberic_Eisenberg


 28%|██▊       | 13/47 [04:18<11:38, 20.53s/it]


Done!
Creating knowledge for Primrose_Azelhart


 30%|██▉       | 14/47 [04:31<10:06, 18.39s/it]


Done!
Creating knowledge for Alfyn_Greengrass


 32%|███▏      | 15/47 [04:46<09:10, 17.22s/it]


Done!
Creating knowledge for Therion


 34%|███▍      | 16/47 [05:15<10:44, 20.78s/it]


Done!
Creating knowledge for H%27aanit


 36%|███▌      | 17/47 [05:32<09:49, 19.67s/it]


Done!
octopath-knowledge/page created.
Creating knowledge for Jobs


 38%|███▊      | 18/47 [05:45<08:31, 17.63s/it]


Done!
Creating knowledge for Category:Inventory


 40%|████      | 19/47 [05:51<06:41, 14.36s/it]


Done!
Creating knowledge for Category:Game_Mechanics


 43%|████▎     | 20/47 [06:10<07:00, 15.56s/it]


Done!
Creating knowledge for Category:Enemies


 45%|████▍     | 21/47 [06:17<05:38, 13.01s/it]


Done!
Creating knowledge for Category:Bosses


 47%|████▋     | 22/47 [06:31<05:30, 13.23s/it]


Done!
Creating knowledge for Side_Stories


 49%|████▉     | 23/47 [06:48<05:47, 14.46s/it]


Done!
Creating knowledge for Octopath_Traveler_II


 51%|█████     | 24/47 [07:02<05:27, 14.23s/it]


Done!
Creating knowledge for Solistia


 53%|█████▎    | 25/47 [07:26<06:22, 17.37s/it]


Done!
Creating knowledge for Toto%27haha


 55%|█████▌    | 26/47 [07:38<05:31, 15.78s/it]


Done!
Creating knowledge for Harborlands


 57%|█████▋    | 27/47 [07:51<04:58, 14.94s/it]


Done!
Creating knowledge for Brightlands


 60%|█████▉    | 28/47 [08:04<04:31, 14.31s/it]


Done!
Creating knowledge for Winterlands


 62%|██████▏   | 29/47 [08:16<04:02, 13.50s/it]


Done!
Creating knowledge for Wildlands


 64%|██████▍   | 30/47 [08:24<03:21, 11.86s/it]


Done!
Creating knowledge for Leaflands


 66%|██████▌   | 31/47 [08:36<03:12, 12.05s/it]


Done!
Creating knowledge for Crestlands


 68%|██████▊   | 32/47 [08:58<03:41, 14.79s/it]


Done!
Creating knowledge for Hinoeuma


 70%|███████   | 33/47 [09:22<04:07, 17.64s/it]


Done!
Creating knowledge for Ochette


 72%|███████▏  | 34/47 [09:48<04:24, 20.34s/it]


Done!
Creating knowledge for Castti_Florenz


 74%|███████▍  | 35/47 [10:09<04:04, 20.36s/it]


Done!
Creating knowledge for Thron%C3%A9_Anguis


 77%|███████▋  | 36/47 [10:24<03:25, 18.69s/it]


Done!
Creating knowledge for Osvald_V._Vanstein


 79%|███████▊  | 37/47 [10:35<02:43, 16.36s/it]


Done!
Creating knowledge for Partitio_Yellowill


 81%|████████  | 38/47 [11:05<03:05, 20.62s/it]


Done!
Creating knowledge for Agnea_Bristarni


 83%|████████▎ | 39/47 [11:22<02:35, 19.41s/it]


Done!
Creating knowledge for Temenos_Mistral


 85%|████████▌ | 40/47 [11:46<02:26, 20.90s/it]


Done!
Creating knowledge for Hikari_Ku


 87%|████████▋ | 41/47 [12:19<02:26, 24.36s/it]


Done!
Creating knowledge for Jobs_(Octopath_Traveler_II)


 89%|████████▉ | 42/47 [12:25<01:35, 19.08s/it]


Done!
Creating knowledge for Inventory


 91%|█████████▏| 43/47 [12:31<01:00, 15.18s/it]


Done!
Creating knowledge for Category:Octopath_Traveler_II_Game_Mechanics


 94%|█████████▎| 44/47 [12:36<00:35, 11.89s/it]


Done!
Creating knowledge for Category:Octopath_Traveler_II_Enemies


 96%|█████████▌| 45/47 [12:47<00:23, 11.80s/it]


Done!
Creating knowledge for Category:Octopath_Traveler_II_Bosses


 98%|█████████▊| 46/47 [13:01<00:12, 12.38s/it]


Done!
Creating knowledge for Side_Stories_(Octopath_Traveler_II)


100%|██████████| 47/47 [13:23<00:00, 17.10s/it]


Done!





In [22]:
db_name = "octopath_db"

# take every sub-folder of the knowledge base. Plain files that match the pattern are
# skipped, because each entry is handed to DirectoryLoader, which only accepts directories
folders = [path for path in glob.glob("octopath-knowledge/*") if os.path.isdir(path)]

def add_metadata(doc, doc_type):
    """
    Tags a document with its type and hands the same (mutated) document back.

    Args:
        doc: object with a dict `metadata` attribute
        doc_type (str): value stored under the "doc_type" metadata key

    Returns:
        the `doc` object that was passed in
    """
    doc.metadata.update({"doc_type": doc_type})
    return doc

# read the knowledge files as UTF-8;
# if that fails, try text_loader_kwargs = {'autodetect_encoding': True} instead
text_loader_kwargs = dict(encoding='utf-8')

# load every markdown file and label each document with the name of the folder it came from
documents = []
for folder in folders:
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)
    for loaded_doc in loader.load():
        documents.append(add_metadata(loaded_doc, doc_type))

# split the documents into overlapping chunks
text_splitter = CharacterTextSplitter(chunk_size=1500, chunk_overlap=300)
chunks = text_splitter.split_documents(documents)

document_types = list({entry.metadata['doc_type'] for entry in documents})
print(f"Total number of chunks: {len(chunks)}")
print(f"Document types found: {document_types}")

Total number of chunks: 109
Document types found: ['page', 'character', 'location']


In [23]:
# Embed every chunk and keep the vectors in a Chroma vector store persisted under db_name

embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
# An alternative embedding model:
# embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Start from a clean slate: drop the collection left behind by an earlier run
if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

# Create vectorstore
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=db_name,
)
stored_count = vectorstore._collection.count()
print(f"Vectorstore created with {stored_count} documents")

Vectorstore created with 109 documents


In [24]:
import random
from matplotlib import colors as mcolors  # aliased: a later cell binds the name `colors` to a list

def generate_random_colors(num_colors, seed=None):
    """
    Picks `num_colors` distinct CSS4 color names at random.

    Args:
        num_colors (int): how many colors to pick
        seed: optional seed; pass a fixed value to get the same colors on every run

    Returns:
        list: `num_colors` distinct color names

    Raises:
        ValueError: if `num_colors` is larger than the number of available CSS4 colors
    """
    color_names = list(mcolors.CSS4_COLORS.keys())
    # a private generator keeps the choice reproducible without touching the global random state
    return random.Random(seed).sample(color_names, num_colors)

# one color per document type. `document_types` is used instead of `documents`, because a later
# cell rebinds `documents` to the raw vector store results and re-running this cell would fail
num_colors = len(document_types)
random_colors = generate_random_colors(num_colors, seed=42)
print(random_colors)

['mediumblue', 'slateblue', 'lightblue']


In [25]:
# prework for visualizing vectorstore: pull vectors, texts and metadata out of the collection
# NOTE: `documents` and `colors` are rebound here; the plotting cells rely on these names

collection = vectorstore._collection
result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
documents, metadatas = result['documents'], result['metadatas']
doc_types = [entry['doc_type'] for entry in metadatas]
# each chunk gets the color assigned to its document type
colors = list(map(lambda t: random_colors[document_types.index(t)], doc_types))

In [31]:
# visualize vectorstore in 2D

# project the embedding vectors down to two dimensions
tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    # hover label: document type plus the first 100 characters of the chunk
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='2D Vector Store Visualization',
    # a 2D figure takes its axis titles directly on the layout; `scene` only applies to 3D
    # plots, so titles placed there never show up on this figure
    xaxis_title='x',
    yaxis_title='y',
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [32]:
# visualize vectorstore in 3D

# project the embedding vectors down to three dimensions
tsne = TSNE(n_components=3, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# hover label: document type plus the first 100 characters of the chunk
hover_text = [f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)]

# Create the 3D scatter plot
points = go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker={'size': 5, 'color': colors, 'opacity': 0.8},
    text=hover_text,
    hoverinfo='text',
)
fig = go.Figure(data=[points])

fig.update_layout(
    title='3D Vector Store Visualization',
    scene={'xaxis_title': 'x', 'yaxis_title': 'y', 'zaxis_title': 'z'},
    width=900,
    height=700,
    margin={'r': 20, 'b': 10, 'l': 10, 't': 40},
)

fig.show()

In [28]:
# create a new Chat with OpenAI, reusing the API key that was already read from the Colab
# secrets (the same `openai_api_key` the embeddings use) instead of fetching the secret again
llm = ChatOpenAI(temperature=0.5, model_name=MODEL, openai_api_key=openai_api_key)

# set up the conversation memory for the chat
# NOTE: this single memory object is attached to the chain below, so every call that goes
# through `conversation_chain` reads and extends the same history
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# the retriever is an abstraction over the VectorStore that will be used during RAG;
# it is configured with k=10 chunks per question
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})

# putting it together: set up the conversation chain with the LLM model, the vector store and memory
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)


Please see the migration guide at: https://python.langchain.com/docs/versions/migrating_memory/



In [29]:
def chat(question, history):
    """
    Answers one user question through the conversational retrieval chain.

    Args:
        question (str): the user's latest message
        history: conversation so far, as supplied by the chat UI. It is not used here;
            only the question is passed to `conversation_chain`.
            NOTE(review): the chain was built with one module-level memory object, so
            separate UI sessions presumably share a single history - confirm this is intended.

    Returns:
        the "answer" entry of the chain's result
    """
    return conversation_chain.invoke({"question": question})["answer"]

In [30]:
# build the Gradio chat UI around `chat`, then launch it to ask the Chatbot questions
# about Octopath Traveler 2; `view` holds whatever `launch()` returns
chat_ui = gr.ChatInterface(chat, type="messages")
view = chat_ui.launch(inbrowser=True)

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://306b90f4a01cceedbf.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
