In [2]:
import os
import requests
import zipfile
from io import BytesIO
import textwrap

def download_and_extract_zip(url, target_folder):

    # Download the file from the URL
    response = requests.get(url)
    if response.status_code != 200:
        raise Exception(f"Failed to download file: {url}")

    # Unzip the file in memory
    with zipfile.ZipFile(BytesIO(response.content)) as zip_ref:
        zip_ref.extractall(target_folder)

    print(f"Files extracted to {target_folder}")

In [3]:
# URL of the zip file
url = "https://www.dropbox.com/scl/fi/av3nw07o5mo29cjokyp41/singapore_text_files_languages.zip?rlkey=xqdy5f1modtbnrzzga9024jyw&dl=1" # Ensure dl=1 for direct download

# Folder to save extracted files
folder = "singapore_text"

# Call the function
download_and_extract_zip(url, folder)

Files extracted to singapore_text


In [5]:

from langchain.embeddings import GooglePalmEmbeddings
from langchain.llms import GooglePalm
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.chroma import Chroma
import langchain
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader

os.environ['GOOGLE_API_KEY'] =  ''

In [8]:
loader = DirectoryLoader('singapore_text/Textfiles3/English/', glob="*.txt", show_progress=True)
docs = loader.load()


  0%|                                                   | 0/646 [00:00<?, ?it/s][A[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/saptarshimallikthakur/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.

  0%|                                         | 1/646 [00:24<4:26:11, 24.76s/it][A
  0%|▏                                        | 2/646 [00:24<1:50:31, 10.30s/it][A
  2%|▋                                         | 10/646 [00:25<14:15,  1.34s/it][A
  2%|▉                                         | 15/646 [00:25<08:02,  1.31it/s][A
  4%|█▌                                        | 24/646 [00:25<03:47,  2.73it/s][A
  5%|██▎                                       | 35/646 [00:25<01:59,  5.12it/s][A
  7%|██▊                                       | 44/646 [00:25<01:23,  7.18it/s][A
  8%|███▌                                      | 54/646 [00:25<00:54, 10.80it/s][A
 10%|████                                      | 63/646 [00

In [29]:
raw_text = ''
for i, doc in enumerate(docs):
    text = doc.page_content
    if text:
        raw_text += text
        
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 500,
    chunk_overlap  = 100,
    length_function = len,
    is_separator_regex = False,
)

texts = text_splitter.split_text(raw_text)

In [25]:
from langchain.embeddings import HuggingFaceBgeEmbeddings

model_name = "BAAI/bge-small-en-v1.5"
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity

embedding_function = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    encode_kwargs=encode_kwargs,

)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/93.0k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [32]:
db = Chroma.from_texts(texts,
                       embedding_function)
                    #   persist_directory="chroma_db_rag_fusion")



### load from disk
#db = Chroma(persist_directory="chroma_db_rag_fusion", embedding_function=embedding_function)

In [34]:
query = "Tell me about Universal Studios Singapore?"

db.similarity_search(query, k=5)

[Document(page_content='Universal Studios Singapore'),
 Document(page_content='Native. 52A Amoy Street, Singapore 069878. +65 8869 6520.\n\nMon\n\nSat 6pm\n\n12am.\n\nFor night owls with supper cravingsLink: https://www.visitsingapore.com/see\n\ndo\n\nsingapore/recreation\n\nleisure/fun\n\nthings\n\nto\n\ndo/universal\n\nstudios\n\nsingapore/\n\nTitle: Universal Studios Singapore\n\nThe shimmering wonders of the silver screen comes to vivid life at Universal Studios Singapore, the first-ever Hollywood movie theme park in Southeast Asia.'),
 Document(page_content='Universal Studios Singapore is a popular theme park that offers thrilling rides and entertainment for all ages. You can buy Universal Studios Singapore tickets and enjoy an unbeatable discount of 40% from MySingaporePass. The pass provides a hassle-free booking process, allowing you to skip the long queues and gain easy access to the park. Enjoy exhilarating roller coasters, live shows, and immersive themed zones inspired by y

In [36]:
from operator import itemgetter

from langchain.chat_models import ChatGooglePalm

from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough

In [37]:
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

model = ChatGooglePalm()


chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [39]:
text_reply = chain.invoke("Tell me about Universal Studio Singapore")

print(text_reply)

Universal Studios Singapore is a theme park located at Resorts World Sentosa on Sentosa Island, Singapore. It is the first Universal Studios theme park in Southeast Asia and the second Universal Studios theme park in Asia after Universal Studios Japan. The park opened on 18 September 2010.

The park is divided into seven themed zones: Hollywood, New York, Sci-Fi City, Ancient Egypt, The Lost World, Far Far Away, and Madagascar. Each zone is home to a variety of rides, shows, and attractions based on popular movies and television shows.

Some of the most popular rides at Universal Studios Singapore include:

* The Mummy Ride: A roller coaster that takes guests on a journey through the tomb of Imhotep.
* Transformers The Ride: 3D Battle: A 3D motion simulator ride that puts guests in the middle of a battle between the Autobots and the Decepticons.
* Jurassic Park Rapids Adventure: A water ride that takes guests on a journey through the Jurassic Park lagoon.
* Shrek 4-D Adventure: A 4D mo