In [1]:
import os

import requests
import streamlit as st
from datasets import Dataset, Features, Value, Sequence
from sentence_transformers import SentenceTransformer
from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration

# Function to read text file
def read_text_file(file_path):
    """Return the full contents of a UTF-8 encoded text file.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded contents of the file as a single string.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

# Function to split text into chunks
def split_text_into_chunks(text, chunk_size=512):
    """Split text into period-delimited chunks of roughly ``chunk_size`` characters.

    The text is cut at every '.' and consecutive fragments are packed into the
    current chunk for as long as ``len(chunk) + len(fragment)`` stays below
    ``chunk_size``. A single fragment longer than ``chunk_size`` is never cut;
    it becomes a chunk of its own.

    Args:
        text: The text to split.
        chunk_size: Soft upper bound on the length of a chunk, in characters.

    Returns:
        A list of non-empty, whitespace-stripped chunks. No period is added
        that was not present in ``text``; empty input yields an empty list.
    """
    fragments = text.split('.')
    last_index = len(fragments) - 1
    chunks = []
    current_chunk = ""

    for index, fragment in enumerate(fragments):
        # Re-attach the period that split() consumed. The final fragment was
        # not followed by one, so adding it there would invent punctuation
        # (a doubled ".." at the end, or a lone "." for empty input).
        piece = fragment if index == last_index else fragment + "."
        if not piece:
            continue

        if len(current_chunk) + len(fragment) < chunk_size:
            current_chunk += piece
            continue

        # The current chunk is full. It can still be blank here when the very
        # first fragment is already too long, so never emit an empty chunk.
        finished = current_chunk.strip()
        if finished:
            chunks.append(finished)
        current_chunk = piece

    finished = current_chunk.strip()
    if finished:
        chunks.append(finished)
    return chunks

# Download the text file (Pride and Prejudice by Jane Austen)
url = "https://www.gutenberg.org/files/1342/1342-0.txt"
txt_path = "pride_and_prejudice.txt"

# This script may be executed many times (a Streamlit app is re-run on user
# interaction), so reuse an earlier download instead of fetching the book again.
if not os.path.isfile(txt_path):
    # A timeout keeps a stalled connection from hanging the app forever.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx rather than saving an error page as the book.
    response.raise_for_status()
    with open(txt_path, "wb") as f:
        f.write(response.content)

# Locations of the passage dataset and its FAISS index on disk
dataset_path = "dataset"
index_path = "index"

# Build the knowledge base only when it is not on disk yet. Re-running this
# section used to re-embed the whole book and overwrite the saved dataset.
# NOTE(review): the "OSError: [Errno 22] Invalid argument ... .arrow" seen on
# Windows looks like save_to_disk() rewriting an Arrow file that an earlier
# run (same kernel / app process) still has open -- confirm. Not overwriting
# existing artifacts avoids that situation.
# Delete both `dataset_path` and `index_path` to force a rebuild.
if not (os.path.isdir(dataset_path) and os.path.isfile(index_path)):
    # Extract and preprocess text
    raw_text = read_text_file(txt_path)
    chunks = split_text_into_chunks(raw_text)

    # Load a pre-trained sentence transformer model
    # NOTE(review): all-MiniLM-L6-v2 presumably yields 384-dimensional vectors,
    # while the question encoder of facebook/rag-token-nq is expected to emit
    # 768-dimensional ones; if so, retrieval against this index will fail and
    # the passages need to be embedded with a matching encoder -- confirm.
    model_name = 'all-MiniLM-L6-v2'
    embedder = SentenceTransformer(model_name)

    # Generate embeddings for the chunks
    embeddings = embedder.encode(chunks, convert_to_tensor=True)

    # Create a dataset from the chunks and embeddings
    data = {
        "title": [f"Chunk {i}" for i in range(len(chunks))],
        "text": chunks,
        "embeddings": embeddings.tolist(),
    }
    features = Features({
        "title": Value("string"),
        "text": Value("string"),
        "embeddings": Sequence(Value("float32")),
    })
    dataset = Dataset.from_dict(data, features=features)

    # Save the dataset and its index to disk. Order matters: the dataset is
    # written first, then the index is attached and stored separately.
    dataset.save_to_disk(dataset_path)
    dataset.add_faiss_index(column="embeddings")
    dataset.get_index("embeddings").save(index_path)

# Initialize the tokenizer, retriever, and model
tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")

# The retriever serves passages from the custom dataset and FAISS index
# stored at `dataset_path` / `index_path`.
retriever = RagRetriever.from_pretrained(
    "facebook/rag-token-nq",
    index_name="custom",
    passages_path=dataset_path,
    index_path=index_path,
)

# The generator has to be handed the retriever explicitly: without it,
# model.generate(input_ids) has nothing to fetch context passages with.
model = RagTokenForGeneration.from_pretrained(
    "facebook/rag-token-nq",
    retriever=retriever,
)

# Function to get answers using the RAG model
def get_answer(query):
    """Generate an answer to ``query`` with the module-level RAG model.

    Args:
        query: The question to answer, as plain text.

    Returns:
        The first decoded sequence produced by ``model.generate``, with
        special tokens removed.
    """
    encoded = tokenizer(query, return_tensors="pt")
    generated = model.generate(
        encoded["input_ids"],
        num_beams=5,
        num_return_sequences=1,
    )
    decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
    return decoded[0]

# Streamlit app
st.title("RAG-powered Chatbot")
st.write("Ask me anything about Pride and Prejudice!")

query = st.text_input("Your question:")
# Skip empty and whitespace-only input: a blank "question" would otherwise
# trigger a full (and slow) generation pass for nothing.
if query and query.strip():
    answer = get_answer(query)
    st.write(f"Answer: {answer}")



Saving the dataset (0/1 shards):   0%|          | 0/1760 [00:00<?, ? examples/s]

OSError: [Errno 22] Invalid argument: 'c:/Users/Varad Acharya/Downloads/RAG/dataset/data-00000-of-00001.arrow'