In [None]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m67.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [None]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m225.3/232.6 kB[0m [31m10.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
!pip install PyPDF2



In [None]:
!pip install docx2txt

Collecting docx2txt
  Downloading docx2txt-0.9-py3-none-any.whl.metadata (529 bytes)
Downloading docx2txt-0.9-py3-none-any.whl (4.0 kB)
Installing collected packages: docx2txt
Successfully installed docx2txt-0.9


In [None]:
import os

# SECURITY: never store the API key in the notebook. A key that was ever committed
# here must be treated as leaked and revoked in the Google AI Studio / Cloud console.
# Prompt for the key only when it is not already provided through the environment.
if not os.environ.get("GOOGLE_API_KEY"):
    from getpass import getpass

    os.environ["GOOGLE_API_KEY"] = getpass("Enter your Google API key: ")

In [None]:
import faiss
from PyPDF2 import PdfReader
import docx2txt
from sentence_transformers import SentenceTransformer
import google.generativeai as genai
from dotenv import load_dotenv

def load_text_from_file(file_path):
    """Extract the plain text of a document on disk.

    Supported formats (the extension is matched case-insensitively):
      * ``.pdf``  - read with ``PdfReader``; each page's text is followed by a newline.
      * ``.docx`` - read with ``docx2txt.process``.

    Args:
        file_path: Path of the file to read, as a string.

    Returns:
        The extracted text as a single string.

    Raises:
        ValueError: If the file extension is not one of the supported formats.
    """
    lowered = file_path.lower()

    if lowered.endswith(".pdf"):
        reader = PdfReader(file_path)
        # `or ''` keeps a page without extractable text from breaking the concatenation.
        return "".join(f"{page.extract_text() or ''}\n" for page in reader.pages)

    if lowered.endswith(".docx"):
        return docx2txt.process(file_path)

    raise ValueError(f"Unsupported file type: {file_path}")

def split_text(text, chunk_size=1000, overlap=200):
    """Split ``text`` into fixed-size character chunks that overlap.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so each
    chunk shares its first ``overlap`` characters with the end of the previous one.
    Splitting stops as soon as a chunk reaches the end of the text, so no chunk is
    merely a suffix of the one before it.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings; empty when ``text`` is empty.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without this
            check an ``overlap >= chunk_size`` would never advance the window.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap}, chunk_size={chunk_size}"
        )

    step = chunk_size - overlap
    total = len(text)
    chunks = []
    for start in range(0, total, step):
        chunks.append(text[start:start + chunk_size])
        # This chunk already reaches the end of the text; a further window would
        # only repeat characters that are fully contained in it.
        if start + chunk_size >= total:
            break
    return chunks


def build_faiss_index(chunks, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Embed text chunks and store the vectors in a FAISS inner-product index.

    Args:
        chunks: Sequence of text chunks to embed.
        model_name: Name passed to ``SentenceTransformer`` to load the encoder.

    Returns:
        A ``(index, embedding_model)`` tuple: the populated ``faiss.IndexFlatIP``
        and the encoder that produced the vectors, so queries can be embedded
        with the same model.
    """
    encoder = SentenceTransformer(model_name)

    # Embeddings are requested normalized and searched by inner product.
    vectors = encoder.encode(
        chunks,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )

    dimension = vectors.shape[1]
    vector_store = faiss.IndexFlatIP(dimension)
    vector_store.add(vectors)

    return vector_store, encoder


def rag_query(question, index, documents, embedding_model, top_k=3, llm_name="gemini-1.5-flash"):
    """Answer ``question`` with retrieval-augmented generation.

    The question is embedded, its nearest chunks are looked up in ``index``, and
    those chunks are supplied as context in the prompt sent to the Gemini model.

    Args:
        question: The user's question.
        index: Search index whose ``search(vectors, k=...)`` returns
            ``(distances, ids)``; ids are positions into ``documents``.
        documents: The chunk texts, in the same order they were added to ``index``.
        embedding_model: Encoder used to embed the question; it should be the one
            that produced the indexed vectors.
        top_k: Maximum number of chunks to retrieve.
        llm_name: Name of the generative model to query.

    Returns:
        The generated answer text.
    """
    q_embedding = embedding_model.encode([question], convert_to_numpy=True, normalize_embeddings=True)
    _, neighbor_ids = index.search(q_embedding, k=top_k)

    # FAISS pads the id list with -1 when fewer than ``top_k`` vectors are available.
    # Indexing ``documents`` with -1 would silently pull in the last chunk, so only
    # ids that address a real chunk are kept.
    retrieved_chunks = [
        documents[int(i)] for i in neighbor_ids[0] if 0 <= int(i) < len(documents)
    ]

    context = "\n".join(retrieved_chunks)
    prompt = f"Context:\n{context}\n\nQuestion: {question}\nAnswer:"
    model = genai.GenerativeModel(llm_name)
    response = model.generate_content(prompt)

    return response.text


if __name__ == "__main__":

    # Documents to index. NOTE(review): these look like Colab upload paths; adjust
    # them when running elsewhere.
    files = [
        "/content/2307.06435v10.pdf",
        "/content/NIPS-2017-attention-is-all-you-need-Paper.pdf"
    ]

    # Load and chunk every file; a missing or unsupported file is reported and
    # skipped so the remaining documents can still be indexed.
    all_chunks = []
    for f in files:
        try:
            text = load_text_from_file(f)
            chunks = split_text(text)
            all_chunks.extend(chunks)
        except FileNotFoundError:
            print(f"Error: File not found at {f}. Skipping.")
        except ValueError as e:
            print(f"Error processing file {f}: {e}. Skipping.")

    if all_chunks:
        index, embedding_model = build_faiss_index(all_chunks)
        print("Documents processed and FAISS index built!")

        # Interactive question loop; runs until the user types 'exit'.
        while True:
            # Strip surrounding whitespace so inputs such as "exit " still quit.
            question = input("\nAsk a question about your documents (or type 'exit' to quit): ").strip()
            if not question:
                # Do not spend an LLM call on an empty question.
                continue
            if question.lower() == "exit":
                break
            try:
                answer = rag_query(question, index, all_chunks, embedding_model)
                print("\n--- Answer ---")
                print(answer)
            except Exception as e:
                # Best-effort: report the failure and keep the session alive.
                print(f"Error: {e}")
    else:
        print("No documents were successfully processed. Cannot build FAISS index.")

Documents processed and FAISS index built!

Ask a question about your documents (or type 'exit' to quit): write about attention is all you need

--- Answer ---
"Attention is All You Need" is a landmark paper that introduced the Transformer architecture, a novel neural network architecture based solely on attention mechanisms, dispensing with recurrence and convolutions entirely.  This was a significant departure from previous sequence-to-sequence models which relied heavily on recurrent neural networks (RNNs) like LSTMs and GRUs.  The key innovation was the use of **self-attention**, allowing the model to weigh the importance of different parts of the input sequence when processing each element.

The paper demonstrated that self-attention could achieve state-of-the-art results on machine translation tasks, surpassing previous RNN-based models in both speed and quality.  This was due to several factors:

* **Parallelization:** Unlike RNNs, which process sequences sequentially, self-atte