In [28]:

from transformers import AutoTokenizer, GPT2LMHeadModel


In [35]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

In [2]:
pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.4.0-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.4.0-py3-none-any.whl (275 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.4.0


In [12]:
import PyPDF2

In [11]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [3]:
import os
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch



In [18]:
def load_corpus(file_path):
    corpus = []
    doc_names = []
    if file_path.endswith(".txt"):
        with open(file_path, 'r', encoding='utf-8') as f:
            corpus.append(f.read())
            doc_names.append(os.path.basename(file_path))
    elif file_path.endswith(".pdf"):
        with open(file_path, 'rb') as f:
            reader = PyPDF2.PdfReader(f)
            text = "".join(page.extract_text() for page in reader.pages)
            corpus.append(text)
            doc_names.append(os.path.basename(file_path))
    else:
        raise ValueError("Unsupported file format. Please provide a .txt or .pdf file.")
    return corpus, doc_names

In [19]:
class EmbeddingSearch:
    def __init__(self, model_name="all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)
        self.doc_embeddings = None

    def fit(self, corpus):
      self.corpus = []
      self.doc_embeddings = []
      for document in corpus:
          chunks = [document[i:i+512] for i in range(0, len(document), 512)]  # Split into chunks of 512 tokens
          self.corpus.extend(chunks)
          chunk_embeddings = self.model.encode(chunks, convert_to_tensor=True)
          self.doc_embeddings.append(chunk_embeddings)
      self.doc_embeddings = torch.cat(self.doc_embeddings)


    def search(self, query, top_k=5):
      query = query[:512]
      query_embedding = self.model.encode(query, convert_to_tensor=True)


      similarities = cosine_similarity(query_embedding.unsqueeze(0), self.doc_embeddings)[0]
      top_indices = np.argsort(similarities)[::-1][:top_k]

      return [(self.corpus[idx], similarities[idx]) for idx in top_indices]


In [40]:
import torch
from transformers import pipeline

class EmbeddingSearch:
    def fit(self, corpus):
        # Dummy implementation for embedding search setup
        self.corpus = corpus

    def search(self, query, top_k=3):
        # Dummy implementation for searching relevant documents
        results = [(doc, 1.0 - i * 0.1) for i, doc in enumerate(self.corpus[:top_k])]
        return results

def load_corpus(file_path):
    if file_path.endswith(".txt"):
        with open(file_path, "r") as file:
            text = file.read()
        return text.split("\n\n"), ["Document"]
    elif file_path.endswith(".pdf"):
        try:
            from PyPDF2 import PdfReader
            reader = PdfReader(file_path)
            text = "\n".join(page.extract_text() for page in reader.pages)
            return text.split("\n\n"), ["Document"]
        except ImportError:
            raise Exception("Install PyPDF2 to process PDF files.")
    else:
        raise Exception("Unsupported file format. Use .txt or .pdf files.")

def fine_tune_transformer(model_name):
    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    return tokenizer, model

def generate_response(gpt_pipeline, query, relevant_docs):
    # Check if relevant_docs is valid
    if not relevant_docs or not isinstance(relevant_docs[0], (list, tuple)):
        print("No relevant documents found or incorrect structure.")
        return "I couldn't find relevant information to answer your query."

    # Extract the context
    context = relevant_docs[0][0]
    if not isinstance(context, str):
        context = str(context)

    # Truncate the context to a reasonable length
    context = context[:512]

    # Construct the prompt
    prompt = f"Context: {context}\n\nQuestion: {query}\n\nAnswer:"

    try:
        response = gpt_pipeline(prompt, max_new_tokens=100, num_return_sequences=1)
        if isinstance(response, list) and "generated_text" in response[0]:
            return response[0]["generated_text"]
        else:
            return "I couldn't generate a valid response. Please try again."
    except Exception as e:
        print(f"Error generating response: {e}")
        return "An error occurred while generating the response."

if __name__ == "__main__":
    # File path for the document
    file_path = input("Enter the path to your document (txt or pdf): ").strip()

    # Load corpus
    print("Loading document...")
    try:
        corpus, doc_names = load_corpus(file_path)
    except Exception as e:
        print(f"Error: {e}")
        exit()

    # Initialize embedding-based search
    print("Generating embeddings...")
    embed_search = EmbeddingSearch()
    embed_search.fit(corpus)

    # Initialize GPT model pipeline
    print("Loading GPT model...")
    tokenizer, model = fine_tune_transformer("gpt2")
    gpt_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)

    while True:
        print("\nEnter your search query (or 'exit' to quit):")
        query = input().strip()
        if query.lower() == 'exit':
            break

        # Search for relevant documents
        print("Searching for relevant content...")
        relevant_docs = embed_search.search(query, top_k=3)

        print("\nRelevant Content:")
        for i, (doc, score) in enumerate(relevant_docs):
            print(f"{i+1}. (Score: {score:.2f})\n{doc[:200]}...")

        # Generate a response using GPT
        print("\nGenerating response using GPT...")
        response = generate_response(gpt_pipeline, query, relevant_docs)
        print(f"\nResponse:\n{response}")


Loading document...
Generating embeddings...
Loading GPT model...


Device set to use cpu



Enter your search query (or 'exit' to quit):


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Searching for relevant content...

Relevant Content:
1. (Score: 1.00)
See discussions, st ats, and author pr ofiles f or this public ation at : https://www .researchgate.ne t/public ation/344717762
Machine Learning Algorithms -A Review
Technic al R eport    in  Internat...

Generating response using GPT...

Response:
Context: See discussions, st ats, and author pr ofiles f or this public ation at : https://www .researchgate.ne t/public ation/344717762
Machine Learning Algorithms -A Review
Technic al R eport    in  International Journal of Scienc e and R esearch (IJSR)  · Januar y 2019
DOI: 10.21275/ ART20203995
CITATIONS
1,333READS
410,174
1 author:
Batt a Mahesh
Independent R esearcher
5 PUBLICA TIONS    1,341  CITATIONS    
SEE PROFILE
All c ontent f ollo wing this p age was uplo aded b y Batt a Mahesh  on 17 Oct ober 2020.
Th

Question: What is Machine Learning?

Answer: Machine learning refers to what I shall called the computer process to learn new skills, not the process of memor

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Searching for relevant content...

Relevant Content:
1. (Score: 1.00)
See discussions, st ats, and author pr ofiles f or this public ation at : https://www .researchgate.ne t/public ation/344717762
Machine Learning Algorithms -A Review
Technic al R eport    in  Internat...

Generating response using GPT...

Response:
Context: See discussions, st ats, and author pr ofiles f or this public ation at : https://www .researchgate.ne t/public ation/344717762
Machine Learning Algorithms -A Review
Technic al R eport    in  International Journal of Scienc e and R esearch (IJSR)  · Januar y 2019
DOI: 10.21275/ ART20203995
CITATIONS
1,333READS
410,174
1 author:
Batt a Mahesh
Independent R esearcher
5 PUBLICA TIONS    1,341  CITATIONS    
SEE PROFILE
All c ontent f ollo wing this p age was uplo aded b y Batt a Mahesh  on 17 Oct ober 2020.
Th

Question: what the document is talking?

Answer: The I N O T N C C R O E M E L E T A S O R D S C R O N A '

What is I C C R e r i n g?

Answer: What is i c l 

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Searching for relevant content...

Relevant Content:
1. (Score: 1.00)
See discussions, st ats, and author pr ofiles f or this public ation at : https://www .researchgate.ne t/public ation/344717762
Machine Learning Algorithms -A Review
Technic al R eport    in  Internat...

Generating response using GPT...

Response:
Context: See discussions, st ats, and author pr ofiles f or this public ation at : https://www .researchgate.ne t/public ation/344717762
Machine Learning Algorithms -A Review
Technic al R eport    in  International Journal of Scienc e and R esearch (IJSR)  · Januar y 2019
DOI: 10.21275/ ART20203995
CITATIONS
1,333READS
410,174
1 author:
Batt a Mahesh
Independent R esearcher
5 PUBLICA TIONS    1,341  CITATIONS    
SEE PROFILE
All c ontent f ollo wing this p age was uplo aded b y Batt a Mahesh  on 17 Oct ober 2020.
Th

Question: quit

Answer:

1.1 We are making use of a type of machine learning system which is being used by students for their studies in R and computer scienc

In [38]:
pip install gpt4all

Collecting gpt4all
  Downloading gpt4all-2.8.2-py3-none-manylinux1_x86_64.whl.metadata (4.8 kB)
Downloading gpt4all-2.8.2-py3-none-manylinux1_x86_64.whl (121.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.6/121.6 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gpt4all
Successfully installed gpt4all-2.8.2
