In [244]:
from transformers import BertModel, BertTokenizer, BertConfig

model_dir = "bert"

config_directory = "bert/config.json"

tokenizer = BertTokenizer.from_pretrained(model_dir)

model = BertModel.from_pretrained(model_dir)

config = BertConfig.from_json_file(config_directory)

In [327]:
from langchain_community.document_loaders import TextLoader
PDF = "contract.txt"
loader = TextLoader(PDF)

In [328]:
output = loader.load()

In [329]:
print(output)

[Document(metadata={'source': 'contract.txt'}, page_content="Contract for Catering Services\nThe starting date of the contract is June 24, 2024.\nThe ending date of the contract is June 24, 2025.\nThe employer of the contract is HealthCare Solutions.\nThe vendor of the contract is Gourmet Delights Catering.\nThe department involved in the contract is the Events Department.\nThe service provided by Gourmet Delights Catering is menu planning for corporate events.\nThe service provided by Gourmet Delights Catering will prepare and serve food for corporate events.\nThe service provided by Gourmet Delights Catering will set up and clean up after corporate events.\nThe service provided by Gourmet Delights Catering will provide beverage service for corporate events.\nThe payment details are that HealthCare Solutions will pay Gourmet Delights Catering $10,000 per event.\nThe payment details are that Payments must be made within 10 days after each event.\nThe terms and conditions are that Gourm

In [330]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

class SentenceTextSplitter:
    def __init__(self, chunk_size, chunk_overlap):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def split_text(self, text):
        sentences = sent_tokenize(text)
        chunks = []
        current_chunk = []
        current_length = 0

        for sentence in sentences:
            sentence_length = len(sentence)
            if current_length + sentence_length > self.chunk_size:
                chunks.append(" ".join(current_chunk))
                current_chunk = current_chunk[-self.chunk_overlap:]
                current_length = sum(len(sent) for sent in current_chunk)
            current_chunk.append(sentence)
            current_length += sentence_length

        if current_chunk:
            chunks.append(" ".join(current_chunk))
        
        return chunks

splitter = SentenceTextSplitter(chunk_size=100, chunk_overlap=30)
chunks = splitter.split_text(output[0].page_content)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kevla\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [331]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100,chunk_overlap=30,separators=["\n\n","\n", "."])
splitted_text = text_splitter.split_documents(documents=output)
print(len(splitted_text))

19


In [332]:
print(splitted_text)

[Document(metadata={'source': 'contract.txt'}, page_content='Contract for Catering Services\nThe starting date of the contract is June 24, 2024.'), Document(metadata={'source': 'contract.txt'}, page_content='The ending date of the contract is June 24, 2025.'), Document(metadata={'source': 'contract.txt'}, page_content='The employer of the contract is HealthCare Solutions.'), Document(metadata={'source': 'contract.txt'}, page_content='The vendor of the contract is Gourmet Delights Catering.'), Document(metadata={'source': 'contract.txt'}, page_content='The department involved in the contract is the Events Department.'), Document(metadata={'source': 'contract.txt'}, page_content='The service provided by Gourmet Delights Catering is menu planning for corporate events.'), Document(metadata={'source': 'contract.txt'}, page_content='The service provided by Gourmet Delights Catering will prepare and serve food for corporate events.'), Document(metadata={'source': 'contract.txt'}, page_content

In [333]:
import torch
# Function to compute embeddings using BertModel
def compute_bert_embeddings(text):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    last_hidden_state = outputs.last_hidden_state
    embeddings = torch.mean(last_hidden_state, dim=1).squeeze().numpy()
    return embeddings

In [334]:
import faiss
import pickle
import numpy as np
def store_embeddings_in_faiss(splitted_text, index_filename):
    embedding_dim = 768  # Bert base model output size
    index = faiss.IndexFlatL2(embedding_dim)
    
    docstore = {}
    index_to_docstore_id = {}
    idx = 0
    
    for chunk in splitted_text:
        text = chunk.page_content
        embeddings = compute_bert_embeddings(text)
        embeddings = np.array(embeddings, dtype=np.float32).reshape(1, -1)
        index.add(embeddings)
        
        # Assuming chunk has a unique identifier like doc_id
        doc_id = f"doc_{idx}"
        docstore[doc_id] = {
            'text': text,
            'source': chunk.metadata['source']
        }
        index_to_docstore_id[idx] = doc_id
        idx += 1
    
    # Save FAISS index
    faiss.write_index(index, index_filename + ".faiss")
    
    # Save associated metadata (docstore and index_to_docstore_id)
    metadata = {
        'docstore': docstore,
        'index_to_docstore_id': index_to_docstore_id
    }
    with open(index_filename + ".pkl", "wb") as f:
        pickle.dump(metadata, f)


In [335]:
index_filename = "index"
store_embeddings_in_faiss(splitted_text, index_filename)

In [336]:
vector_store_directory = "index.faiss"
vector_store = faiss.read_index(vector_store_directory)
print(vector_store)

<faiss.swigfaiss.IndexFlat; proxy of <Swig Object of type 'faiss::IndexFlat *' at 0x00000212CA2032A0> >


In [337]:
class CustomRetriever:
    def __init__(self, index_filename):
        self.index = faiss.read_index(index_filename)

    def retrieve_documents(self, query, top_k=3):
        # Implement your retrieval logic here
        # Example using FAISS:
        embeddings = compute_bert_embeddings(query)  # Implement compute_bert_embeddings
        embeddings = np.array(embeddings, dtype=np.float32).reshape(1, -1)
        distances, indices = self.index.search(embeddings, top_k)
        print(distances, indices)
        reversed_indices = indices[0][::-1] 
        print(reversed_indices)
        reversed_indices_2d = np.expand_dims(reversed_indices, axis=0)
        return reversed_indices_2d

In [2]:
from transformers import AutoTokenizer, BertForQuestionAnswering

qna_tokenizer = AutoTokenizer.from_pretrained("./bert-for-qna")
qna_model = BertForQuestionAnswering.from_pretrained("./bert-for-qna")

In [339]:
import re
class Generator:
    def __init__(self, tokenizer, model, device="cpu"):
        self.tokenizer = tokenizer
        self.model = model
        self.device = device

    def generate(self, retrieved_documents, query):
        contexts = [doc.page_content for doc in retrieved_documents]
        combined_context = " ".join(contexts)

        combined_context = re.sub(r'\n\s*\n', '\n\n', combined_context).strip()

        question = query
        text = combined_context
        
        print(text)
        inputs = self.tokenizer(question, text, return_tensors="pt")

        with torch.no_grad():
            outputs = self.model(**inputs)

        # Get the predicted start and end indices for the answer
        answer_start_index = outputs.start_logits.argmax()
        answer_end_index = outputs.end_logits.argmax()

        # Extract the tokens corresponding to the predicted answer span
        predicted_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]

        # Decode the tokens to get the predicted answer text
        predicted_answer = self.tokenizer.decode(predicted_answer_tokens, skip_special_tokens=True)
        print("Predicted Answer:", predicted_answer)

        # Define the target answer indices (for loss calculation)
        target_start_index = torch.tensor([14])  # Modify as per your target answer start index
        target_end_index = torch.tensor([15])    # Modify as per your target answer end index

        # Compute the outputs including the loss
        outputs = self.model(**inputs, start_positions=target_start_index, end_positions=target_end_index)
        loss = outputs.loss
        print("Loss:", round(loss.item(), 2))

        # For seeing the actual target text
        target_answer_tokens = inputs.input_ids[0, target_start_index : target_end_index + 1]
        target_answer = self.tokenizer.decode(target_answer_tokens, skip_special_tokens=True)
        print("Target Answer:", target_answer)

In [389]:
index_filename = "index.faiss"
custom_retriever = CustomRetriever(index_filename)
query = "When is the contract starting?"
top_k_docs = custom_retriever.retrieve_documents(query)
print("Top-k document indices:", top_k_docs)

[[59.90132  64.599075 65.593506]] [[ 0  2 12]]
[12  2  0]
Top-k document indices: [[12  2  0]]


In [390]:
retrieved_documents = [splitted_text[i] for i in top_k_docs[0]]
print(retrieved_documents)

[Document(metadata={'source': 'contract.txt'}, page_content='The payment details are that Payments must be made within 10 days after each event.'), Document(metadata={'source': 'contract.txt'}, page_content='The employer of the contract is HealthCare Solutions.'), Document(metadata={'source': 'contract.txt'}, page_content='Contract for Catering Services\nThe starting date of the contract is June 24, 2024.')]


In [391]:
generator = Generator(qna_tokenizer,qna_model)

In [392]:
generator.generate(retrieved_documents=retrieved_documents,query=query)

The payment details are that Payments must be made within 10 days after each event. The employer of the contract is HealthCare Solutions. Contract for Catering Services
The starting date of the contract is June 24, 2024.
Predicted Answer: June 24, 2024
Loss: 15.68
Target Answer: ##yment
