In [2]:
!pip install pypdf
!pip install langchain langchain_community transformers chromadb openai

Collecting pypdf
  Downloading pypdf-4.2.0-py3-none-any.whl (290 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/290.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━[0m [32m194.6/290.4 kB[0m [31m5.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.4/290.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pypdf
Successfully installed pypdf-4.2.0
Collecting langchain
  Downloading langchain-0.2.5-py3-none-any.whl (974 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m974.6/974.6 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain_community
  Downloading langchain_community-0.2.5-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
Collecting chromadb
  Downloading chromadb-0.5.3-py3-none-any

In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline, BertTokenizer, BertModel
import os
import torch.nn as nn
import torch.optim as optim
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate
import chromadb
from langchain.chat_models import ChatOpenAI
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW
from chromadb.config import Settings
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction


In [46]:
class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=512):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        item = {key: val.squeeze() for key, val in encoding.items()}
        item['labels'] = torch.zeros(768)
        return item


In [47]:
class EmbeddingModel(torch.nn.Module):
    def __init__(self, model_name='bert-base-uncased'):
        super(EmbeddingModel, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        return outputs.last_hidden_state[:, 0, :]

    def encode(self, texts, tokenizer, max_length=512):
        self.eval()
        encodings = []
        with torch.no_grad():
            for text in texts:
                encoding = tokenizer(
                    text,
                    padding='max_length',
                    truncation=True,
                    max_length=max_length,
                    return_tensors='pt'
                )
                input_ids = encoding['input_ids'].to(device)
                attention_mask = encoding['attention_mask'].to(device)
                embedding = self.forward(input_ids, attention_mask)
                encodings.append(embedding.cpu().numpy().flatten())
        return encodings


In [14]:
class AnswerGenerationModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.model = GPT2LMHeadModel.from_pretrained('gpt2')

    def forward(self, input_ids):
        outputs = self.model(input_ids, labels=input_ids)
        loss = outputs.loss
        logits = outputs.logits
        return loss, logits

    def generate(self, **inputs):
        generated_ids = self.model.generate(**inputs)
        return generated_ids


In [15]:
def train_embedding_model(data_loader, model, optimizer, criterion, epochs=3):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in data_loader:
            optimizer.zero_grad()
            inputs = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(inputs, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        print(f'Epoch {epoch + 1}, Loss: {total_loss / len(data_loader)}')


In [16]:
def train_generation_model(data_loader, model, optimizer, epochs=3):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in data_loader:
            optimizer.zero_grad()
            inputs = batch['input_ids'].to(device)

            # Model forward pass
            loss, _ = model(inputs)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        print(f'Epoch {epoch + 1}, Loss: {total_loss / len(data_loader)}')


In [18]:
def save_model(model, path):
    torch.save(model.state_dict(), path)

def load_model(model, path):
    model.load_state_dict(torch.load(path))
    model.to(device).eval()


In [34]:
def retrive_topK(query=[], topk=5):
    client = chromadb.PersistentClient(path="/content/here")
    collection = client.get_or_create_collection(name="test")

    input_texts = [tokenizer(text, return_tensors='pt', padding=True, truncation=True) for text in query]
    query_embeddings = []
    with torch.no_grad():
        for inputs in input_texts:
            inputs = {k: v.to(device) for k, v in inputs.items() if k in ['input_ids', 'attention_mask']}
            vec = embedding_model(**inputs).cpu().numpy()
            query_embeddings.append(vec)

    query_embeddings = np.vstack(query_embeddings)
    results = collection.query(
        query_embeddings=query_embeddings,
        n_results=topk
    )

    # Flatten the list of lists
    context_texts = [doc for sublist in results['documents'] for doc in sublist]
    return context_texts


In [54]:
def generate_answer_from_context_and_query(prompt_template, model, tokenizer, context_text, query):
    input_text = prompt_template.format(context=context_text, question=query)
    inputs = tokenizer(input_text, return_tensors='pt', padding=True, truncation=True).to(device)
    outputs = model.generate(**inputs, max_length=512, temperature=0.7,  # Control the randomness
    top_p=0.9,  # Use nucleus sampling
    num_beams=5,  # Use beam search for better results
    repetition_penalty=1.2)  # Adjust max_length if necessary
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [44]:
from langchain.prompts import PromptTemplate
PROMPT_TEMPLATE ="""
<|system|>
You are a great AI which can retrive answer from a given context very concisely. If the answer is not explicitly present in the context, you need to provide a summarization of the points referencing the answer mentioned in the context.


<|user|>
Answer:
{context}

"""

prompt_template = PromptTemplate(input_variables=["context", "question"], template=PROMPT_TEMPLATE)




In [23]:
if __name__ == "__main__":
    DOC_PATH = "/content/text.pdf"
    CHROMA_PATH = "your_db_name"

    # Load the PDF document and split into chunks
    loader = PyPDFLoader(DOC_PATH)
    pages = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = text_splitter.split_documents(pages)
    texts = [chunk.page_content for chunk in chunks]

    # Initialize tokenizer and create dataset
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    dataset = TextDataset(texts, tokenizer)
    data_loader = DataLoader(dataset, batch_size=8, shuffle=True)

    # Initialize models
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    embedding_model = EmbeddingModel().to(device)
    generation_model = AnswerGenerationModel().to(device)

    # Train embedding model
    optimizer = AdamW(embedding_model.parameters(), lr=5e-5)
    criterion = torch.nn.MSELoss()
    train_embedding_model(data_loader, embedding_model, optimizer, criterion, epochs=5)
    save_model(embedding_model, 'embedding_model.pth')

    # Train generation model


Epoch 1, Loss: 0.08131159317087043
Epoch 2, Loss: 0.06897458230907266
Epoch 3, Loss: 0.06750861030410636
Epoch 4, Loss: 0.06618079221384092
Epoch 5, Loss: 0.06491351483220403


In [24]:
optimizer = AdamW(generation_model.parameters(), lr=5e-5)
train_generation_model(data_loader, generation_model, optimizer, epochs=5)
save_model(generation_model, 'generation_model.pth')

# Load the trained models
load_model(embedding_model, 'embedding_model.pth')
load_model(generation_model, 'generation_model.pth')

# Create Chroma client and index data
client = chromadb.PersistentClient(path="/content/here")
collection = client.get_or_create_collection(name="test")


Epoch 1, Loss: 1.767633551901037
Epoch 2, Loss: 1.3962603346867994
Epoch 3, Loss: 1.2982900007204576
Epoch 4, Loss: 1.254174917936325
Epoch 5, Loss: 1.2185919786041433


In [26]:
whole_script=[chunks[i].page_content for i in range(len(chunks))]

In [27]:
import numpy as np
inp = [{'text': text} for text in whole_script]
# input_texts = [tokenizer(text['text'], return_tensors='pt', padding=True, truncation=True) for text in inp]
input_texts1 = [tokenizer(text['text'], return_tensors='pt', padding=True, truncation=True) for text in inp]

vecs = []
with torch.no_grad():
    for inputs in input_texts1:
        inputs = {k: v.to(device) for k, v in inputs.items() if k in ['input_ids', 'attention_mask']}
        vec = embedding_model(**inputs).cpu().numpy()
        vecs.append(vec)


vecs = np.vstack(vecs)
ids = ["text_" + str(j + 1) for j in range(len(texts))]
collection.add(
    embeddings=vecs,
    documents=texts,
    ids=ids
)



In [55]:
context_texts = retrive_topK(["Explain the theme of the movie?"])
context_text = " ".join(context_texts) if context_texts else ""

query = "Explain the theme of the movie?"
generation_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Add padding token to the tokenizer
generation_tokenizer.pad_token = generation_tokenizer.eos_token

answer = generate_answer_from_context_and_query(prompt_template, generation_model, generation_tokenizer, context_text, query)
print(answer)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



<|system|>
You are a great AI which can retrive answer from a given context very concisely. If the answer is not explicitly present in the context, you need to provide a summarization of the points referencing the answer mentioned in the context.


<|user|>
Answer:
there anything unusual about how you found her?  To warrant an official investigation? 27. everything.The Pilotfish rises to take pictures of the farm. 6. musical.  Somewhere between performance art and programming. 57. me.JOIIt’s okay to dream a little, isn’t it?KNot for us. 44. Luv sits in a formal business meeting pose across ANOTHER WOMAN who sips tea across from her.  A HOLO PROJECTION of someone off-world.LUVYou can customize them as much as you’d like.  “As human as you want them to be.”  But the Placers is strictly a drill site, isn’t it? Off-world mining rock wants a strong back and an utter lack of self-sufficiency, I wouldn’t waste your money on intelligence or attachment or appeal.  Unless you’d like to add some

**Explain the theme of the movie?**

there anything unusual about how you found her?  To warrant an official investigation? 27. The scenes of death and decay, as well as the references to pain and fragility, suggest a contemplation of mortality and the human condition.Technology and its impact on society: The presence of advanced technology, such as replicants and the Spinner, suggests a commentary on the role of technology in society and its potential consequences. Deception and perception: The use of deception and misdirection, such as the false leads and hidden agendas, suggests a focus on deception and perception.  But the Placers is strictly a drill site, isn’t it? OJustice and morality: The conflict between Deckard and the replicants raises questions about justice and morality, particularly in a society where replicants are created to serve humans.
