In [1]:
pip install nltk sentence-transformers pymilvus rank-bm25 transformers streamlit numpy grpcio


Note: you may need to restart the kernel to use updated packages.


In [30]:
!pip install torch



In [31]:
import torch

In [2]:
import nltk
import numpy as np
from sentence_transformers import SentenceTransformer, util

# Load a pre-trained SentenceTransformer model
nltk.download('punkt')
model = SentenceTransformer('all-MiniLM-L6-v2')

def chunk_data(text):
    sentences = nltk.sent_tokenize(text)
    embeddings = model.encode(sentences, convert_to_tensor=True)

    # Group sentences based on semantic similarity
    clusters = util.community_detection(embeddings, min_community_size=2)

    chunks = []
    for cluster in clusters:
        chunk = " ".join([sentences[idx] for idx in cluster])
        chunks.append(chunk)

    return chunks

# Example usage
# with open('merged_texts.txt', 'r',encoding="utf8") as file:
#     text = file.read()
# chunks = chunk_data(text)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shubh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
import pickle
with open('chunks.pkl', 'rb') as f:
    chunks = pickle.load(f)

In [9]:
from pymilvus import connections, utility, FieldSchema, CollectionSchema, DataType, Collection

# Connect to Milvus
if "default" in connections.list_connections():
    connections.disconnect("default")
print(connections.list_connections())
connections.connect("new_connection", host='127.0.0.1', port='19530')

# Define collection schema
fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=384),
    FieldSchema(name="metadata", dtype=DataType.VARCHAR, max_length=255)  # Use VarChar instead of String
]
schema = CollectionSchema(fields, description="chunk collection")

# Create collection
collection = Collection("chunks", schema)

# Insert data
embeddings = model.encode(chunks)
metadata = ["chunk metadata" for _ in chunks]
entities = [
    [i for i in range(len(chunks))],
    embeddings,
    metadata
]
collection.insert(entities)

[('default', <pymilvus.client.grpc_handler.GrpcHandler object at 0x00000261AB8A4E50>), ('new_connection', <pymilvus.client.grpc_handler.GrpcHandler object at 0x00000261C89575E0>)]


Batches:   0%|          | 0/277 [00:00<?, ?it/s]

(insert count: 8853, delete count: 0, upsert count: 0, timestamp: 451193072561946628, success count: 8853, err count: 0)

In [40]:
from rank_bm25 import BM25Okapi
from transformers import DPRContextEncoder, DPRQuestionEncoder, DPRReader, DPRContextEncoderTokenizer, DPRQuestionEncoderTokenizer

# BM25 Retrieval
bm25 = BM25Okapi([nltk.word_tokenize(chunk) for chunk in chunks])

# DPR Retrieval
context_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
question_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
context_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")

def split_into_subchunks(text, max_length):
    sentences = nltk.sent_tokenize(text)
    subchunks = []
    current_subchunk = []

    for sentence in sentences:
        current_subchunk.append(sentence)
        if len(" ".join(current_subchunk)) > max_length:
            subchunks.append(" ".join(current_subchunk[:-1]))
            current_subchunk = [current_subchunk[-1]]

    if current_subchunk:
        subchunks.append(" ".join(current_subchunk))

    return subchunks

Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the 

In [41]:
def retrieve(query):
    question_inputs = question_tokenizer(query, return_tensors='pt')
    question_embedding = question_encoder(**question_inputs).pooler_output

    bm25_scores = bm25.get_scores(nltk.word_tokenize(query))

    dpr_scores = []
    for chunk in chunks:
        subchunks = split_into_subchunks(chunk, 512)
        chunk_embedding = torch.mean(torch.stack([context_encoder(**context_tokenizer(subchunk, return_tensors='pt')).pooler_output for subchunk in subchunks]), dim=0)
        score = util.pytorch_cos_sim(question_embedding, chunk_embedding)
        dpr_scores.append(score.item())  # Ensure score is a scalar

    hybrid_scores = [bm25_scores[i] + dpr_scores[i] for i in range(len(chunks))]
    ranked_indices = np.argsort(hybrid_scores)[::-1]
    return [chunks[idx] for idx in ranked_indices[:10]]

In [42]:
from transformers import pipeline

qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

def answer_question(query):
    retrieved_chunks = retrieve(query)
    print(f"Retrieved chunks: {retrieved_chunks}")  # Debugging print
    answers = [qa_pipeline(question=query, context=chunk) for chunk in retrieved_chunks]
    print(f"Answers: {answers}")  # Debugging print
    return sorted(answers, key=lambda x: x['score'], reverse=True)[0]['answer']

In [44]:

input1 = "What is navidia?"
answer = answer_question(input1)
print(answer)

RuntimeError: [enforce fail at C:\b\abs_bao0hdcrdh\croot\pytorch_1675190257512\work\c10\core\impl\alloc_cpu.cpp:81] data. DefaultCPUAllocator: not enough memory: you tried to allocate 8710848 bytes.

In [None]:
# import streamlit as st

# st.title("Question Answering System")

# query = st.text_input("Enter your query:")
# if query:
#     answer = answer_question(query)
#     st.write("Answer:", answer)

In [39]:
from transformers import pipeline
import torch
from transformers import AutoTokenizer, AutoModel

# Load the context encoder and tokenizer
context_tokenizer = AutoTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
context_encoder = AutoModel.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

def split_into_subchunks(text, max_length):
    words = text.split()
    subchunks = [' '.join(words[i:i+max_length]) for i in range(0, len(words), max_length)]
    return subchunks

def retrieve(query):
    chunks = [
        "This is a long text chunk number 1.",
        "This is a long text chunk number 2.",
        # Add more chunks as needed
    ]
    question_tokenizer = AutoTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
    question_encoder = AutoModel.from_pretrained("facebook/dpr-question_encoder-single-nq-base")

    question_embedding = question_encoder(**question_tokenizer(query, return_tensors='pt')).pooler_output
    dpr_scores = []
    
    for chunk in chunks:
        subchunks = split_into_subchunks(chunk, 512)
        subchunk_embeddings = []
        for subchunk in subchunks:
            inputs = context_tokenizer(subchunk, return_tensors='pt')
            with torch.no_grad():
                subchunk_embeddings.append(context_encoder(**inputs).pooler_output)
        chunk_embedding = torch.mean(torch.stack(subchunk_embeddings), dim=0)
        score = torch.cosine_similarity(question_embedding, chunk_embedding)
        dpr_scores.append(score.item())
    
    return [chunk for _, chunk in sorted(zip(dpr_scores, chunks), reverse=True)]

def answer_question(query):
    retrieved_chunks = retrieve(query)
    print(f"Retrieved chunks: {retrieved_chunks}")  # Debugging print
    answers = [qa_pipeline(question=query, context=chunk) for chunk in retrieved_chunks]
    print(f"Answers: {answers}")  # Debugging print
    return sorted(answers, key=lambda x: x['score'], reverse=True)[0]['answer']

input1 = "What is navidia?"
answer = answer_question(input1)
print(answer)


Some weights of DPRQuestionEncoder were not initialized from the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base and are newly initialized: ['bert_model.embeddings.LayerNorm.bias', 'bert_model.embeddings.LayerNorm.weight', 'bert_model.embeddings.position_embeddings.weight', 'bert_model.embeddings.token_type_embeddings.weight', 'bert_model.embeddings.word_embeddings.weight', 'bert_model.encoder.layer.0.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.0.attention.output.LayerNorm.weight', 'bert_model.encoder.layer.0.attention.output.dense.bias', 'bert_model.encoder.layer.0.attention.output.dense.weight', 'bert_model.encoder.layer.0.attention.self.key.bias', 'bert_model.encoder.layer.0.attention.self.key.weight', 'bert_model.encoder.layer.0.attention.self.query.bias', 'bert_model.encoder.layer.0.attention.self.query.weight', 'bert_model.encoder.layer.0.attention.self.value.bias', 'bert_model.encoder.layer.0.attention.self.value.weight', 'bert_model.encoder.layer.0.i

model.safetensors:  51%|#####     | 252M/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Retrieved chunks: ['This is a long text chunk number 2.', 'This is a long text chunk number 1.']
Answers: [{'score': 4.345945853856392e-05, 'start': 15, 'end': 34, 'answer': 'text chunk number 2'}, {'score': 2.8306727472227067e-05, 'start': 15, 'end': 34, 'answer': 'text chunk number 1'}]
text chunk number 2


'cpu'