In [3]:
!pip install faiss-gpu transformers sentence-transformers
!pip install markdown2

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu, sentence-transformers
Successfully installed faiss-gpu-1.7.2 sentence-transformers-3.0.1
Collecting markdown2
  Downloading markdown2-2.4.13-py2.py3-none-any.whl.metadata (2.0 kB)
Downloading markdown2-2.4.13-py2.py3-none-any.whl (41 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
import json
from markdown2 import markdown
from IPython.display import display, Markdown

# Load the scraped JSON data. Decode explicitly as UTF-8 so non-ASCII text
# (e.g. curly quotes) is read the same way regardless of the platform default.
with open('scraped_datav1.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Prepare data for embedding and retrieval

# Content fields that contribute to a section's text, in concatenation order.
_TEXT_FIELDS = ('paragraphs', 'ordered_lists', 'unordered_lists', 'tables', 'equations', 'links')


def _collect_strings(value):
    """Return every item nested anywhere inside `value` as one flat list of str.

    Accepts None (-> []), a scalar (-> [str(value)]), or lists/tuples nested to
    any depth, so a field may be stored flat or as a list of lists.
    """
    if value is None:
        return []
    if isinstance(value, (list, tuple)):
        collected = []
        for item in value:
            collected.extend(_collect_strings(item))
        return collected
    return [str(value)]


# Build keys and texts in a single pass so position i of one always
# corresponds to position i of the other.
keys = []
texts = []

for key, content in data.items():
    parts = []
    for field in _TEXT_FIELDS:
        parts.extend(_collect_strings(content.get(field)))

    keys.append(key)
    # Concatenate all text elements
    texts.append(" ".join(parts))

print(f"Loaded {len(keys)} pieces of text from the JSON data.")


Loaded 565 pieces of text from the JSON data.


In [5]:
# texts

In [6]:
from sentence_transformers import SentenceTransformer

# Pre-trained sentence-embedding model used by get_embeddings
model = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')

# Function to get embeddings
def get_embeddings(texts):
    """Encode `texts` with the notebook-level `model` and return its output."""
    encoded = model.encode(texts)
    return encoded

# Embed the section texts first, then the section keys
text_embeddings, key_embeddings = [get_embeddings(corpus) for corpus in (texts, keys)]
print(f"Generated embeddings for {len(text_embeddings)} pieces of text and {len(key_embeddings)} keys.")


  from tqdm.autonotebook import tqdm, trange
2024-06-14 15:07:10.556340: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-14 15:07:10.556519: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-14 15:07:10.761015: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Generated embeddings for 565 pieces of text and 565 keys.


In [7]:
import faiss
import numpy as np

def _build_flat_l2_index(embeddings):
    """Return `(matrix, index)` for an exact L2 FAISS index over `embeddings`.

    `matrix` is `embeddings` as a C-contiguous float32 array, the layout FAISS
    requires; `index` is an IndexFlatL2 holding every row of it.
    """
    matrix = np.ascontiguousarray(embeddings, dtype=np.float32)
    index = faiss.IndexFlatL2(matrix.shape[1])
    index.add(matrix)
    return matrix, index


# Convert embeddings to float32 arrays and build one index for texts, one for keys
text_embeddings, text_index = _build_flat_l2_index(text_embeddings)
key_embeddings, key_index = _build_flat_l2_index(key_embeddings)

# Save indices
faiss.write_index(text_index, 'text_vector_db.index')
faiss.write_index(key_index, 'key_vector_db.index')


In [8]:
import faiss
import numpy as np
from collections import defaultdict

# process_query relies on `get_embeddings`, `keys`, and `data` defined in earlier cells of this notebook

def _merge_field(combined, field, value):
    """Merge one content field of a matched section into `combined` in place.

    Lists are concatenated, scalars are collected into a list, and dict values
    are merged into a dict mapping each sub-key to a list.
    """
    if isinstance(value, dict):
        bucket = combined.setdefault(field, {})
        if not isinstance(bucket, dict):
            raise TypeError(f"Field {field!r} mixes dict and non-dict values across sections")
        for sub_key, sub_value in value.items():
            target = bucket.setdefault(sub_key, [])
            if isinstance(sub_value, list):
                target.extend(sub_value)
            else:
                target.append(sub_value)
        return

    bucket = combined.setdefault(field, [])
    if isinstance(bucket, dict):
        raise TypeError(f"Field {field!r} mixes dict and non-dict values across sections")
    if isinstance(value, list):
        bucket.extend(value)
    else:
        bucket.append(value)


def process_query(query, key_index, top_k=1):
    """Find the section keys closest to `query` and merge their content.

    Args:
        query: Question text; embedded with the notebook-level `get_embeddings`.
        key_index: Object whose `search(vectors, k)` returns
            `(distances, indices)`, where indices are positions in the
            notebook-level `keys` list.
        top_k: Number of nearest keys to request from the index.

    Returns:
        A tuple `(combined_result_keys, combined_result_content)`: the matched
        keys joined with ", ", and a dict mapping each content field of the
        matched sections to its merged value (see `_merge_field`).

    Raises:
        TypeError: If the same field is a dict in one matched section and a
            list/scalar in another.
    """
    # Get query embedding as a single-row 2-D array, as the index search expects
    query_embedding = get_embeddings([query])[0].reshape(1, -1)

    # Search for similar keys
    _, key_I = key_index.search(query_embedding, top_k)

    # Retrieve the most relevant keys and their corresponding texts
    result_keys = []
    combined_result_content = {}

    for idx in key_I[0]:
        # FAISS pads with -1 when it has fewer than top_k neighbours; without
        # this guard keys[-1] would silently return the last key.
        if idx < 0:
            continue

        result_key = keys[idx]
        result_keys.append(result_key)

        for field, value in data[result_key].items():
            _merge_field(combined_result_content, field, value)

    combined_result_keys = ', '.join(result_keys)

    return combined_result_keys, combined_result_content

# Reload the key index that was written to disk
# (text_index is not used in the current implementation, so it is not reloaded)
key_index = faiss.read_index('key_vector_db.index')

# Example usage
query = "What is History of LLM?"
result_keys, result_content = process_query(query, key_index=key_index)
print(f"Relevant sections: {result_keys}", end="\n\n")
print("Content:", result_content)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Relevant sections: in-context learning, In-context learning

Content: {'paragraphs': ['In-context learning. Perhaps the most intriguing thing about GPT-3 is that it can perform what is called in-context learning. Let’s start with an example (demo):', 'One can prompt a language model to generate a news article based on a headline (demo). Here is an example of an article that GPT-3 fabricated (everything after the bolded text):', 'We (i) see that the answer given by GPT-3 is not the most informative and (ii) perhaps want the answer directly rather than a full sentence.', 'In-context learning. Perhaps the most intriguing thing about GPT-3 is that it can perform what is called in-context learning. Let’s start with an example (demo):', 'One can prompt a language model to generate a news article based on a headline (demo). Here is an example of an article that GPT-3 fabricated (everything after the bolded text):', 'We (i) see that the answer given by GPT-3 is not the most informative and (ii

In [9]:
from transformers import pipeline

# Text-generation pipeline (GPT-2) used by generate_structured_response
generator = pipeline(task='text-generation', model='gpt2')

# (heading shown in the prompt, key in result_content), in prompt order
_PROMPT_SECTIONS = (
    ("Paragraphs", 'paragraphs'),
    ("Ordered Lists", 'ordered_lists'),
    ("Unordered Lists", 'unordered_lists'),
    ("Tables", 'tables'),
    ("Links", 'links'),
    ("Equations", 'equations'),
)


def _flatten_to_lines(value):
    """Return every item nested anywhere inside `value` as a flat list of str."""
    if value is None:
        return []
    if isinstance(value, (list, tuple)):
        lines = []
        for item in value:
            lines.extend(_flatten_to_lines(item))
        return lines
    return [str(value)]


def generate_structured_response(query, result_key, result_content,
                                 max_prompt_chars=300, max_length=300):
    """Build a prompt from the retrieved content and run the `generator` pipeline.

    The prompt is "Question", "Section", the retrieved content grouped by
    field, then the cue "Answer is :". When the prompt would exceed
    `max_prompt_chars`, only the content part is shortened, so the question and
    the closing cue always survive.

    Args:
        query: The user's question.
        result_key: Name(s) of the matched section(s), shown in the prompt.
        result_content: Dict of content fields; each field may be a flat list
            or nested lists of items.
        max_prompt_chars: Character budget for the whole prompt. The question
            header and the closing cue are never cut, so a very long `query`
            can still exceed it.
        max_length: Forwarded to the pipeline as `max_length`.

    Returns:
        The `generated_text` of the first sequence returned by the pipeline.
    """
    # Create a structured prompt
    header = f"Question: {query}\n\n" + f"Section: {result_key}\n\n"
    closing = "Answer is :"

    # Add content to the prompt, skipping fields that are missing or empty
    context = ""
    for heading, field in _PROMPT_SECTIONS:
        lines = _flatten_to_lines(result_content.get(field))
        if lines:
            context += f"{heading}:\n" + "\n".join(lines) + "\n\n"

    # Trim only the context; slicing the whole prompt would drop the closing cue
    room = max(max_prompt_chars - len(header) - len(closing), 0)
    if len(context) > room:
        separator = "\n\n"
        kept = context[:max(room - len(separator), 0)].rstrip()
        context = kept + separator if kept else ""

    prompt = header + context + closing

    # Generate response
    response = generator(prompt, max_length=max_length, num_return_sequences=1,
                         truncation=True, pad_token_id=50256)
    generated_text = response[0]['generated_text']

    return generated_text


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Retrieve the best-matching section for the question
query = "What is a language model?"
result_key, result_content = process_query(query=query, key_index=key_index)

# Generate structured response
generated_text = generate_structured_response(
    query=query,
    result_key=result_key,
    result_content=result_content,
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Print the text returned by generate_structured_response
print(generated_text)