<a href="https://colab.research.google.com/github/Otabek-Rizayev/Aiogram_Template/blob/main/AsyncGPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -qU datasets pinecone-client sentence-transformers torch

In [None]:
from datasets import load_dataset

# Open the 'train' split of the snippet corpus in streaming mode.
wiki_data = load_dataset(
    'vblagoje/wikipedia_snippets_streamed', split='train', streaming=True
)
# Shuffle with a fixed seed so the record order is repeatable.
wiki_data = wiki_data.shuffle(seed=960)

In [None]:
# Peek at the first record of the shuffled stream to inspect its fields.
next(iter(wiki_data))

In [None]:
def _in_history_section(record):
  """Return True when the record's section title starts with 'History'."""
  return record['section_title'].startswith('History')


# Keep only the records that pass the predicate above.
history = wiki_data.filter(_in_history_section)

In [None]:
from itertools import islice

from tqdm.auto import tqdm

total_doc_count = 50000

# Take at most `total_doc_count` records from the filtered stream.
# `islice` stops the stream after exactly that many items; the earlier manual
# counter appended a record before testing the limit and therefore collected
# one record too many.
docs = [
    {
        "article_title": d["article_title"],
        "section_title": d["section_title"],
        "passage_text": d["passage_text"],
    }
    for d in tqdm(islice(history, total_doc_count), total=total_doc_count)
]

In [None]:
import pandas as pd

# Build a DataFrame from the collected passage dicts in `docs`.
df = pd.DataFrame(docs)
# Show the first rows as a quick sanity check.
df.head()

In [None]:
import os
from getpass import getpass

import pinecone

# Credentials must not live in the notebook source. Read the key from the
# PINECONE_API_KEY environment variable, or prompt for it (input is not echoed).
# NOTE(review): a key was previously hard-coded in this cell; treat it as
# leaked and rotate it.
api_key = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")

pinecone.init(
    api_key=api_key,
    # Overridable via PINECONE_ENVIRONMENT; defaults to the original value.
    environment=os.environ.get("PINECONE_ENVIRONMENT", "us-west4-gcp-free")
)

In [None]:
index_name = "abstractive-question-answering"

# Create the index only when its name is not already listed.
existing_indexes = pinecone.list_indexes()
if index_name not in existing_indexes:
  # NOTE(review): 768 presumably matches the retriever's embedding size — confirm.
  pinecone.create_index(index_name, dimension=768, metric="cosine")

# Handle used by the later upsert and query cells.
index = pinecone.Index(index_name)

In [None]:
from sentence_transformers import SentenceTransformer
import torch

# Run the encoder on the GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'

retriever = SentenceTransformer(
    "flax-sentence-embeddings/all_datasets_v3_mpnet-base", device=device
)
# Display the loaded model.
retriever

In [None]:
batch_size = 64
total_rows = len(df)

# Encode and upload the passages in slices of `batch_size` rows.
for start in tqdm(range(0, total_rows, batch_size)):
  stop = min(start + batch_size, total_rows)
  chunk = df.iloc[start:stop]
  # One embedding per passage in this slice, converted to plain lists.
  vectors = retriever.encode(chunk["passage_text"].tolist()).tolist()
  # Every column of the row is stored as the vector's metadata.
  records = chunk.to_dict(orient="records")
  # Vector ids are the row positions rendered as strings.
  row_ids = [str(pos) for pos in range(start, stop)]
  _ = index.upsert(vectors=list(zip(row_ids, vectors, records)))

# Display the index statistics after the upload.
index.describe_index_stats()


In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration

# Load the tokenizer and the seq2seq model from the same "vblagoje/bart_lfqa"
# checkpoint; `generate_answer` below uses both.
tokenizer = BartTokenizer.from_pretrained("vblagoje/bart_lfqa")
generator = BartForConditionalGeneration.from_pretrained("vblagoje/bart_lfqa")

In [None]:
def query_pinecone(query, top_k):
  """Embed `query` with the retriever and search the index for it.

  Args:
    query: Question text to embed.
    top_k: Number of matches to request from the index.

  Returns:
    The response of `index.query`, requested with metadata included.
  """
  query_vectors = retriever.encode([query]).tolist()
  return index.query(query_vectors, top_k=top_k, include_metadata=True)

In [None]:
def format_query(query, context):
  """Build the generator prompt from a question and retrieved matches.

  Args:
    query: The question text.
    context: Iterable of matches; each must expose
      `match['metadata']['passage_text']`.

  Returns:
    A string of the form "Question: <query>\\n context: <passages>", where
    every passage is prefixed with "<P> " and passages are joined by a space.
  """
  passages = " ".join(
      f"<P> {match['metadata']['passage_text']}" for match in context
  )
  return f"Question: {query}\n context: {passages}"

In [None]:
query = "What is it Python?"
# Ask the index for only the single closest match to this question.
result = query_pinecone(query, top_k=1)
# Display the raw query response.
result

In [None]:
from pprint import pprint
# NOTE: this rebinds `query` from the raw question to the full generator
# prompt, so re-running this cell alone would format an already-formatted prompt.
query = format_query(query, result['matches'])
pprint(query)

In [None]:
def generate_answer(query):
  """Generate an answer for a formatted prompt and pretty-print it.

  Args:
    query: Prompt string, e.g. as produced by `format_query`.

  Returns:
    None. The decoded answer is printed with `pprint`, not returned.
  """
  # `truncation=True` is required for `max_length` to take effect; without it
  # an over-long prompt is passed to the model unshortened.
  inputs = tokenizer(
      [query], max_length=1024, truncation=True, return_tensors='pt'
  )
  # Beam search with 2 beams; answer length bounded to 20..40 tokens.
  ids = generator.generate(
      inputs['input_ids'], num_beams=2, min_length=20, max_length=40
  )
  # The decode option is spelled `clean_up_tokenization_spaces`; the previous
  # keyword (`clean_up_tokenization`) is not a recognised option.
  answer = tokenizer.batch_decode(
      ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]
  return pprint(answer)

In [None]:
# `query` here is the formatted prompt produced by the earlier format_query cell.
generate_answer(query)

In [None]:
query = "What is Elon Musk's first project?"
# Request the five closest matches as context for the generator.
context = query_pinecone(query, top_k=5)
# Rebind `query` to the full prompt (question plus retrieved passages).
query = format_query(query, context['matches'])
generate_answer(query)

In [None]:
# Print each retrieved passage, followed by a '---' separator line.
for match in context["matches"]:
  passage = match["metadata"]["passage_text"]
  print(passage, end='\n---\n')


In [None]:
query = "What was Nasas most expensive project?"
# Request the three closest matches as context for the generator.
context = query_pinecone(query, top_k=3)
# Rebind `query` to the full prompt (question plus retrieved passages).
query = format_query(query, context["matches"])
generate_answer(query)