## TODO 
- maybe some better model for embedding extraction
- better prompt for the chatbot 
- somehow test the implementation
- Adding citations to score the papers
- somehow separate the user query and searching for papers on arxiv

## Generate a response by incoroprating the retrieved papers with a chatbot

## Larger model

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig, AutoConfig
import torch

# Load a chat-capable model
LLM_MODEL = "mistralai/Mistral-7B-Instruct-v0.2"
device = f"cuda:{torch.cuda.current_device()}"

tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=LLM_MODEL,
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# 8-bit quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, # loading in 4 bit
    bnb_4bit_quant_type="nf4", # quantization type
    bnb_4bit_use_double_quant=True, # nested quantization
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model_config = AutoConfig.from_pretrained(
    pretrained_model_name_or_path=LLM_MODEL,
)
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=LLM_MODEL,
    config=model_config,
    quantization_config=bnb_config, # we introduce the bnb config here.
    device_map="auto",
)
model.eval()

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 3/3 [00:27<00:00,  9.33s/it]


MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): Mist

In [2]:
from transformers import pipeline
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain_core.prompts import PromptTemplate

# Define the Hugging Face pipeline for text generation
generate_text = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,
    max_new_tokens=8192,
    repetition_penalty=1.1,
)

# Wrap the Hugging Face pipeline into LangChain's LLM
llm = HuggingFacePipeline(pipeline=generate_text)

template = """
You are a helpful AI QA assistant, for answering querries about research methods.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Question: {question}
Answer:"""
prompt = PromptTemplate.from_template(template)

Device set to use cuda:0
  llm = HuggingFacePipeline(pipeline=generate_text)


In [3]:
# an example of something that works without rag
chain = prompt | llm
question = "Which paper introduced the transformer architecture"
print(chain.invoke({"question": question}))


You are a helpful AI QA assistant, for answering querries about research methods.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Question: Which paper introduced the transformer architecture
Answer: The transformer architecture was introduced in the paper "Attention is All You Need" by Vaswani et al., published in 2017.


In [4]:
# A newer one 
question = ""
print(chain.invoke({"question": question}))


You are a helpful AI QA assistant, for answering querries about research methods.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Question: 
Answer: I'd be happy to help answer any questions you have about research methods! However, your question is not specific enough for me to provide a clear answer. Could you please specify which research method or methods you are inquiring about? Some common research methods include surveys, experiments, case studies, and literature reviews. Once I have more information, I can provide a more accurate response.


## Get the papers based on the user query

In [8]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer 
import arxiv 

# Load the sentence transformer model
# TODO maybe some better model for embedding extraction, maybe we could fine tune?
# TODO maybe somehow add citations
model_embeddings = SentenceTransformer('all-MiniLM-L6-v2')  # For embeddin extraction

# Define your query
user_query = "Which paper introduced the transformer architecture"

# Get the embedding for the query
query_embedding = model_embeddings.encode([user_query])

search = arxiv.Search(
    query=user_query,
    max_results=50,
    sort_by=arxiv.SortCriterion.Relevance,
    sort_order=arxiv.SortOrder.Descending
)

client = arxiv.Client()
results = list(client.results(search))

# Extract summaries and titles
papers = []
summaries = []
for result in results:
    title = result.title
    authors = ', '.join([author.name for author in result.authors])
    summary = result.summary
    url = f"https://arxiv.org/abs/{result.entry_id.split('/')[-1]}"
    papers.append({
        "title": title,
        "authors": authors,
        "summary": summary,
        "url": url
    })
    summaries.append(summary)

# Encode all summaries
summary_embeddings = model_embeddings.encode(summaries)

# Compute cosine similarities
similarities = cosine_similarity(query_embedding, summary_embeddings)[0]

for i, paper in enumerate(papers):
    paper["similarity"] = similarities[i]


top_papers = sorted(papers, key=lambda x: x["similarity"], reverse=True)[:5] # top 5

# Print top 5 similar papers
for i, paper in enumerate(top_papers, 1):
    print(f"Rank #{i}")
    print(f"Title: {paper['title']}")
    print(f"Authors: {paper['authors']}")
    print(f"Summary: {paper['summary']}")
    print(f"Similarity: {paper['similarity']:.4f}")
    print(f"URL: {paper['url']}")
    print("-" * 80)


Rank #1
Title: Neuromodulation Gated Transformer
Authors: Kobe Knowles, Joshua Bensemann, Diana Benavides-Prado, Vithya Yogarajan, Michael Witbrock, Gillian Dobbie, Yang Chen
Summary: We introduce a novel architecture, the Neuromodulation Gated Transformer
(NGT), which is a simple implementation of neuromodulation in transformers via
a multiplicative effect. We compare it to baselines and show that it results in
the best average performance on the SuperGLUE benchmark validation sets.
Similarity: 0.5207
URL: https://arxiv.org/abs/2305.03232v2
--------------------------------------------------------------------------------
Rank #2
Title: Interpretation of the Transformer and Improvement of the Extractor
Authors: Zhe Chen
Summary: It has been over six years since the Transformer architecture was put
forward. Surprisingly, the vanilla Transformer architecture is still widely
used today. One reason is that the lack of deep understanding and comprehensive
interpretation of the Transformer ar

## Combine the retrieved papers and the generation model

In [None]:
from langchain.chains import LLMChain

# Combine summaries into a context string
context = "\n\n".join(
    f"Title: {paper['title']}\nSummary: {paper['summary']}" for paper in top_papers
)

PROMPT_TEMPLATE = """
You are a helpful AI QA assistant, for answering querries about research methods.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

```
{context}
```

### Question:
{question}

### Answer:
"""

prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template=PROMPT_TEMPLATE.strip(),
)

# Define the Hugging Face pipeline for text generation
generate_text = pipeline(
    task="text-generation",
    model=model,  # Replace with your model
    tokenizer=tokenizer,  # Replace with your tokenizer
    return_full_text=True,
    max_new_tokens=8192,
    repetition_penalty=1.1,
)

# Wrap the Hugging Face pipeline into LangChain's LLM
llm = HuggingFacePipeline(pipeline=generate_text)

# Create the LLMChain with the prompt template and the LLM
qa_chain = LLMChain(prompt=prompt_template, llm=llm)

# Ask the model a question and get the answer
question = user_query # TODO maybe change this, to make it different than the search (or change the search)
response = qa_chain.run({"context": context, "question": question})

# Print the response
print("Answer:", response)

Device set to use cuda:0


Answer: You are a helpful AI QA assistant, for answering querries about research methods.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

```
Title: Neuromodulation Gated Transformer
Summary: We introduce a novel architecture, the Neuromodulation Gated Transformer
(NGT), which is a simple implementation of neuromodulation in transformers via
a multiplicative effect. We compare it to baselines and show that it results in
the best average performance on the SuperGLUE benchmark validation sets.

Title: Interpretation of the Transformer and Improvement of the Extractor
Summary: It has been over six years since the Transformer architecture was put
forward. Surprisingly, the vanilla Transformer architecture is still widely
used today. One reason is that the lack of deep understanding and comprehensive
interpretation of the Transformer architecture makes it more challenging to
improve the Transformer architecture. In this paper, we first interpret 

## Simpler model

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

model_id = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

rag = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

# Combine summaries into a context string, but make sure it's within the token limit
context = "\n\n".join(
    f"Title: {paper['title']}\nSummary: {paper['summary']}" for paper in top_papers
)

# Encode the context and check its length
input_ids = tokenizer.encode(context, return_tensors="pt")
max_length = 1700  # Adjust this based on your model's max token length

# Truncate if necessary to fit within the max token limit
if input_ids.shape[1] > max_length:
    input_ids = input_ids[:, :max_length]


# Prepare the prompt, ensuring it stays within the token limit
prompt = f"""Here are some research papers:

{context[:max_length]}  # Only include a truncated context if necessary

Use the above research paper summaries to answer the following question:

Question: {user_query}
Answer:"""

# Generate the answer using the same prompt
output = rag(prompt, max_new_tokens=300)

# Provide the generated answer along with the papers
print("Research Papers and Generated Answer:")
print(f"Research Papers:\n{context[:max_length]}")  # Display truncated context
print(f"Generated Answer:\n{output[0]['generated_text']}")
