In [1]:
!pip install tokenizer openai==0.28 nltk langchain-community tiktoken langchain streamlit transformers
import nltk
nltk.download("stopwords")



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
import pandas as pd
import openai
from utils import *
import utils
import constants
from sklearn.metrics.pairwise import cosine_similarity


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
# Hand the API key defined in the local `constants` module to the openai client.
# NOTE(review): make sure constants.py (and the key it holds) stays out of version control.
openai.api_key = constants.OPENAI_API_KEY

### Query the Semantic Scholar (S2) paper index

In [12]:
query = "How RAG works and how it impacts AI ?"

# Run the preprocessed query through the paper search helper.
search_results = search(preprocess_query(query))

# Stop here when there is nothing to rank. Merely printing a message would leave `df`
# undefined on a fresh kernel, or silently keep the papers of a previous query, and
# every later cell would then run on the wrong data.
if search_results['total'] == 0:
    raise ValueError('No results found - Try another query')

# JSON records -> DataFrame; rows with any missing field are dropped.
df = pd.DataFrame(search_results['data']).dropna()

In [13]:
df.head()

Unnamed: 0,paperId,title,abstract,venue,year
0,9538df9a77549ca019b522eedfd5b367310293f7,Human intelligence and artificial intelligence...,This article acknowledges the profound transfo...,Journal on Innovation and Sustainability RISUS,2024
1,5d9cd911f6cefa87707bd5b360bd0f5b762b2749,Impact of Artificial Intelligence in the field...,Artificial intelligence (AI) has potentially t...,Journal of Physics: Conference Series,2021
2,2162d3b2588ff48b5046a6a9f144591800082ea6,AI-Driven Marketplaces and Price Prediction To...,This paper aims at identifying and analyzing t...,Babylonian Journal of Artificial Intelligence,2023
3,130b9d2ae352e9eacc6e4f048cfdc9ee277e9d46,Corporate Governance in the Digital Age: A Com...,This research examines the intersection of dis...,E3S Web of Conferences,2023
4,e8b18bdb2196fbfa50c4e56a8626a2e2bb168197,Empirically Understanding the Potential Impact...,"In the coming years, Artificial Intelligence (...",Proc. ACM Hum. Comput. Interact.,2024


### Re-ranking using SPECTER

In [14]:
# Build the text to embed for each paper: title and abstract joined by the
# tokenizer's separator token (a missing or empty abstract contributes '').
df['title_abs'] = [
    paper['title'] + tokenizer.sep_token + (paper.get('abstract') or '')
    for paper in df.to_dict(orient='records')
]
# Number of tokens tokenizer.encode produces for every combined string.
df['n_tokens'] = [len(tokenizer.encode(text)) for text in df['title_abs']]
# Preview the combined text of the row labelled 2.
df.loc[2, 'title_abs']

"AI-Driven Marketplaces and Price Prediction Tools for Rag Pickers: Enhancing Economic Opportunities in Africa's Circular Economy[SEP]This paper aims at identifying and analyzing the use of the AI solutions in enhancing the economic status of rag pickers within Africa’s circular economy sector and their access to markets. In this study, we explore how effectively the employment of AI technologies advances the welfare of the weaver people by identifying new potential methods of income generation and skill development in waste management techniques. Our research adopts a case study approach, examining the potential of two key AI applications: The resources include a marketplace that helps rag pickers find buyers for recyclable materials and price forecasting applications to aid selling. The role of such technologies in enhancing the efficiency of materials exchange, increasing or decreasing income, and improving rag pickers’ understanding of the market is discussed. The study entailed th

In [15]:
# Embed every paper's combined title/abstract text, then the preprocessed query.
doc_embeddings = get_specter_embeddings(df['title_abs'].tolist())
query_embeddings = get_specter_embeddings(advanced_preprocess_query(query))

# Store each paper's embedding on its own row.
df['specter_embeddings'] = [embedding for embedding in doc_embeddings]
df.head()

[+] processed query:  How rag stands retrievalaugmented generation work impacts artificial intelligence AI


Unnamed: 0,paperId,title,abstract,venue,year,title_abs,n_tokens,specter_embeddings
0,9538df9a77549ca019b522eedfd5b367310293f7,Human intelligence and artificial intelligence...,This article acknowledges the profound transfo...,Journal on Innovation and Sustainability RISUS,2024,Human intelligence and artificial intelligence...,226,"[0.21167019, 0.3828033, -0.8247395, 0.2124951,..."
1,5d9cd911f6cefa87707bd5b360bd0f5b762b2749,Impact of Artificial Intelligence in the field...,Artificial intelligence (AI) has potentially t...,Journal of Physics: Conference Series,2021,Impact of Artificial Intelligence in the field...,121,"[0.24786969, 0.29891622, -0.72270256, 0.294850..."
2,2162d3b2588ff48b5046a6a9f144591800082ea6,AI-Driven Marketplaces and Price Prediction To...,This paper aims at identifying and analyzing t...,Babylonian Journal of Artificial Intelligence,2023,AI-Driven Marketplaces and Price Prediction To...,324,"[0.0011444832, 0.22520088, -0.65623564, -0.066..."
3,130b9d2ae352e9eacc6e4f048cfdc9ee277e9d46,Corporate Governance in the Digital Age: A Com...,This research examines the intersection of dis...,E3S Web of Conferences,2023,Corporate Governance in the Digital Age: A Com...,275,"[0.38899112, -0.013424395, -0.8621957, 0.02648..."
4,e8b18bdb2196fbfa50c4e56a8626a2e2bb168197,Empirically Understanding the Potential Impact...,"In the coming years, Artificial Intelligence (...",Proc. ACM Hum. Comput. Interact.,2024,Empirically Understanding the Potential Impact...,306,"[0.21365033, 0.27276054, -0.9355176, 0.2673644..."


In [16]:
# Cosine similarity between the query and every paper. The document matrix is
# rebuilt from the embeddings stored on each row (not from the standalone
# `doc_embeddings` array, which stays in the original row order), so the scores
# remain aligned with their papers even when this cell is re-run after `df` has
# already been re-ordered by the sort below.
# NOTE(review): assumes each stored embedding is a 1-D numeric array — confirm.
df['similarity'] = cosine_similarity(
    query_embeddings, df['specter_embeddings'].tolist()
).flatten()

# Rank papers from most to least similar; rebinding instead of `inplace=True`
# keeps the cell safe to re-run.
df = df.sort_values(by='similarity', ascending=False)
df.head()

Unnamed: 0,paperId,title,abstract,venue,year,title_abs,n_tokens,specter_embeddings,similarity
16,77179e5ff669452b9bea479a4236a6e2009ee422,The Power of Noise: Redefining Retrieval for R...,Retrieval-Augmented Generation (RAG) has recen...,Annual International ACM SIGIR Conference on R...,2024,The Power of Noise: Redefining Retrieval for R...,290,"[0.046463113, 0.43937993, -0.78377306, -0.2658...",0.963548
12,917cef7bf9841044af50f4bdd75db057aba1aed9,RAGProbe: An Automated Approach for Evaluating...,Retrieval Augmented Generation (RAG) is increa...,arXiv.org,2024,RAGProbe: An Automated Approach for Evaluating...,312,"[0.27762341, 0.38148627, -0.81774074, -0.27068...",0.95755
6,9cd1d535c50cc4ebba5db00c686547a2ee542e26,Imagine a More Ethical AI: Using Stories to De...,Artificial intelligence (AI) tools and technol...,Research in Equitable and Sustained Participat...,2021,Imagine a More Ethical AI: Using Stories to De...,236,"[0.29734066, 0.192899, -0.821154, -0.067754075...",0.937881
0,9538df9a77549ca019b522eedfd5b367310293f7,Human intelligence and artificial intelligence...,This article acknowledges the profound transfo...,Journal on Innovation and Sustainability RISUS,2024,Human intelligence and artificial intelligence...,226,"[0.21167019, 0.3828033, -0.8247395, 0.2124951,...",0.937511
10,dcbcba4cf01b36e499060a7d4e9a00e808f4daf9,ML Based Solutions for Greenhouse Gas Emission...,This literature review will serve as the basis...,International Journal on Perceptive and Cognit...,2023,ML Based Solutions for Greenhouse Gas Emission...,226,"[0.22443552, 0.2331391, -0.82182366, 0.1431329...",0.930784


### Get answer from GPT-3

In [17]:
# Ask the original question against the re-ranked papers; the value returned by
# answer_question is displayed as the cell output.
# NOTE(review): answer_question comes from utils via the star import — how it turns
# `df` into a prompt is not visible here; confirm it relies on the row order set above.
answer_question(df, question=query, debug=False)


'Retrieval-Augmented Generation (RAG) is a method that extends the knowledge of Large Language Models by adding relevant passages or documents retrieved by an Information Retrieval (IR) system to the original prompt. RAG is crucial for Generative AI solutions in scenarios where knowledge is frequently updated and cannot be stored in the model. The retrieval component of RAG, whether dense or sparse, plays a significant role in enhancing the effectiveness of the Large Language Models. Research has shown that the type of passages retrieved by the IR system within RAG systems can greatly impact the performance of the model. Surprisingly, studies have found that even adding random documents in the prompt can improve the accuracy of the model. Understanding the appropriate strategies for integrating retrieval with Large Language Models is essential for the development and effectiveness of RAG systems and lays the groundwork for future research in this area.'