## Création d'un RAG en Local sur un ensemble de papier de recherche en Reinforcement Deep Learning 

OpenAI liste les principaux papiers du domaine sur cette page : https://spinningup.openai.com/en/latest/spinningup/keypapers.html
On rajoutera aussi le livre de Sutton et al.: http://incompleteideas.net/book/RLbook2020.pdf


## 1: Scrapping des pdf
### Récupération des liens

In [4]:
import requests
from bs4 import BeautifulSoup
import tqdm as tq

In [5]:
# Récupération des liens des pdfs du site
url = "https://spinningup.openai.com/en/latest/spinningup/keypapers.html"
response = requests.get(url)

pdf_links = []

if response.status_code == 200:
    soup = BeautifulSoup(response.text,'html.parser')
    links = [a['href'] for a in soup.find_all('a', class_='reference external') if 'href' in a.attrs]
    print("liens trouvés: ", links)
else:
    print("Erreur lors du téléchargement de la page")

# scrapping dans arkiv et openreview
for link in links:

    if link.startswith("https://arxiv.org/") or link.startswith("https://openreview"):
        a_class,pref = ("abs-button download-pdf","https://arxiv.org") if link.startswith("https://arxiv.org/") else ("note_content_pdf","https://openreview.net")
        r = requests.get(link)
        if r.status_code == 200:
            soup = BeautifulSoup(r.text,'html.parser')
            pdf_link = [a['href'] for a in soup.find_all('a', class_=a_class) if 'href' in a.attrs]
            pdf_links.append(pref+pdf_link[0])
        else:
            print("problem")

    elif  link.endswith(".pdf"):
        pdf_links.append(link)

pdf_links.append("http://incompleteideas.net/book/RLbook2020.pdf")

liens trouvés:  ['https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf', 'https://arxiv.org/abs/1507.06527', 'https://arxiv.org/abs/1511.06581', 'https://arxiv.org/abs/1509.06461', 'https://arxiv.org/abs/1511.05952', 'https://arxiv.org/abs/1710.02298', 'https://arxiv.org/abs/1602.01783', 'https://arxiv.org/abs/1502.05477', 'https://arxiv.org/abs/1506.02438', 'https://arxiv.org/abs/1707.06347', 'https://arxiv.org/abs/1707.02286', 'https://arxiv.org/abs/1708.05144', 'https://arxiv.org/abs/1611.01224', 'https://arxiv.org/abs/1801.01290', 'http://proceedings.mlr.press/v32/silver14.pdf', 'https://arxiv.org/abs/1509.02971', 'https://arxiv.org/abs/1802.09477', 'https://arxiv.org/abs/1707.06887', 'https://arxiv.org/abs/1710.10044', 'https://arxiv.org/abs/1806.06923', 'https://openreview.net/forum?id=ByG_3s09KX', 'https://github.com/google/dopamine', 'https://arxiv.org/abs/1611.02247', 'https://arxiv.org/abs/1710.11198', 'https://arxiv.org/abs/1802.10031', 'https://arxiv.org/abs/1702.08892', 'https:/

### Téléchargement des pdfs

In [6]:
import os
import tqdm 

In [7]:
if not os.path.isdir("data/"):
    os.mkdir("data/")

pdf_links = pdf_links

for (i,pdf_link) in tqdm(enumerate(pdf_links)):
    path = "data/papier_"+str(i)+ ".pdf"
    
    if not os.path.isfile(path):
        pdf_r = requests.get(pdf_link)
        if pdf_r.status_code == 200:
            with open(path,"wb") as f:
                f.write(pdf_r.content)
        else:
            print("probleme de téléchargement")
    else:
        pass
    

TypeError: 'module' object is not callable

## 2: Importation des données

In [None]:
import pandas as pd

In [None]:
from spacy.lang.en import English
def text_to_sentences(text):
    nlp = English()
    nlp.add_pipe("sentencizer")
    doc = nlp(text)

    sentences = [str(s) for s in list(doc.sents)]
    return sentences

def sentence_list_to_chunk(nb_sentence :  int,s_list : list[str]):
    chunks = [" ".join(s_list[i:i+nb_sentence]) for i in range(0,len(s_list),nb_sentence)]
    return chunks




In [None]:
import fitz
import pandas as pd
from tqdm.notebook import tqdm as tqn

data_path = "data/"
pdf_files = ["data/"+f for f in os.listdir(data_path) if f.endswith('.pdf')]
pdf_dic = {}

def text_formatter(text:str) -> str:
    cleaned_text = text.replace("\n"," ").strip()
    return cleaned_text

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    pages_and_text = []
    doc = fitz.open(pdf_path)

    for (page_number,page) in enumerate(doc):
        text = page.get_text()
        text = text_formatter(text=text)
        sentences = text_to_sentences(text)
        chunks = sentence_list_to_chunk(10,sentences)
        pages_and_text.append({
            "page_number": page_number,
            "text": text,
            "chunks": chunks,
            "page_char_count": len(text),
            "page_words_count": len(text.split(" ")),
            "page_sentences_count": len(text.split(". ")),
            "page_token_count": len(text)/4,
            "page_chunks_count": len(chunks)
        })
    return pages_and_text

def open_pdfs(pdf_path_list : list[str]) -> list[dict]:
    pdf_and_text = []
    pdfs_pages_text_list = []
    for (pdf_number,pdf_path) in tqn(enumerate(pdf_path_list),total=len(pdf_path_list)):

        pages_and_text = open_and_read_pdf(pdf_path)
        pdfs_pages_text_list.append(pages_and_text)
        df = pd.DataFrame(pages_and_text)
        dict_mean = df.describe().round(2).loc['mean'].to_dict()

        doc = fitz.open(pdf_path)
        texte_complet = "".join([page.get_text("text") for page in doc])
        texte_complet = text_formatter(texte_complet)
        pdf_and_text.append({
            "pdf_number":pdf_number,
            "pdf_page_count":doc.page_count,
            "pdf_char_count":len(texte_complet),
            "pdf_word_count":len(texte_complet.split(" ")),
            "pdf_sentence_count_raw":len(texte_complet.split(". ")),
            "pdf_token_count": len(texte_complet) /4,
            "page_mean_char_count": dict_mean["page_char_count"],
            "page_mean_words_count": dict_mean["page_words_count"],
            "page_mean_sentences_count": dict_mean["page_sentences_count"],
            "page_mean_token_count": dict_mean["page_token_count"]
        })
    return pdf_and_text,pdfs_pages_text_list




## Création des chunks de phrases. 
On parcours les toutes les pages des pdfs. Pour chaque page, on prends tout le texte on le sépare en phrases qu'on met dans une liste, on crée ensuite une liste donc chaque élément est une liste de 10 phrases. On met ca dans un dictionnaire avec les données suivante: le nom du papier, le numéro de page absolue du papier, le lien du papier (local). On fait une liste du dictionnaire pour toutes les pages d'un papier. On rajoute dans cette liste tout les autres pdfs. Ca nous donne un gros dataframe avec un ensemble de listes de phrases. Apprès on prend ces chunks on les join et on crée u...

In [None]:
from tqdm.notebook import tqdm

def pdfs_to_chunks(data_path):
    pdf_files = [data_path+f for f in os.listdir(data_path) if f.endswith('.pdf')]
    chunks = []
    pdf_and_text,pdfs_pages_text_list = open_pdfs(pdf_files)
    
    for (pdf_num,pdf) in enumerate(pdfs_pages_text_list):
        for page in pdf:
            for chunk in page["chunks"]:
                chk = {}
                chk["pdf_number"] = pdf_num
                chk["page_number"] = page["page_number"]
                chk["text"] = chunk
                chk["chunk_count"] = len(chk["text"])
                chk["chunk_token_count"] = len(chk["text"])/4

                chunks.append(chk)

    return chunks


    

In [None]:
csvpath = "data/text_chunks_and_embeddings.csv"
if not os.path.isfile(csvpath):
        
    chunks_list = pdfs_to_chunks("data/")
    df = pd.DataFrame(chunks_list)
    df.describe().round(2)

In [None]:

# df = pd.DataFrame(chunks_list)
# df.describe().round(2)
# df = df[ df["chunk_token_count"] >= 20]
# len(df)
# df = df[df["chunk_token_count"] <= 600]
# df.sample(5)

Unnamed: 0,pdf_number,page_number,text,chunk_count,chunk_token_count
1658,6,505,"Bellemare, M. G., Dabney, W., Munos, R. (2017)...",516,129.0
1793,6,522,"References 501 Mendel, J. M., McLaren, R. W. (...",373,93.25
3996,45,8,6] Dart: Dynamic animation and robotics toolki...,387,96.75
6451,86,4,1. Human Oversight Phase (duration = 4.5 hours...,579,144.75
5617,73,4,"Note, to reduce the number of actors needed pe...",1119,279.75


In [None]:

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-mpnet-base-v2",device = "cpu")

if not os.path.isfile(csvpath):
    for chunk in tqn(chunks_list,total=len(chunks_list)):
        embeding = model.encode(chunk["text"])
        chunk["embedding"] = embeding

In [None]:
# df = pd.DataFrame(chunks_list)
# df.describe().round(2)
# df = df[ df["chunk_token_count"] >= 20]
# len(df)
# df = df[df["chunk_token_count"] <= 600]
# df.sample(5)

# df.to_csv(csvpath,index = False)

In [None]:
import numpy as np

csv_to_chunks = pd.read_csv(csvpath)
csv_to_chunks.head()
csv_to_chunks["embedding"] = csv_to_chunks["embedding"].apply(lambda x: np.fromstring(x.strip('[]'),sep = " "))


Unnamed: 0,pdf_number,page_number,chunk_count,chunk_token_count
count,7543.0,7543.0,7543.0,7543.0
mean,44.93,76.01,979.08,244.77
std,33.11,141.88,512.26,128.07
min,0.0,0.0,80.0,20.0
25%,6.0,5.0,535.0,133.75
50%,43.0,10.0,973.0,243.25
75%,77.0,50.5,1364.0,341.0
max,102.0,547.0,2399.0,599.75


In [None]:
import torch


csv_to_chunks_list_of_dict = csv_to_chunks.to_dict(orient="records")
embedings = torch.tensor(np.stack(csv_to_chunks["embedding"].to_list(),axis=0),dtype=torch.float32)
embedings

tensor([[-0.0194,  0.0533, -0.0432,  ...,  0.0508,  0.0025, -0.0156],
        [ 0.0036,  0.0735, -0.0157,  ...,  0.0680,  0.0357, -0.0103],
        [-0.0025,  0.0549, -0.0376,  ...,  0.0487,  0.0173,  0.0063],
        ...,
        [ 0.0225,  0.0068, -0.0008,  ...,  0.0619,  0.0459, -0.0025],
        [-0.0409, -0.0417, -0.0394,  ...,  0.0116,  0.0503, -0.0091],
        [-0.0090, -0.0296, -0.0588,  ..., -0.0040,  0.0070, -0.0092]])

In [None]:
csv_to_chunks_list_of_dict[395]

{'pdf_number': 6,
 'page_number': 1,
 'text': 'Adaptive Computation and Machine Learning Francis Bach, series editor A complete list of books published in the Adaptive Computation and Machine Learning series appears at the back of this book.',
 'chunk_count': 194,
 'chunk_token_count': 48.5,
 'embedding': array([-1.01828668e-02,  1.02995662e-02, -6.62129596e-02, -3.35998344e-03,
        -2.96776053e-02, -3.55895027e-04,  3.03983614e-02,  9.73482803e-03,
         6.97616430e-04,  1.94171425e-02,  4.28065844e-02,  2.64051128e-02,
        -1.09434798e-02,  3.73748802e-02, -9.43878479e-03, -4.01231162e-02,
         1.82673577e-02,  2.11566910e-02, -8.99788272e-03, -1.79039370e-02,
        -2.52534915e-02, -4.76417430e-02,  3.14446003e-03,  4.36046124e-02,
         1.38526810e-02,  2.32962482e-02, -6.87500788e-03, -2.14416254e-02,
        -1.26709277e-03,  2.57759206e-02,  5.89846959e-03, -1.12615451e-02,
        -1.24668106e-02,  3.97212617e-02,  1.55406190e-06, -3.18033881e-02,
        -7

In [None]:


query = "What is Q learning"
query_embeding = model.encode(query,convert_to_tensor=True)


from time import perf_counter as timer

from sentence_transformers import util

st = timer()
dot_scores = model.similarity(a=query_embeding,b=embedings)
et = timer()

print(f"[INFO] Ca a mis {et-st:.5f} pour faire le dot product sur {len(embedings)} embedings sur cpu")


[INFO] Ca a mis 0.00837 pour faire le dot product sur 7543 embedings sur cpu


In [None]:
tp_k = torch.topk(dot_scores,k=5)
csv_to_chunks_list_of_dict[31]
tp_k.values[0]

tensor([0.7080, 0.7076, 0.7052, 0.7008, 0.6953])

In [None]:
import time


def query_to_topk_chunks(query,chunks_text_dict_list,embedings,model,print_info:bool = True):
    query_embeding = model.encode(query)
    st = timer()
    similarity = model.similarity(query_embeding,embedings)
    ft = timer()

    if print_info:
        print(f"Le temps pour calculer {len(embedings)} est de {ft-st:.5f}")

    topk = torch.topk(similarity,k=5)
    context = []
    for (i,indice) in enumerate(topk.indices[0]):
        chunk = {
            "score": topk.values[0][i],
            "text": chunks_text_dict_list[indice]["text"],
            "pdf_number": chunks_text_dict_list[indice]["pdf_number"],
            "page_number": chunks_text_dict_list[indice]["page_number"]
        }
        context.append(chunk)
    
    return context



In [None]:
context = query_to_topk_chunks(query,csv_to_chunks_list_of_dict,embedings,model)
contexte_df = pd.DataFrame(context)
contexte_df

Le temps pour calculer 7543 est de 0.00879


Unnamed: 0,score,text,pdf_number,page_number
0,tensor(0.7080),440 Chapter 16: Applications and Case Studies ...,6,461
1,tensor(0.7076),Double q-learning. In Advances in Neural Infor...,18,9
2,tensor(0.7052),Algorithm 12 The function implementing the tab...,5,56
3,tensor(0.7008),Algorithm 13 The function implementing the Q-l...,5,58
4,tensor(0.6953),"the loss of information than does DQN. Thus, r...",1,1
