## Création d'un RAG en local sur un ensemble de papiers de recherche en Deep Reinforcement Learning

OpenAI liste les principaux papiers du domaine sur cette page : https://spinningup.openai.com/en/latest/spinningup/keypapers.html
On y ajoutera aussi le livre de Sutton et Barto : http://incompleteideas.net/book/RLbook2020.pdf


## 1: Scraping des PDF
### Récupération des liens

In [1]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [2]:
# Collect the PDF links referenced by the Spinning Up "key papers" page.
from urllib.parse import urljoin

# requests waits indefinitely when no timeout is given; bound every call.
REQUEST_TIMEOUT = 30  # seconds

url = "https://spinningup.openai.com/en/latest/spinningup/keypapers.html"
response = requests.get(url, timeout=REQUEST_TIMEOUT)

pdf_links = []
# Bound up front so the loop below simply iterates over nothing when the
# index page could not be fetched (instead of raising NameError).
links = []

if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')
    links = [a['href'] for a in soup.find_all('a', class_='reference external') if 'href' in a.attrs]
    print("liens trouvés: ", links)
else:
    print("Erreur lors du téléchargement de la page")

# arXiv and OpenReview links lead to a landing page: fetch it and extract the
# anchor that carries the PDF link. Direct ".pdf" links are kept as they are.
for link in links:

    if link.startswith(("https://arxiv.org/", "https://openreview")):
        # CSS class of the PDF anchor on each site's landing page.
        a_class = "abs-button download-pdf" if link.startswith("https://arxiv.org/") else "note_content_pdf"
        try:
            r = requests.get(link, timeout=REQUEST_TIMEOUT)
        except requests.RequestException:
            # One unreachable page must not abort the whole collection.
            print("problem")
            continue
        if r.status_code != 200:
            print("problem")
            continue

        soup = BeautifulSoup(r.text, 'html.parser')
        pdf_link = [a['href'] for a in soup.find_all('a', class_=a_class) if 'href' in a.attrs]
        print(pdf_link)
        if pdf_link:
            # urljoin resolves a site-relative href against the page URL and
            # leaves an already absolute href untouched.
            pdf_links.append(urljoin(link, pdf_link[0]))
        else:
            # The page layout did not expose the expected anchor: skip it.
            print("aucun lien pdf trouvé pour", link)

    elif link.endswith(".pdf"):
        pdf_links.append(link)

# Sutton & Barto's book is not listed on the page; add it explicitly.
pdf_links.append("http://incompleteideas.net/book/RLbook2020.pdf")

liens trouvés:  ['https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf', 'https://arxiv.org/abs/1507.06527', 'https://arxiv.org/abs/1511.06581', 'https://arxiv.org/abs/1509.06461', 'https://arxiv.org/abs/1511.05952', 'https://arxiv.org/abs/1710.02298', 'https://arxiv.org/abs/1602.01783', 'https://arxiv.org/abs/1502.05477', 'https://arxiv.org/abs/1506.02438', 'https://arxiv.org/abs/1707.06347', 'https://arxiv.org/abs/1707.02286', 'https://arxiv.org/abs/1708.05144', 'https://arxiv.org/abs/1611.01224', 'https://arxiv.org/abs/1801.01290', 'http://proceedings.mlr.press/v32/silver14.pdf', 'https://arxiv.org/abs/1509.02971', 'https://arxiv.org/abs/1802.09477', 'https://arxiv.org/abs/1707.06887', 'https://arxiv.org/abs/1710.10044', 'https://arxiv.org/abs/1806.06923', 'https://openreview.net/forum?id=ByG_3s09KX', 'https://github.com/google/dopamine', 'https://arxiv.org/abs/1611.02247', 'https://arxiv.org/abs/1710.11198', 'https://arxiv.org/abs/1802.10031', 'https://arxiv.org/abs/1702.08892', 'https:/

### Téléchargement des pdfs

In [3]:
import os
from tqdm import tqdm

In [5]:
# Download every collected PDF into data/ as papier_<index>.pdf.
os.makedirs("data/", exist_ok=True)

# Wrapping the list (rather than the enumerate object) lets tqdm know the
# total and display a real progress bar.
for i, pdf_link in enumerate(tqdm(pdf_links)):
    path = "data/papier_" + str(i) + ".pdf"
    # Check the disk before hitting the network: re-running the cell must not
    # re-download files that are already present.
    if os.path.isfile(path):
        continue
    try:
        pdf_r = requests.get(pdf_link, timeout=60)
    except requests.RequestException:
        # A single unreachable host must not abort the remaining downloads.
        print("probleme de téléchargement")
        continue
    if pdf_r.status_code == 200:
        with open(path, "wb") as f:
            f.write(pdf_r.content)
    else:
        print("probleme de téléchargement")
    

89it [01:34,  3.07s/it]

probleme de téléchargement


101it [01:49,  2.51s/it]

probleme de téléchargement


105it [03:06,  1.78s/it]


## 2: Importation des données

In [12]:
import pandas as pd

In [None]:
import fitz
import pandas as pd

# Directory holding the downloaded PDFs.
data_path = "data/"
# os.listdir returns entries in arbitrary order; sorting keeps the
# pdf_number assigned to each file stable from one run to the next.
pdf_files = [os.path.join(data_path, f) for f in sorted(os.listdir(data_path)) if f.endswith('.pdf')]
pdf_dic = {}

def text_formatter(text:str) -> str:
    """Return *text* with newlines turned into spaces and outer whitespace removed."""
    single_line = " ".join(text.split("\n"))
    return single_line.strip()

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    """Read a PDF and return the cleaned text of each page.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        One dict per page, in document order, with the keys
        ``"page_number"`` (0-based index) and ``"text"`` (page text passed
        through ``text_formatter``).
    """
    # The context manager closes the document once every page has been read,
    # so the file handle is not leaked.
    with fitz.open(pdf_path) as doc:
        return [
            {
                "page_number": page_number,
                "text": text_formatter(text=page.get_text()),
            }
            for page_number, page in enumerate(tqdm(doc))
        ]

def open_pdfs(pdf_path_list : list[str]) -> list[dict]:
    """Compute basic size statistics for each PDF in *pdf_path_list*.

    Args:
        pdf_path_list: Paths of the PDF files to analyse.

    Returns:
        One dict per PDF, in input order, with the keys ``"pdf_number"``
        (position in *pdf_path_list*), ``"pdf_page_count"``,
        ``"pdf_char_count"``, ``"pdf_word_count"`` (split on single spaces),
        ``"pdf_sentence_count_raw"`` (split on ``". "``) and
        ``"pdf_token_count"`` (character count divided by 4).
    """
    pdf_and_text = []
    for pdf_number, pdf_path in enumerate(tqdm(pdf_path_list)):
        # Close each document as soon as its text is extracted; otherwise one
        # file handle per PDF stays open for the whole run.
        with fitz.open(pdf_path) as doc:
            page_count = doc.page_count
            texte_complet = "".join([page.get_text("text") for page in doc])
        texte_complet = text_formatter(texte_complet)
        pdf_and_text.append({
            "pdf_number": pdf_number,
            "pdf_page_count": page_count,
            "pdf_char_count": len(texte_complet),
            "pdf_word_count": len(texte_complet.split(" ")),
            "pdf_sentence_count_raw": len(texte_complet.split(". ")),
            # Presumably the usual ~4 characters per token rule of thumb.
            "pdf_token_count": len(texte_complet) / 4,
        })
    return pdf_and_text

# Per-PDF statistics (page, character, word, sentence and token counts) for every file in pdf_files.
pdf_and_text = open_pdfs(pdf_files)


In [14]:
# One row per PDF; describe() summarises each numeric column, rounded to 2 decimals.
df = pd.DataFrame(pdf_and_text)
df.describe().round(2)

Unnamed: 0,pdf_number,pdf_page_count,pdf_char_count,pdf_word_count,pdf_sentence_count_raw,pdf_token_count
count,103.0,103.0,103.0,103.0,103.0,103.0
mean,51.0,24.71,74430.4,12078.86,823.8,18607.6
std,29.88,57.37,154361.69,25545.04,2011.86,38590.42
min,0.0,7.0,20323.0,3693.0,77.0,5080.75
25%,25.5,13.0,45957.5,7121.0,418.5,11489.38
50%,51.0,15.0,52795.0,8519.0,481.0,13198.75
75%,76.5,19.0,61128.5,10021.5,577.5,15282.12
max,102.0,548.0,1555813.0,257192.0,18355.0,388953.25
