In [None]:
from langchain_community.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredHTMLLoader 
from langchain_openai import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.llms import HuggingFaceHub
from langchain.chains import LLMChain, ConversationChain
from langchain.prompts import PromptTemplate
from langchain.memory import ChatMessageHistory, ConversationBufferMemory,ConversationSummaryMemory

from langchain_community.llms import HuggingFaceEndpoint





In [None]:
import json

# Open the JSON file and load its contents.
with open('/Users/riccardo/Desktop/Repositorys_Github/LLM/Docs/api_token.json', 'r') as api_file:
    api_token_file = json.load(api_file)

# Pull the Hugging Face token out of the parsed JSON (key 'Hugging_face_token').
api_token = api_token_file['Hugging_face_token']

In [None]:
# Splitter used to cut loaded documents into chunks: chunk_size=1000 and
# chunk_overlap=50, both measured with len().
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=50, 
    length_function = len)
# Bare expression so the notebook displays the configured splitter.
text_splitter

In [None]:
# Load the merged PDF and split it into chunks with text_splitter.
filepath = '/Users/riccardo/Desktop/Repositorys_Github/LLM/Docs/docs_for_llm_pipline/merged.pdf'
loader = PyPDFLoader(filepath)
chunks = loader.load_and_split(text_splitter=text_splitter)

In [None]:
# Print the text and metadata of every chunk so the split can be inspected by eye.
# (The stray trailing commas after print(...) were removed: they only built
# throwaway one-element tuples.)
for chunk in chunks:
    print("Page content: \n", chunk.page_content)
    print("Page_metadata: \n", chunk.metadata)
    print("----------------------------")

In [None]:
# Embedding model used for the test embedding and for building the Chroma store.
# NOTE(review): "bge-large-zh" looks like the Chinese variant of the BGE models, while the
# queries in this notebook are English — TODO confirm this model choice is intended.
embedding_function = SentenceTransformerEmbeddings(model_name="BAAI/bge-large-zh-v1.5")

In [None]:
# embed_documents expects a list of texts. Passing a bare string makes it iterate
# over the individual characters, so embedding[0] would be the vector of "T"
# instead of the vector of the whole sentence.
embedding = embedding_function.embed_documents(["This is a test sentence."])

print(embedding[0])
print("Dimension of Embedding: ", len(embedding[0]))

In [None]:
# Build a Chroma vector store from the chunks, using embedding_function for the embeddings.
db = Chroma.from_documents(chunks, embedding_function)

In [None]:
# Sanity check: number of entries in the underlying collection.
# NOTE: _collection is a private attribute of the Chroma wrapper, so this may break between versions.
print("Chunks in DB:", db._collection.count())

In [None]:
# Try the retriever on a sample query.
query = "Write a summary of the first page of the document."
retriever = db.as_retriever()
# Last expression of the cell, so the retrieved documents are displayed.
retriever.get_relevant_documents(query)

In [None]:
# LLM client for the mistralai/Mistral-7B-Instruct-v0.2 repo, authenticated with api_token.
# NOTE(review): max_length is handed over via model_kwargs — TODO confirm the endpoint honours it.
llm = HuggingFaceEndpoint(repo_id='mistralai/Mistral-7B-Instruct-v0.2', 
                     huggingfacehub_api_token=api_token,  
                     model_kwargs={"max_length": 300})


In [None]:
# Question-answering chain that combines the LLM with the retriever
# (chain type "stuff").
qa_with_sources = RetrievalQAWithSourcesChain.from_chain_type(
    llm = llm,
    chain_type = "stuff",  
    retriever = retriever
    )

# Bare expression so the notebook displays the configured chain.
qa_with_sources

In [None]:
# Run a sample question through the chain; the result is displayed as the cell output.
query = "Which name does the university has?"
qa_with_sources.invoke(query)

## Building the data pipeline for the documents

In [None]:
import os

file_path = "/Users/riccardo/Desktop/Repositorys_Github/LLM/Docs/docs_for_llm_pipline"
def check_for_file_pipeline(file_path):
    """Make sure the pipeline directory ``file_path`` exists, creating it if needed.

    Args:
        file_path: Path of the directory that should exist.

    Returns:
        None. A status message is printed in either case.
    """
    if not os.path.exists(file_path):
        # Create the path that was actually requested (including any missing
        # parent directories) instead of a hard-coded location, and report
        # that same path rather than its parent.
        os.makedirs(file_path, exist_ok=True)
        print("Directory '%s' created" % file_path)
    else:
        print("File '%s' already exists" % file_path)
check_for_file_pipeline(file_path)



In [None]:
from PyPDF2 import PdfMerger

file_path = "/Users/riccardo/Desktop/Repositorys_Github/LLM/Docs/docs_for_llm_pipline"
file_name = "docs_for_llm_pipline"

def merge_pdf(file_path, file_name, save_path):
    """Merge every PDF found directly in ``file_path`` into a single PDF.

    Args:
        file_path: Directory that is scanned (non-recursively) for ``.pdf`` files.
        file_name: File name of the merged PDF.
        save_path: Directory the merged PDF is written to.

    Returns:
        The path of the merged PDF, or None when no PDF was found.
    """
    merged_filename = os.path.join(save_path, file_name)
    merged_abspath = os.path.abspath(merged_filename)

    # Sort for a reproducible page order (os.listdir order is arbitrary) and
    # skip an earlier merge result so a re-run never merges the output into itself.
    pdfs = sorted(
        name for name in os.listdir(file_path)
        if name.endswith(".pdf")
        and os.path.abspath(os.path.join(file_path, name)) != merged_abspath
    )

    if not pdfs:
        print("Keine PDF-Dateien im Verzeichnis gefunden.")
        return None

    merger = PdfMerger()
    try:
        for pdf in pdfs:
            with open(os.path.join(file_path, pdf), 'rb') as file:
                merger.append(file)

        with open(merged_filename, 'wb') as merged_file:
            merger.write(merged_file)
    finally:
        # Release the merger's resources even if a source PDF cannot be read.
        merger.close()

    print("PDFs erfolgreich zusammengeführt und gespeichert unter:", merged_filename)
    return merged_filename

# Merge the PDFs lying directly in Docs/ and store the result as merged.pdf
# inside the docs_for_llm_pipline folder.
merge_pdf("/Users/riccardo/Desktop/Repositorys_Github/LLM/Docs/", "merged.pdf", 
          "/Users/riccardo/Desktop/Repositorys_Github/LLM/Docs/docs_for_llm_pipline")



# ----------------------------------------------

# Webbase loader

In [None]:
import requests
from bs4 import BeautifulSoup
from datetime import date


class WebBaseLoader:
    """Scrape a web page for PDF links and download a PDF.

    Args:
        url: Address of the page that is scanned for PDF links.
        timeout: Seconds to wait for the server on each request (default 10).
    """

    def __init__(self, url, timeout=10):
        self.url = url
        self.timeout = timeout

    def load(self):
        """Fetch ``self.url``.

        Returns:
            The response body as text, or None when the request fails or the
            status code is not 200.
        """
        try:
            # Without a timeout requests waits indefinitely on an unresponsive server.
            response = requests.get(self.url, timeout=self.timeout)
        except requests.RequestException:
            return None
        if response.status_code == 200:
            return response.text
        return None

    def get_last_paper_link(self):
        """Return the href of the first PDF link found on the page.

        The href is returned exactly as written in the page (it may be
        relative), so callers have to resolve it against the site URL.

        Returns:
            The href string, or None when the page could not be loaded or
            contains no link ending in ``.pdf``.
        """
        html_content = self.load()
        if not html_content:
            return None

        soup = BeautifulSoup(html_content, 'html.parser')
        # Keep only the anchors whose href points to a PDF file.
        pdf_links = [
            anchor['href']
            for anchor in soup.find_all('a', href=True)
            if anchor['href'].endswith('.pdf')
        ]
        # The first match in document order is the one that is returned.
        return pdf_links[0] if pdf_links else None

    def download_pdf(self, pdf_link, save_path):
        """Download ``pdf_link`` and write it to ``save_path``.

        A status message is printed for both success and failure; nothing is
        returned.
        """
        try:
            response = requests.get(pdf_link, timeout=self.timeout)
        except requests.RequestException:
            print("Fehler beim Herunterladen der PDF-Datei.")
            return
        if response.status_code != 200:
            print("Fehler beim Herunterladen der PDF-Datei.")
            return

        with open(save_path, 'wb') as file:
            file.write(response.content)
        print("PDF erfolgreich heruntergeladen und gespeichert unter:", save_path)

# Example usage: fetch the PDF link reported by the loader and store the file
# under a date-stamped name.
from urllib.parse import urljoin

loader = WebBaseLoader("https://www.jmlr.org/")
last_paper_link = loader.get_last_paper_link()  # href of the first PDF link, or None
today = date.today()

save_path = f"/Users/riccardo/Desktop/Repositorys_Github/LLM/Docs/last_paper_{today}.pdf"

if last_paper_link:
    # Resolve the (possibly relative) href once against the site URL instead of
    # concatenating strings in two places; this also copes with absolute hrefs.
    paper_url = urljoin(loader.url, last_paper_link)
    print("Letztes Paper-Link:", paper_url)
    loader.download_pdf(paper_url, save_path)  # Download PDF
else:
    print("Fehler beim Laden der Webseite oder keine PDF-Papiere gefunden.")


In [None]:
from PyPDF2 import PdfMerger
import os

file_path = "/Users/riccardo/Desktop/Repositorys_Github/LLM/Docs/docs_for_llm_pipline"
file_name = "docs_for_llm_pipline"

def merge_pdf(file_path, file_name, save_path):
    """Merge every PDF found directly in ``file_path`` into a single PDF.

    Args:
        file_path: Directory that is scanned (non-recursively) for ``.pdf`` files.
        file_name: File name of the merged PDF.
        save_path: Directory the merged PDF is written to.

    Returns:
        The path of the merged PDF, or None when no PDF was found.
    """
    merged_filename = os.path.join(save_path, file_name)
    merged_abspath = os.path.abspath(merged_filename)

    # Sort for a reproducible page order (os.listdir order is arbitrary) and
    # skip an earlier merge result so a re-run never merges the output into itself.
    pdfs = sorted(
        name for name in os.listdir(file_path)
        if name.endswith(".pdf")
        and os.path.abspath(os.path.join(file_path, name)) != merged_abspath
    )

    if not pdfs:
        print("Keine PDF-Dateien im Verzeichnis gefunden.")
        return None

    merger = PdfMerger()
    try:
        for pdf in pdfs:
            with open(os.path.join(file_path, pdf), 'rb') as file:
                merger.append(file)

        with open(merged_filename, 'wb') as merged_file:
            merger.write(merged_file)
    finally:
        # Release the merger's resources even if a source PDF cannot be read.
        merger.close()

    print("PDFs erfolgreich zusammengeführt und gespeichert unter:", merged_filename)
    return merged_filename

# Merge the PDFs lying directly in Docs/ and store the result as merged.pdf
# inside the docs_for_llm_pipline folder.
merge_pdf("/Users/riccardo/Desktop/Repositorys_Github/LLM/Docs/", "merged.pdf", 
          "/Users/riccardo/Desktop/Repositorys_Github/LLM/Docs/docs_for_llm_pipline")



In [None]:
# Importe
import requests
from bs4 import BeautifulSoup
from datetime import date
import PyPDF2

# WebBaseLoader class
class WebBaseLoader:
    """Scrape a web page for PDF links, download a PDF and check its content.

    Args:
        url: Address of the page that is scanned for PDF links.
        timeout: Seconds to wait for the server on each request (default 10).
    """

    def __init__(self, url, timeout=10):
        self.url = url
        self.timeout = timeout

    def load(self):
        """Fetch ``self.url``.

        Returns:
            The response body as text, or None when the request fails or the
            status code is not 200.
        """
        try:
            # Without a timeout requests waits indefinitely on an unresponsive server.
            response = requests.get(self.url, timeout=self.timeout)
        except requests.RequestException:
            return None
        if response.status_code == 200:
            return response.text
        return None

    def get_last_paper_link(self):
        """Return the href of the first PDF link found on the page.

        The href is returned exactly as written in the page (it may be
        relative), so callers have to resolve it against the site URL.

        Returns:
            The href string, or None when the page could not be loaded or
            contains no link ending in ``.pdf``.
        """
        html_content = self.load()
        if not html_content:
            return None

        soup = BeautifulSoup(html_content, 'html.parser')
        # Keep only the anchors whose href points to a PDF file.
        pdf_links = [
            anchor['href']
            for anchor in soup.find_all('a', href=True)
            if anchor['href'].endswith('.pdf')
        ]
        # The first match in document order is the one that is returned.
        return pdf_links[0] if pdf_links else None

    def download_pdf(self, pdf_link, save_path):
        """Download ``pdf_link`` to ``save_path`` and report whether the PDF
        mentions both "Abstract" and "Conclusion".

        Status messages are printed; nothing is returned.
        """
        try:
            response = requests.get(pdf_link, timeout=self.timeout)
        except requests.RequestException:
            print("Fehler beim Herunterladen der PDF-Datei.")
            return
        if response.status_code != 200:
            print("Fehler beim Herunterladen der PDF-Datei.")
            return

        with open(save_path, 'wb') as file:
            file.write(response.content)
        print("PDF erfolgreich heruntergeladen und gespeichert unter:", save_path)

        # Check that the downloaded PDF contains both "Abstract" and "Conclusion".
        # Each keyword is looked up on its own: it counts as present when at
        # least one page contains it, the two do not have to share a page.
        required_keywords = ["Abstract", "Conclusion"]
        if all(self.extract_pages_with_keywords(save_path, [keyword]) for keyword in required_keywords):
            print("Das PDF enthält sowohl Abstract als auch Conclusion.")
        else:
            print("Das PDF enthält nicht sowohl Abstract als auch Conclusion.")

    def extract_pages_with_keywords(self, pdf_file, keywords):
        """Return the 1-based numbers of the pages that contain every keyword.

        Args:
            pdf_file: Path of the PDF file to scan.
            keywords: Strings that must all occur on a page (case-insensitive).

        Returns:
            List of 1-based page numbers, in page order.
        """
        extracted_pages = []
        with open(pdf_file, 'rb') as file:
            # PdfReader / .pages / .extract_text() replace the PdfFileReader /
            # numPages / getPage / extractText names that PyPDF2 3.x removed.
            reader = PyPDF2.PdfReader(file)
            for page_num, page in enumerate(reader.pages, start=1):
                text = (page.extract_text() or "").lower()
                if all(keyword.lower() in text for keyword in keywords):
                    extracted_pages.append(page_num)
        return extracted_pages

# Example usage: fetch the PDF link reported by the loader and store the file
# under a date-stamped name.
from urllib.parse import urljoin

loader = WebBaseLoader("https://www.jmlr.org/")
last_paper_link = loader.get_last_paper_link()  # href of the first PDF link, or None
today = date.today()

save_path = f"/Users/riccardo/Desktop/Repositorys_Github/LLM/Docs/last_paper_{today}.pdf"

if last_paper_link:
    # Resolve the (possibly relative) href once against the site URL instead of
    # concatenating strings in two places; this also copes with absolute hrefs.
    paper_url = urljoin(loader.url, last_paper_link)
    print("Letztes Paper-Link:", paper_url)
    loader.download_pdf(paper_url, save_path)  # Download PDF
else:
    print("Fehler beim Laden der Webseite oder keine PDF-Papiere gefunden.")


In [None]:
import requests
from bs4 import BeautifulSoup
from datetime import date
import PyPDF2
from io import BytesIO
import time

class WebBaseLoader:
    """Scrape a web page for PDF links and save the relevant pages of a PDF.

    Args:
        url: Address of the page that is scanned for PDF links.
        timeout: Seconds to wait for the server on each request (default 10).
    """

    def __init__(self, url, timeout=10):
        self.url = url
        self.timeout = timeout

    def load(self):
        """Fetch ``self.url``.

        Returns:
            The response body as text, or None when the request fails or the
            status code is not 200.
        """
        try:
            # Without a timeout requests waits indefinitely on an unresponsive server.
            response = requests.get(self.url, timeout=self.timeout)
        except requests.RequestException:
            return None
        if response.status_code == 200:
            return response.text
        return None

    def get_last_paper_link(self):
        """Return the href of the first PDF link found on the page.

        The href is returned exactly as written in the page (it may be
        relative), so callers have to resolve it against the site URL.

        Returns:
            The href string, or None when the page could not be loaded or
            contains no link ending in ``.pdf``.
        """
        html_content = self.load()
        if not html_content:
            return None

        soup = BeautifulSoup(html_content, 'html.parser')
        pdf_links = [
            anchor['href']
            for anchor in soup.find_all('a', href=True)
            if anchor['href'].endswith('.pdf')
        ]
        return pdf_links[0] if pdf_links else None

    def download_pdf(self, pdf_link, save_path):
        """Download ``pdf_link`` and save only the pages that mention
        "Abstract" or "Conclusion" to ``save_path``.

        Nothing is written when no page matches. Status messages are printed;
        nothing is returned.
        """
        try:
            response = requests.get(pdf_link, timeout=self.timeout)
        except requests.RequestException:
            print("Fehler beim Herunterladen der PDF-Datei.")
            return
        if response.status_code != 200:
            print("Fehler beim Herunterladen der PDF-Datei.")
            return

        # A page is kept when it contains "Abstract" or "Conclusion"
        # (either keyword is enough).
        relevant_pages = self.extract_pages_with_keywords(response.content, ["Abstract", "Conclusion"])
        if not relevant_pages:
            print("Das PDF enthält weder Abstract noch Conclusion.")
            return

        # Assemble the output in memory first so that a parse error does not
        # leave an empty file behind.
        reader = PyPDF2.PdfReader(BytesIO(response.content))
        writer = PyPDF2.PdfWriter()
        for page_num in relevant_pages:
            writer.add_page(reader.pages[page_num])
        with open(save_path, 'wb') as file:
            writer.write(file)
        print("PDF erfolgreich heruntergeladen und gespeichert unter:", save_path)

    def extract_pages_with_keywords(self, pdf_content, keywords):
        """Return the 0-based indices of the pages that contain any keyword.

        Args:
            pdf_content: Raw bytes of the PDF.
            keywords: Strings of which at least one must occur on a page
                (case-insensitive).

        Returns:
            List of 0-based page indices, in page order.
        """
        extracted_pages = []
        # PdfReader / .pages / .extract_text() replace the PdfFileReader /
        # numPages / getPage / extractText names that PyPDF2 3.x removed.
        reader = PyPDF2.PdfReader(BytesIO(pdf_content))
        for page_num, page in enumerate(reader.pages):
            text = (page.extract_text() or "").lower()
            if any(keyword.lower() in text for keyword in keywords):
                extracted_pages.append(page_num)
        return extracted_pages

# Example usage: fetch the PDF link reported by the loader and store the
# filtered PDF under a name stamped with the current date and time.
from urllib.parse import urljoin

loader = WebBaseLoader("https://www.jmlr.org/")
last_paper_link = loader.get_last_paper_link()
today = date.today()
current_time = time.strftime("%H_%M_%S")
# Interpolate both values separately: "{today,current_time}" formatted a tuple
# and put its repr (parentheses, quotes, "datetime.date(...)") into the file name.
save_path = f"last_paper_{today}_{current_time}.pdf"

if last_paper_link:
    # Resolve the (possibly relative) href once against the site URL instead of
    # concatenating strings in two places; this also copes with absolute hrefs.
    paper_url = urljoin(loader.url, last_paper_link)
    print("Letztes Paper-Link:", paper_url)
    loader.download_pdf(paper_url, save_path)
else:
    print("Fehler beim Laden der Webseite oder keine PDF-Papiere gefunden.")


# Get only the text of the article (paper)

In [10]:
# Fetch the JMLR home page. Fail loudly on an HTTP error instead of leaving
# `text` undefined (or holding a stale value from an earlier run), and use a
# timeout so an unresponsive server cannot block the notebook forever.
response = requests.get('https://www.jmlr.org/', timeout=10)
response.raise_for_status()
text = response.text

# Bare expression so the notebook displays the raw HTML.
text



# html_content = self.load()
#         if html_content:
#             soup = BeautifulSoup(html_content, 'html.parser')
#             paper_links = soup.find_all('a', href=True)
#             pdf_links = [link['href'] for link in paper_links if link['href'].endswith('.pdf')]
#             if pdf_links:
#                 return pdf_links[0]

'<html>\n<head>\n    <!-- Global site tag (gtag.js) - Google Analytics -->\n    <script async src="https://www.googletagmanager.com/gtag/js?id=UA-131826476-1"></script>\n    <script>\n    window.dataLayer = window.dataLayer || [];\n    function gtag(){dataLayer.push(arguments);}\n    gtag(\'js\', new Date());\n\n    gtag(\'config\', \'UA-131826476-1\');\n    </script>\n\n    <meta http-equiv="Content-type" content="text/html;charset=UTF-8">\n\n  <!-- favicon -->\n  <link rel="icon" href="/img/favicon.ico">\n  <link rel="icon" type="image/png" href="/img/favicon-16x16.png">\n  <link rel="icon" type="image/png" href="/img/favicon-32x32.png">\n\n<title>Journal of Machine Learning Research</title>\n\n\n<link rel="alternate" type="application/rss+xml" href="/jmlr.xml" title="JMLR RSS">\n<link rel="stylesheet" type="text/css" href="/style.css">\n<style type="text/css">\n. {font-family:verdana,helvetica,sans-serif}\na {text-decoration:none;color:#3030a0}\n\n#fixed {\n    position: absolute;\n