In [9]:
from langchain_community.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredHTMLLoader 
from langchain_openai import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.llms import HuggingFaceHub
from langchain.chains import LLMChain, ConversationChain
from langchain.prompts import PromptTemplate
from langchain.memory import ChatMessageHistory, ConversationBufferMemory,ConversationSummaryMemory

from langchain_community.llms import HuggingFaceEndpoint





In [10]:
import json

# Öffne die JSON-Datei und lade den Inhalt
with open('/Users/riccardo/Desktop/Repositorys_Github/LLM/Docs/api_token.json', 'r') as api_file:
    api_token_file = json.load(api_file)

# Extrahiere die Variable aus den Daten
api_token = api_token_file['Hugging_face_token']

In [11]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=50, 
    length_function = len)
text_splitter

<langchain_text_splitters.character.RecursiveCharacterTextSplitter at 0x128609510>

In [12]:
filepath = '/Users/riccardo/Desktop/Repositorys_Github/LLM/Docs/docs_for_llm_pipline/merged.pdf'
loader = PyPDFLoader(filepath)
chunks = loader.load_and_split(text_splitter=text_splitter)

In [13]:
for chunk in chunks:
    print("Page content: \n", chunk.page_content),
    print("Page_metadata: \n", chunk.metadata),
    print("----------------------------")

Page content: 
 Hochschule Osnabrück  Fakultät Wirtschaft- und Sozialwissenschaften Angewandte Volkswirtschaftslehre   Praktikumsbericht  Praxissemester bei der mso digital GmbH & Co. KG in Osnabrück in der Abteilung Data & Process Analytics        Semester:   Sommersemester 2023 Betreuer:    Prof. Dr. Peter Seppelfricke Verfasser:    Riccardo D’Andrea Matr. Nr.:    966697 Datum der Abgabe:         03.08.2023
Page_metadata: 
 {'source': '/Users/riccardo/Desktop/Repositorys_Github/LLM/Docs/docs_for_llm_pipline/merged.pdf', 'page': 0}
----------------------------
Page content: 
 I. Inhaltsverzeichnis II. Abkürzungsverzeichnis 1. Einleitung .................................................................................................................... 4 2. Vorstellung der mso digital GmbH & Co. KG ................................................................ 4 2.1 Das Unternehmen .......................................................................................................

In [14]:
embedding_function = SentenceTransformerEmbeddings(model_name="BAAI/bge-large-zh-v1.5")

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
embedding = embedding_function.embed_documents("This is a test sentence.")

print(embedding[0])
print("Dimension of Embedding: ", len(embedding[0]))

[0.030692150816321373, -0.031502220779657364, -0.017730996012687683, 0.035333432257175446, -0.028013229370117188, 0.04366171732544899, -0.017326941713690758, -0.02879144996404648, -0.018377207219600677, -0.03722976893186569, -0.006641303189098835, 0.03447132557630539, -0.022506963461637497, -0.0067087761126458645, -0.00989031232893467, -0.023188108578324318, -0.003284014528617263, 0.024761414155364037, 0.014999981969594955, -0.01783912256360054, -0.023429516702890396, 0.008491289801895618, -0.041361816227436066, 0.001655183150433004, 0.026237130165100098, -0.032837141305208206, -0.01661311462521553, 0.03826865926384926, -0.026053553447127342, -0.025660574436187744, 0.00018425437156111002, 0.057974353432655334, 0.012534989975392818, -0.026392772793769836, -0.021217787638306618, -0.004844049923121929, 0.016133053228259087, -0.007580542471259832, 0.028098614886403084, 0.0007906158571131527, 0.02106611058115959, -0.00990001205354929, 0.029389623552560806, -0.0029493358451873064, 0.00995842

In [16]:
db = Chroma.from_documents(chunks, embedding_function)

In [17]:
print("Chunks in DB:", db._collection.count())

Chunks in DB: 88


In [18]:
query = "Write a summary of the first page of the document."
retriever = db.as_retriever()
retriever.get_relevant_documents(query)

[Document(page_content='recurrent layers, by a factor of k. Separable convolutions [ 6], however, decrease the complexity\nconsiderably, to O(k·n·d+n·d2). Even with k=n, however, the complexity of a separable\nconvolution is equal to the combination of a self-attention layer and a point-wise feed-forward layer,\nthe approach we take in our model.\nAs side benefit, self-attention could yield more interpretable models. We inspect attention distributions\nfrom our models and present and discuss examples in the appendix. Not only do individual attention\nheads clearly learn to perform different tasks, many appear to exhibit behavior related to the syntactic\nand semantic structure of the sentences.\n5 Training\nThis section describes the training regime for our models.\n5.1 Training Data and Batching\nWe trained on the standard WMT 2014 English-German dataset consisting of about 4.5 million\nsentence pairs. Sentences were encoded using byte-pair encoding [ 3], which has a shared source-', 

In [19]:
llm = HuggingFaceEndpoint(repo_id='mistralai/Mistral-7B-Instruct-v0.2', 
                     huggingfacehub_api_token=api_token,  
                     model_kwargs={"max_length": 300})


Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /Users/riccardo/.cache/huggingface/token
Login successful


In [20]:
qa_with_sources = RetrievalQAWithSourcesChain.from_chain_type(
    llm = llm,
    chain_type = "stuff",  
    retriever = retriever
    )

qa_with_sources



In [23]:
query = "Which name does the university has?"
qa_with_sources.invoke(query)

{'question': 'Which name does the university has?',
 'answer': ' The university is not named in the provided text.\n',
 'sources': ''}

## Data pipeline erstellen für die Dokumente 

In [7]:
import os

file_path = "/Users/riccardo/Desktop/Repositorys_Github/LLM/Docs/docs_for_llm_pipline"
def check_for_file_pipeline(file_path):
    if not os.path.exists(file_path):
        directory = "/Users/riccardo/Desktop/Repositorys_Github/LLM/Docs"
        file = "docs_for_llm_pipline"

        path = os.path.join(directory, file)
        os.mkdir(path)
        print("Directory '%s' created" % directory)
    else:
        print("File '%s' already exists" % file_path)
check_for_file_pipeline(file_path)



File '/Users/riccardo/Desktop/Repositorys_Github/LLM/Docs/docs_for_llm_pipline' already exists


In [8]:
from PyPDF2 import PdfMerger

file_path = "/Users/riccardo/Desktop/Repositorys_Github/LLM/Docs/docs_for_llm_pipline"
file_name = "docs_for_llm_pipline"

def merge_pdf(file_path, file_name, save_path):
    format = [".pdf"]
    pdfs = [f for f in os.listdir(file_path) if f.endswith(tuple(format))]

    if not pdfs:
        print("Keine PDF-Dateien im Verzeichnis gefunden.")
        return None
    
    merger = PdfMerger()
    for pdf in pdfs:
        with open(os.path.join(file_path, pdf), 'rb') as file:
            merger.append(file)

    merged_filename = os.path.join(save_path, file_name)
    with open(merged_filename, 'wb') as merged_file:
        merger.write(merged_file)

    print("PDFs erfolgreich zusammengeführt und gespeichert unter:", merged_filename)
    return merged_filename

merge_pdf("/Users/riccardo/Desktop/Repositorys_Github/LLM/Docs/", "merged.pdf", 
          "/Users/riccardo/Desktop/Repositorys_Github/LLM/Docs/docs_for_llm_pipline")



PDFs erfolgreich zusammengeführt und gespeichert unter: /Users/riccardo/Desktop/Repositorys_Github/LLM/Docs/docs_for_llm_pipline/merged.pdf


'/Users/riccardo/Desktop/Repositorys_Github/LLM/Docs/docs_for_llm_pipline/merged.pdf'