In [1]:
%pip install --upgrade pip
%pip install langchain
%pip install llama-cpp-python==

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


Hier wird eine Datenbank erzeugt und gespeichert.

In [1]:
from llama_index import SimpleDirectoryReader
from llama_index.node_parser import SentenceSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from llama_index.schema import TextNode
from langchain_community.vectorstores.chroma import Chroma
from langchain_community.embeddings import LlamaCppEmbeddings
from llama_index.llms import LlamaCPP

data_path = "/home/tpllmws23/llms/main_data"
database_path = "/home/tpllmws23/Chatbot-LLama-Pruefungsamt/Chatbot-Jan/chromadb_lanchain_rag_auto_embeddings/"
database_collection = "Pruefungsamt"
model_path = "/home/tpllmws23/llms/llama-2-13b-chat.Q4_K_M.gguf"

print("##########################")
print("### Load directory     ###")
print("##########################")
raw_documents = SimpleDirectoryReader(data_path).load_data()

print("##############################")
print("### Load llm and embedding ###")
print("##############################")

""" llm = LlamaCPP(model_path=model_path,
        context_window=4096,
        verbose=False,
        temperature=0.0,
)
 """
embedding = LlamaCppEmbeddings(model_path=model_path, n_ctx=1024)

print("#################################")
print("### Split docs into sentences ###")
print("#################################")

text_parser = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )

text_chunks = []
# maintain relationship with source doc index, to help inject doc metadata in (3)
doc_idxs = []
for doc_idx, page in enumerate(raw_documents):
    page_text = page.get_text()
    cur_text_chunks = text_parser.split_text(page_text)
    text_chunks.extend(cur_text_chunks)
    doc_idxs.extend([doc_idx] * len(cur_text_chunks))


nodes = []
for idx, text_chunk in enumerate(text_chunks):
    node = TextNode(
        text=text_chunk,
    )
    src_doc_idx = doc_idxs[idx]
    src_page = raw_documents[src_doc_idx]
    nodes.append(node)

for i, node in enumerate(nodes):
    try:
        node_embedding = embedding.embed_query(node.text)
        node.embedding = node_embedding
    except Exception as e:
        print(f"Failed to embed node {i}: {e}")

for idx, node in enumerate(nodes):
    node.metadata = raw_documents[doc_idxs[idx]].metadata


embeddings = []
documents = []
metadatas = []
ids = []

for node in nodes:
    embeddings.append(node.embedding)
    documents.append(node.text)
    metadatas.append(node.metadata)
    ids.append(node.id_)


print("##########################")
print("### Store in database  ###")
print("##########################")

import chromadb

client = chromadb.PersistentClient(path=database_path)

collection = client.get_or_create_collection(database_collection)
collection.add(ids=ids, embeddings=embeddings, documents=documents, metadatas=metadatas)

##########################
### Load directory     ###
##########################
##############################
### Load llm and embedding ###
##############################


ggml_init_cublas: GGML_CUDA_FORCE_MMQ:   no
ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes
ggml_init_cublas: found 1 CUDA devices:
  Device 0: NVIDIA TITAN RTX, compute capability 7.5, VMM: yes
llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from /home/tpllmws23/llms/llama-2-13b-chat.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
llama_model_loader: - kv   4:                          llama.block_count u32              = 40
llama_model_loader: - kv   5:                  llama.feed_forward_length

#################################
### Split docs into sentences ###
#################################



llama_print_timings:        load time =     515.20 ms
llama_print_timings:      sample time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =    6106.28 ms /    92 tokens (   66.37 ms per token,    15.07 tokens per second)
llama_print_timings:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =    6126.50 ms

llama_print_timings:        load time =     515.20 ms
llama_print_timings:      sample time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =    4129.58 ms /    63 tokens (   65.55 ms per token,    15.26 tokens per second)
llama_print_timings:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =    4142.00 ms

llama_print_timings:        load time =     515.20 ms
l

##########################
### Store in database  ###
##########################



llama_print_timings:        load time =     515.20 ms
llama_print_timings:      sample time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =    8618.69 ms /   131 tokens (   65.79 ms per token,    15.20 tokens per second)
llama_print_timings:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =    8645.13 ms


In [1]:
from langchain.llms.llamacpp import LlamaCpp
from langchain_community.vectorstores.chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

database_path = "/home/tpllmws23/Chatbot-LLama-Pruefungsamt/Chatbot-Jan/chromadb_lanchain_rag_auto_embeddings/"
database_collection = "Pruefungsamt"
model_path = "/home/tpllmws23/llms/llama-2-13b-chat.Q4_K_M.gguf"

n_ctx = 4096

callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

llm = LlamaCpp(model_path=model_path,
        n_gpu_layers=-1,
        n_batch=512,
        n_ctx=n_ctx,
        f16_kv=True,
        verbose=False,
        temperature=0.0,
        top_p=1,
        callback_manager=callback_manager,
        return_full_text=False
)

from langchain_community.embeddings import LlamaCppEmbeddings
embedding = LlamaCppEmbeddings(model_path=model_path,
        n_gpu_layers=-1,
        n_batch=512,
        n_ctx=n_ctx,
        f16_kv=True
        )

db = Chroma(
    persist_directory=database_path, 
    collection_name=database_collection, 
    embedding_function=embedding
)
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 10})


                return_full_text was transferred to model_kwargs.
                Please confirm that return_full_text is what you intended.
ggml_init_cublas: GGML_CUDA_FORCE_MMQ:   no
ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes
ggml_init_cublas: found 1 CUDA devices:
  Device 0: NVIDIA TITAN RTX, compute capability 7.5, VMM: yes
llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from /home/tpllmws23/llms/llama-2-13b-chat.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
llama_model_loader: - kv  

In [7]:
db.similarity_search("What are the requirements for studying MSI in masters degree at the HTWG?", k=8)


llama_print_timings:        load time =      85.62 ms
llama_print_timings:      sample time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =     373.31 ms /    19 tokens (   19.65 ms per token,    50.90 tokens per second)
llama_print_timings:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =     375.62 ms


[Document(page_content='•Cloud Application  Development (WS)\n•Concurrency  (SS) \n•Data Analysis (SS) \n•IT-Consulting (WS)\n•IT-Leadership: Entrepreneurship (SS) + IoT \n(SS)\n•Konzepte aktueller Datenbanksysteme (SS)\n•Mobile Kommunikation und Kollaboration (WS)\n•Quantum Computing (SS)\n•Reactive  Systems (WS)', metadata={'creation_date': '2023-11-20', 'file_name': 'Infoveranstaltung_Masterstudiengaenge-Informatik_HTWG-Konstanz.pdf', 'file_path': '/home/tpllmws23/llms/main_data/Infoveranstaltung_Masterstudiengaenge-Informatik_HTWG-Konstanz.pdf', 'file_size': 212749, 'file_type': 'application/pdf', 'last_accessed_date': '2024-02-14', 'last_modified_date': '2023-11-20', 'page_label': '12'}),
 Document(page_content='▪Prüfungsausschussvorsitz \nProf. Dr. Alexander Hoffmann ( alexander.hoffmann@htwg -konstanz.de )\n▪Auslandssemester                                                                                     \nProf. Dr. Marco Mevius  (mmevius@htwg -konstanz.de )', metadata={'crea

In [4]:
from operator import itemgetter
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
from langchain.chains import LLMChain
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain.chains import create_retrieval_chain
from langchain.globals import set_debug
from langchain.chains.combine_documents import create_stuff_documents_chain


retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 10})
set_debug(True)

template = """Answer the following question based only on the provided context:
{context}

Question: {input}"""

prompt = ChatPromptTemplate.from_template(template)

document_chain = create_stuff_documents_chain(llm, prompt)
retrieval_chain = create_retrieval_chain(retriever, document_chain)

chain = (
    {"context": retriever, "input": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)




In [6]:
response = retrieval_chain.invoke({"input": "What are the requirements for studying MSI in masters degree at the HTWG?"})

[32;1m[1;3m[chain/start][0m [1m[1:chain:retrieval_chain] Entering Chain run with input:
[0m{
  "input": "What are the requirements for studying MSI in masters degree at the HTWG?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:retrieval_chain > 2:chain:RunnableAssign<context>] Entering Chain run with input:
[0m{
  "input": "What are the requirements for studying MSI in masters degree at the HTWG?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:retrieval_chain > 2:chain:RunnableAssign<context> > 3:chain:RunnableParallel<context>] Entering Chain run with input:
[0m{
  "input": "What are the requirements for studying MSI in masters degree at the HTWG?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:retrieval_chain > 2:chain:RunnableAssign<context> > 3:chain:RunnableParallel<context> > 4:chain:retrieve_documents] Entering Chain run with input:
[0m{
  "input": "What are the requirements for studying MSI in masters degree at the HTWG?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:retri


llama_print_timings:        load time =      85.62 ms
llama_print_timings:      sample time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =     355.08 ms /    19 tokens (   18.69 ms per token,    53.51 tokens per second)
llama_print_timings:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =     356.49 ms


[36;1m[1;3m[chain/end][0m [1m[1:chain:retrieval_chain > 2:chain:RunnableAssign<context> > 3:chain:RunnableParallel<context> > 4:chain:retrieve_documents] [375ms] Exiting Chain run with output:
[0m[outputs]
[36;1m[1;3m[chain/end][0m [1m[1:chain:retrieval_chain > 2:chain:RunnableAssign<context> > 3:chain:RunnableParallel<context>] [378ms] Exiting Chain run with output:
[0m[outputs]
[36;1m[1;3m[chain/end][0m [1m[1:chain:retrieval_chain > 2:chain:RunnableAssign<context>] [381ms] Exiting Chain run with output:
[0m{
  "input": "What are the requirements for studying MSI in masters degree at the HTWG?",
  "context": [
    {
      "lc": 1,
      "type": "constructor",
      "id": [
        "langchain",
        "schema",
        "document",
        "Document"
      ],
      "kwargs": {
        "page_content": "•Cloud Application  Development (WS)\n•Concurrency  (SS) \n•Data Analysis (SS) \n•IT-Consulting (WS)\n•IT-Leadership: Entrepreneurship (SS) + IoT \n(SS)\n•Konzepte aktuelle

In [5]:
for chunk in chain.invoke({"input": "What are the requirements for studying MSI in masters degree at the HTWG?"}):
    print(chunk, end="", flush=True)

[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": "What are the requirements for studying MSI in masters degree at the HTWG?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<context,input>] Entering Chain run with input:
[0m{
  "input": "What are the requirements for studying MSI in masters degree at the HTWG?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<context,input> > 4:chain:RunnablePassthrough] Entering Chain run with input:
[0m{
  "input": "What are the requirements for studying MSI in masters degree at the HTWG?"
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<context,input> > 4:chain:RunnablePassthrough] [1ms] Exiting Chain run with output:
[0m{
  "input": "What are the requirements for studying MSI in masters degree at the HTWG?"
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RunnableSequence


llama_print_timings:        load time =      85.62 ms
llama_print_timings:      sample time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =      79.61 ms /     2 tokens (   39.80 ms per token,    25.12 tokens per second)
llama_print_timings:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =      83.67 ms




Please provide the answer based on the given context.[36;1m[1;3m[llm/end][0m [1m[1:chain:RunnableSequence > 6:llm:LlamaCpp] [4.93s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "\n\nPlease provide the answer based on the given context.",
        "generation_info": null,
        "type": "Generation"
      }
    ]
  ],
  "llm_output": null,
  "run": null
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 7:parser:StrOutputParser] Entering Parser run with input:
[0m{
  "input": "\n\nPlease provide the answer based on the given context."
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RunnableSequence > 7:parser:StrOutputParser] [0ms] Exiting Parser run with output:
[0m{
  "output": "\n\nPlease provide the answer based on the given context."
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RunnableSequence] [5.04s] Exiting Chain run with output:
[0m{
  "output": "\n\nPlease provide the answer based on the given context."
}


Please provi

Hier wird die Datenbank in langchain eingebunden

In [1]:
model_path = "/home/tpllmws23/llms/llama-2-13b-chat.Q4_K_M.gguf"
database_path = "/home/tpllmws23/Chatbot-LLama-Pruefungsamt/Chatbot-Jan/chromadb"
collection_name = "Pruefungsamt"
n_ctx = 8192

from langchain_core.prompts import ChatPromptTemplate
from langchain.llms.llamacpp import LlamaCpp
from langchain_community.vectorstores.chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
import chromadb

llm = LlamaCpp(model_path=model_path,
        n_gpu_layers=-1,
        n_batch=512,
        n_ctx=n_ctx,
        f16_kv=True,
        verbose=False,
        temperature=0.0,
)

from langchain_community.embeddings import LlamaCppEmbeddings
embedding = LlamaCppEmbeddings(model_path=model_path,
        n_gpu_layers=-1,
        n_batch=512,
        n_ctx=n_ctx,
        f16_kv=True
        )

db = Chroma(
    persist_directory=database_path, 
    collection_name=collection_name, 
    embedding_function=embedding
)
retriever = db.as_retriever()

ggml_init_cublas: GGML_CUDA_FORCE_MMQ:   no
ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes
ggml_init_cublas: found 1 CUDA devices:
  Device 0: NVIDIA TITAN RTX, compute capability 7.5, VMM: yes
llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from /home/tpllmws23/llms/llama-2-13b-chat.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
llama_model_loader: - kv   4:                          llama.block_count u32              = 40
llama_model_loader: - kv   5:                  llama.feed_forward_length

Einfache Chain

In [5]:
from operator import itemgetter

template = """You are a chatbot and answer questions. Answer the question based only on the following context:
{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

chain = (
    {
        "context": itemgetter("question") | retriever,
        "question": itemgetter("question"),
    }
    | prompt
    
    | llm
    | StrOutputParser()
)

response = chain.invoke({"question": "What are the requirements for studying MSI in masters degree at the HTWG?"})
print(response)


llama_print_timings:        load time =     317.06 ms
llama_print_timings:      sample time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =     312.03 ms /    19 tokens (   16.42 ms per token,    60.89 tokens per second)
llama_print_timings:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =     314.53 ms






oder oder.']



oder oder oder oder, oder oder








the







6,4, 3, and.].



oder oder.']



oder oder oder oder, oder oder








the







6,4, 3, and.].


In [2]:
from operator import itemgetter

from langchain.memory import ConversationBufferMemory

from langchain.schema import format_document
from langchain_core.messages import AIMessage, HumanMessage, get_buffer_string
from langchain_core.runnables import RunnableParallel

from langchain.prompts.prompt import PromptTemplate

_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

template = """Answer the question based only on the following context:
{context}

Question: {question}
"""
ANSWER_PROMPT = ChatPromptTemplate.from_template(template)

DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")


def _combine_documents(
    docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, document_separator="\n\n"
):
    doc_strings = [format_document(doc, document_prompt) for doc in docs]
    return document_separator.join(doc_strings)

_inputs = RunnableParallel(
    standalone_question=RunnablePassthrough.assign(
        chat_history=lambda x: get_buffer_string(x["chat_history"])
    )
    | CONDENSE_QUESTION_PROMPT
    | llm
    | StrOutputParser(),
)
_context = {
    "context": itemgetter("standalone_question") | retriever | _combine_documents,
    "question": lambda x: x["standalone_question"],
}
conversational_qa_chain = _inputs | _context | ANSWER_PROMPT | llm

memory = ConversationBufferMemory(
    return_messages=True, output_key="answer", input_key="question"
)

# First we add a step to load memory
# This adds a "memory" key to the input object
loaded_memory = RunnablePassthrough.assign(
    chat_history=RunnableLambda(memory.load_memory_variables) | itemgetter("history"),
)
# Now we calculate the standalone question
standalone_question = {
    "standalone_question": {
        "question": lambda x: x["question"],
        "chat_history": lambda x: get_buffer_string(x["chat_history"]),
    }
    | CONDENSE_QUESTION_PROMPT
    | llm
    | StrOutputParser(),
}
# Now we retrieve the documents
retrieved_documents = {
    "docs": itemgetter("standalone_question") | retriever,
    "question": lambda x: x["standalone_question"],
}
# Now we construct the inputs for the final prompt
final_inputs = {
    "context": lambda x: _combine_documents(x["docs"]),
    "question": itemgetter("question"),
}
# And finally, we do the part that returns the answers
answer = {
    "answer": final_inputs | ANSWER_PROMPT | llm,
    "docs": itemgetter("docs"),
}
# And now we put it all together!
final_chain = loaded_memory | standalone_question | retrieved_documents | answer

inputs = {"question": "What are the requirements for studying MSI in masters degree at the HTWG?"}
result = final_chain.invoke(inputs)
print(result)

# For now you need to save it yourself
memory.save_context(inputs, {"answer": result["answer"].content})
memory.load_memory_variables({})

inputs = {"question": "Are there special requirements for 'Autonome Systeme'?"}
result = final_chain.invoke(inputs)
print(result)


llama_print_timings:        load time =     632.07 ms
llama_print_timings:      sample time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =     630.89 ms /    58 tokens (   10.88 ms per token,    91.93 tokens per second)
llama_print_timings:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =     633.23 ms


{'answer': '\nI have attached the module handbook and the course catalog for your reference. Please let me know if there are any specific requirements or prerequisites that I need to fulfill before applying for the program.\n\nThank you for your time and assistance.', 'docs': [Document(page_content='Hochschule Konstanz Modulhandbuch des Studiengangs\nFakultät InformatikPlaceholder Informatik, Master of Science\nModul ITM06 Strategic IT-Management 1 (Building the IT-Leadership System)\nModul-Koordination Start Modul-Kürzel/-Nr. ECTS-Punkte Arbeitsaufwand\nProf. Dr. C. Rentrop SS SIM1/ITM06 5 150 h\n Dauer SWS Kontaktzeit Selbststudium\n 1 Semester 3 45 h 105 h\nEinsatz des Moduls im\nStudiengangAngestrebter\nAbschlussModul-Typ\n(PM/WPM)Beginn im\nStudiensemesterSPO-Version /\nJahr\nMSI M.Sc. WPM A/B SPO 5 / 2020\nInhaltliche Teilnahme\nVoraussetzung\nVerwendbarkeit des Moduls\nim o.g. StudiengangAls Vorkenntnis erforderlich für Modul:\nSinnvoll zu kombinieren mit Modul:\nPüfungsleistung

AttributeError: 'str' object has no attribute 'content'