In [5]:
import os
import sys
sys.path.append('../..')

import panel as pn  # GUI
pn.extension()

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file
HUGGINGFACEHUB_API_TOKEN = os.environ['HUGGINGFACEHUB_API_TOKEN']
import pandas as pd
#langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_community.llms import HuggingFaceHub
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser
from langchain.schema.runnable import Runnable
from langchain.schema.runnable.config import RunnableConfig
from pydantic import BaseModel
from langchain.chains import (
    LLMChain, ConversationalRetrievalChain
)
from langchain.chains import ConversationalRetrievalChain

import shutil
from langchain.vectorstores import Chroma

from langchain.memory import ConversationBufferMemory
from langchain.chains import StuffDocumentsChain, LLMChain

from langchain.prompts.prompt import PromptTemplate
from langchain.prompts.chat import ChatPromptTemplate, SystemMessagePromptTemplate

from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate

from langchain.document_loaders import PyPDFDirectoryLoader

In [6]:
    loader = PyPDFDirectoryLoader(r"C:\Users\alvar\Desktop\UCM-TFM-G1\data\LLM\pdfs")
    data=loader.load()
    # split documents
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    docs = text_splitter.split_documents(data)
    # define embedding
    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-l6-v2')
    # create vector database from data
    persist_directory = 'docs/chroma/'

    # Remove old database files if any
    shutil.rmtree(persist_directory, ignore_errors=True)
    vectordb = Chroma.from_documents(
        documents=docs,
        embedding=embeddings,
        persist_directory=persist_directory
    )
    # define retriever
    retriever = vectordb.as_retriever(search_type="mmr")
    context = retriever
    template = """Your name is AngryGreta and you are a recycling chatbot created to help people. Use the following pieces of context to answer the question at the end. Answer in the same language of the question. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
    CONTEXT: {context}
    CHAT HISTORY:
    {chat_history}
    Question: {question}
    Helpful Answer:"""
    
    # Create the chat prompt templates
    messages = [
        SystemMessagePromptTemplate.from_template(template),
        HumanMessagePromptTemplate.from_template("{question}")
    ]

    qa_prompt = ChatPromptTemplate.from_messages(messages)
    
    llm = HuggingFaceHub(
    repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
    task="text-generation",
    model_kwargs={
        "max_new_tokens": 512,
        "top_k": 30,
        "temperature": 0.1,
        "repetition_penalty": 1.03,
    },
    )
    llm_chain = LLMChain(llm=llm, prompt=qa_prompt)
    
    memory = ConversationBufferMemory(llm=llm, memory_key="chat_history", output_key='answer', return_messages=True)
    
    qa_chain = ConversationalRetrievalChain.from_llm(
        llm = llm,
        memory = memory,
        retriever = retriever,
        verbose = True,
        combine_docs_chain_kwargs={'prompt': qa_prompt},
        get_chat_history = lambda h : h
    )

In [7]:
retriever

VectorStoreRetriever(tags=['Chroma', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x0000025E965C3650>, search_type='mmr')

In [14]:
history = []

while True:
    user_query = input("You: ")
    if user_query.lower() == "exit":
        print("Chatbot: Thanks!")
        break
    
    response = qa_chain({"question": user_query, "chat_history": history})
    
    history.append((user_query, response))
    
    print("QUERY: ", user_query)
    print("ANSWER: ", response['answer'])
    print("HISTORY: ", history)

You: Dime tu nombre y explicame cómo reciclar una lata de aluminio


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Your name is AngryGreta and you are a recycling chatbot created to help people. Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
CONTEXT: sent to furnaces and smelters, with the amount adjusted to take the source of the waste into 
account.  
It is also important to note that, for the aluminium and steel packaging waste  recycling  
targets, the weight of material counted as recycle d is not ‘pure’ metal, but an aluminium or 
steel product that may contain alloying elements account ing for a few percent of the total 
mass of the metal. These alloying elements are an inte


[1m> Finished chain.[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Your name is AngryGreta and you are a recycling chatbot created to help people. Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
CONTEXT: annual reporting MWRO que stionnaire, detailed in Section 4 below .  
 
5 https://eur -lex.europa.eu/legal -content/EN/TXT/?uri=CELEX%3A32011D0753  
6 https://ec.europa.eu/environment/archives/waste/reporting/pdf/C_2012_2384.pdf

Guidance for the compilation and reporting of data on municipal  waste  _________________________    5 1 Introduction  
The purpose of this document is to provide guidance to Member States on the reporting of 
municipal  waste data , pursuant to the

KeyboardInterrupt: Interrupted by user

In [9]:
from huggingface_hub import HfApi
api = HfApi()

api.upload_folder(
    folder_path=r"C:\Users\alvar\Desktop\UCM-TFM-G1\data\LLM\pdfs",
    repo_id="ALVHB95/TFM_DataScience_APP",
    repo_type="space",
)

Acceptable-Ways-to-separate-and-dispose-of-garbage-and-recyclables.pdf:   0%|          | 0.00/5.41M [00:00<?, …

mygov-999999999489028046.pdf:   0%|          | 0.00/1.43M [00:00<?, ?B/s]

Advice-on-recycling-and-resource-recovery-FINAL-REPORT.pdf:   0%|          | 0.00/9.13M [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

Guidance on municipal waste data collection.pdf:   0%|          | 0.00/2.50M [00:00<?, ?B/s]

'https://huggingface.co/spaces/ALVHB95/TFM_DataScience_APP/tree/main/'