In [3]:
# Libraries

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.llms import CTransformers
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_community.llms import Ollama

from transformers import AutoTokenizer
from IPython.display import display, HTML
import json
import time
import pathlib
import os

In [4]:
# Reading txt files

# define what documents to load and where from
class UTF8TextLoader(TextLoader):
    """TextLoader subclass whose constructor defaults ``encoding`` to UTF-8.

    Useful as a ``loader_cls`` for loaders that construct it from a file path
    alone, so the files are read as UTF-8 without extra arguments.
    """

    def __init__(self, file_path: str, encoding: str = 'utf-8'):
        # Hand both values straight to the parent loader.
        super().__init__(file_path, encoding=encoding)
        
# Build a loader for the files matching "*.txt" under "./", reading each one
# through UTF8TextLoader.
loader = DirectoryLoader("./", glob="*.txt", loader_cls=UTF8TextLoader)

# Load the documents and split them into chunks for embedding.
documents = loader.load()
# NOTE(review): the splitter runs with the library defaults; confirm the
# resulting chunk sizes suit the embedding model's input limit.
splitter = CharacterTextSplitter()
texts = splitter.split_documents(documents)
# Sentence-embedding model, configured to run on the CPU.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': 'cpu'})

# Create the FAISS vector store from the chunks and save it under the relative
# path "faiss" (a later cell loads it back from the same path).
db = FAISS.from_documents(texts, embeddings)
db.save_local("faiss")



In [5]:

# Set the path to where you have the text files
# NOTE(review): this is a hard-coded absolute path, while the index-building
# cell loads "*.txt" from "./" — confirm both refer to the same set of files.
directory_path = r"Z:\Uni-CE901\22-24_CE901-CE911-CF981-SU_stoyles_thomas\Data"  # Change this to your directory

# Initialize the tokenizer used only for counting tokens in the files.
# The counts are GPT-2 tokens; the embedding model and the LLM use their own
# tokenizers, which may count the same text differently.
tokenizer = AutoTokenizer.from_pretrained("gpt2") # Change if you wish to test other tokenizers

# Counting tokens in files and directory
def count_tokens_in_file(file_path, tokenizer):
    """Return the number of tokens the tokenizer produces for a UTF-8 text file.

    Args:
        file_path: Path of the text file to read (decoded as UTF-8).
        tokenizer: Object exposing ``encode(text)`` that returns a sequence
            of tokens.

    Returns:
        The length of the encoded token sequence for the whole file.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return len(tokenizer.encode(contents))

def check_tokens_in_directory(directory_path, tokenizer, max_tokens=512):
    """Print the token count of every ``.txt`` file directly inside a directory.

    For each file a line ``File: <name>, Tokens: <count>`` is printed, followed
    by a warning line when the count is greater than ``max_tokens``.

    Args:
        directory_path: Directory to scan (not recursive).
        tokenizer: Tokenizer passed through to ``count_tokens_in_file``.
        max_tokens: Threshold above which a warning is printed.

    Returns:
        None. Results are reported via ``print``.
    """
    # os.listdir returns names in arbitrary order; sort case-insensitively so
    # the report is reproducible from run to run.
    for file_name in sorted(os.listdir(directory_path), key=str.casefold):
        if not file_name.endswith('.txt'):
            continue
        file_path = os.path.join(directory_path, file_name)
        # Skip anything that is not a regular file (e.g. a folder whose name
        # happens to end in ".txt"), which could not be opened for reading.
        if not os.path.isfile(file_path):
            continue
        num_tokens = count_tokens_in_file(file_path, tokenizer)
        print(f"File: {file_name}, Tokens: {num_tokens}")
        if num_tokens > max_tokens:
            print(f"WARNING: {file_name} exceeds the maximum token limit of {max_tokens}")

# Check token counts in the directory: prints one line per .txt file and a
# warning for every file whose count exceeds max_tokens.
check_tokens_in_directory(directory_path, tokenizer, max_tokens=512) # Adjust max_tokens to your max token count


Token indices sequence length is longer than the specified maximum sequence length for this model (1231 > 1024). Running this sequence through the model will result in indexing errors


File: Anxiety_Causes.txt, Tokens: 271
File: Anxiety_HelpingFriends.txt, Tokens: 950
File: Anxiety_PanicAttacks.txt, Tokens: 117
File: Anxiety_symptoms.txt, Tokens: 204
File: Anxiety_WhatToDo.txt, Tokens: 262
File: Anxiety_WhatToSay.txt, Tokens: 399
File: Depression.txt, Tokens: 1231
File: Depression_Causes.txt, Tokens: 774
File: Depression_HelpingFriends.txt, Tokens: 767
File: Depression_HowToTell&Living.txt, Tokens: 213
File: Depression_Symptoms.txt, Tokens: 271
File: Depression_Treatment&Doctor.txt, Tokens: 207
File: Depression_Treatments.txt, Tokens: 1231
File: Depression_Types.txt, Tokens: 519
File: Depression_WhatToSay.txt, Tokens: 512
File: Depression_WhenToSeeDoctor&Treatments.txt, Tokens: 207
File: Depression_WhenToSeeDoctor.txt, Tokens: 207
File: Stress_Causes.txt, Tokens: 373
File: Stress_HelpingFriends.txt, Tokens: 491
File: Stress_Symptoms.txt, Tokens: 427
File: Stress_WhatIsIt.txt, Tokens: 128
File: Stress_WhatToDo.txt, Tokens: 770


In [6]:
# Prompt template used when asking the model: the "stuff" chain fills
# {context} with the retrieved text and {question} with the user's question.
template = """Context: {context} Question: {question} """

# Generation settings for the language model.
config = {'max_new_tokens': 512, 'temperature': 0.01}

# Load the language model and apply the settings above. Previously `config`
# was defined but never passed to the model, so it had no effect.
llm = Ollama(
    model="llama3",  # Use Ollama website for more models https://ollama.com/library
    num_predict=config['max_new_tokens'],
    temperature=config['temperature'])

# load the interpreted information from the local database
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': 'cpu'})
# SECURITY: allow_dangerous_deserialization=True lets FAISS.load_local unpickle
# the saved index, which can execute arbitrary code. Only load a "faiss" folder
# that this notebook created itself. (The flag belongs to load_local, not to
# the LLM object, so nothing is set on `llm`.)
db = FAISS.load_local("faiss", embeddings, allow_dangerous_deserialization=True)

# prepare a version of the llm pre-loaded with the local content
retriever = db.as_retriever(search_kwargs={'k': 5}) # Max number of documents to read (k)
prompt = PromptTemplate(
    template=template,
    input_variables=['context', 'question'])

# Retrieval QA chain that also returns the documents it used as context.
QA_LLM = RetrievalQA.from_chain_type(llm=llm,
                                     chain_type='stuff',
                                     retriever=retriever,
                                     return_source_documents=True,
                                     chain_type_kwargs={'prompt': prompt})



In [7]:
def truncate_tokens(text, max_tokens, tokenizer):
    """Return ``text`` cut down to at most ``max_tokens`` tokens.

    Args:
        text: Text to shorten.
        max_tokens: Maximum number of tokens to keep.
        tokenizer: Object exposing ``encode(text, truncation=..., max_length=...)``
            and ``decode(tokens)``.

    Returns:
        The decoded text of the first ``max_tokens`` tokens.
    """
    token_ids = tokenizer.encode(text, truncation=True, max_length=max_tokens)
    # Slice as well, so the limit holds regardless of what encode() returned.
    kept_ids = token_ids[:max_tokens]
    return tokenizer.decode(kept_ids)

In [8]:
# Define query

def query(model, question):
    """Ask the retrieval QA chain a question and render the result in the notebook.

    Displays the model name with the response time, the question, the answer
    and the text of the first source document returned by the chain.

    Args:
        model: Chain object that is callable with ``{'query': question}`` and
            returns a mapping with ``"result"`` and ``"source_documents"``; it
            must expose ``combine_documents_chain.llm_chain.llm.model``.
        question: The question text to send to the chain.

    Returns:
        None. Everything is shown via ``display``.
    """
    # Local import keeps this cell self-contained.
    from html import escape

    model_path = model.combine_documents_chain.llm_chain.llm.model
    # Keep only the last path component so file-based model paths print briefly.
    model_name = pathlib.Path(model_path).name
    time_start = time.time()
    output = model({'query': question})
    response = output["result"]
    time_elapsed = time.time() - time_start

    # Extract the first source document; the retriever may return none.
    sources = output['source_documents']
    if sources:
        source_document = sources[0].page_content
    else:
        source_document = '(no source document returned)'

    # Display information. All dynamic text is HTML-escaped so characters such
    # as "<" or "&" in the question, answer or source are shown literally
    # instead of being interpreted as markup.
    display(HTML(f'<code>{escape(str(model_name))} response time: {time_elapsed:.02f} sec</code>'))
    display(HTML(f'<strong>Question:</strong> {escape(str(question))}'))
    display(HTML(f'<strong>Answer:</strong> {escape(str(response))}'))
    display(HTML(f'<strong>Source Document:</strong> <pre>{escape(str(source_document))}</pre>'))

In [9]:
# Define a function to retrieve and print documents from FAISS
def retrieve_and_print(query, db, retriever):
    """Run a similarity search against the vector store and print each hit.

    Args:
        query: Text to search for.
        db: Vector store exposing ``similarity_search(query)``.
        retriever: Accepted for interface compatibility; not used by this
            function.

    Returns:
        The documents returned by ``db.similarity_search(query)``.
    """
    matches = db.similarity_search(query)
    # Number the hits from 1 in the printed output.
    for position, match in enumerate(matches, start=1):
        print(f"Document {position}:\n{match.page_content}\n")
    return matches

# Query the FAISS database and print the retrieved documents. Put the question
# to ask in the string below.
# NOTE(review): this assignment rebinds the name `query`, which an earlier cell
# defines as a function; after this cell runs, `query(...)` is no longer
# callable. Consider renaming one of the two.
query = "What symptoms do Stress and Anxiety share?"
# Pass the retriever (not the tokenizer) for the `retriever` parameter.
retrieved_docs = retrieve_and_print(query, db, retriever)

Document 1:
Symptoms of anxiety
Anxiety can cause many different symptoms. It might affect how you feel physically, mentally and how you behave.
It's not always easy to recognise when anxiety is the reason you're feeling or acting differently.
Physical symptoms
faster, irregular or more noticeable heartbeat
feeling lightheaded and dizzy
headaches
chest pains
loss of appetite
sweating
breathlessness
feeling hot
shaking
Mental symptoms
feeling tense or nervous
being unable to relax
worrying about the past or future
feeling tearful
not being able to sleep
difficulty concentrating
fear of the worst happening
intrusive traumatic memories
obsessive thoughts
Changes in behaviour
not being able to enjoy your leisure time
difficulty looking after yourself
struggling to form or maintain relationships
worried about trying new things
avoiding places and situations that create anxiety
compulsive behaviour, such as constantly checking things

Document 2:
Symptoms of stress
Stress can ause many diffe