In [1]:
!pip install "unstructured[all-docs]"
!pip install chromadb
!pip install langchain
!pip install ollama
!pip install langchain_community

In [2]:
from langchain.document_loaders import DirectoryLoader

# Directory whose files are ingested by the pipeline.
dir_path = "./data"


def load_documents(data_path):
    """Load the documents found under ``data_path``.

    Args:
        data_path: Directory passed to ``DirectoryLoader``; entries are
            selected with the ``'*'`` glob pattern.

    Returns:
        The result of ``DirectoryLoader.load()`` for that directory.
    """
    directory_loader = DirectoryLoader(data_path, glob="*")
    loaded = directory_loader.load()
    return loaded


documents = load_documents(dir_path)

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_text(documents: list, chunk_size: int = 1000, chunk_overlap: int = 500):
    """Split loaded documents into overlapping chunks.

    Args:
        documents: Documents to split, as passed to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
            Defaults to 1000, the value this notebook previously hard-coded.
        chunk_overlap: Length shared between neighbouring chunks, measured
            with ``len``. Defaults to 500, the previously hard-coded value.

    Returns:
        The chunks produced by ``split_documents``. ``add_start_index=True``
        is passed to the splitter so each chunk records where it starts in
        its source text.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        add_start_index=True,
    )
    return splitter.split_documents(documents)


chunks = split_text(documents)

In [4]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain.vectorstores import Chroma


from pathlib import Path

# On-disk location of the Chroma collection.
PERSIST_DIR = "./chroma_db"

# NOTE(review): "phi3" is also the model used for generation further down; a
# dedicated embedding model may retrieve better -- confirm before changing.
ollama_emb = OllamaEmbeddings(model="phi3")

# Calling Chroma.from_documents() against an already-populated persist
# directory embeds every chunk again and adds a second copy of each one, so
# re-running this cell used to fill the store with duplicates. Reuse the
# persisted store when it exists; delete PERSIST_DIR to force a rebuild after
# the source data changes.
_persist_path = Path(PERSIST_DIR)
if _persist_path.is_dir() and any(_persist_path.iterdir()):
    vectorstore = Chroma(
        persist_directory=PERSIST_DIR,
        embedding_function=ollama_emb,
    )
else:
    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=ollama_emb,
        persist_directory=PERSIST_DIR,
    )

In [9]:
from langchain.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler 
from langchain import PromptTemplate
from langchain.chains import RetrievalQA


# Prompt given to the LLM. It declares two input variables: {context} and
# {question}. The text is kept exactly as-is, including the indentation inside
# the string, because it is sent to the model verbatim.
template = """Use the following pieces of context to answer the question at the end.
    If you don't know the answer, just say that you don't know, don't try to make up an answer.
    Use three sentences maximum and keep the answer as concise as possible.
    {context}
    Question: {question}
    Helpful Answer:"""

QA_CHAIN_PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
)

# `callbacks=` replaces the deprecated `callback_manager=` argument. The
# stdout handler prints the answer as it is generated.
llm = Ollama(
    model="phi3",
    callbacks=[StreamingStdOutCallbackHandler()],
)

# Retrieval-augmented QA chain over the vector store built earlier.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectorstore.as_retriever(),
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
)

YOUR_QUESTION = 'What is the conclusion?'
# Calling the chain object directly (`qa_chain({...})`) is deprecated;
# `.invoke()` takes the same input dict and is the supported entry point.
result = qa_chain.invoke({"query": YOUR_QUESTION})

 Based on the provided data in Table 6, which shows word class frequency by age for different categories such as "money job," "sports," "tv," "sleep," "eating," "sex," "family," "friends," "pos-emotions," and "neg-emotions," we can draw several conclusions regarding language use across ages. However, it's essential to note that the data presented includes standard errors (±0.5 or ±1.9) alongside mean values for each category per age group. This allows us to understand not just central tendencies but also variability and reliability of these measures.

Here are some general observations:

1. "Eating" related terms seem more prominent in the younger (20s and 30s) groups, with noticeable declines as age increases. This might suggest that food-related language becomes less prevalent or shifts towards different topics as individuals mature.
   
2. "Sex" related words are notably higher among middle-aged adults (30s), which could reflect cultural attitudes and societal openness to discussing