# Install necessary packages
```bash 

%pip install --quiet unstructured langchain chromadb "unstructured[all-docs]" langchain-text-splitters

```

In [1]:
from concurrent.futures import ThreadPoolExecutor
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
from langchain_community.document_loaders import PyPDFLoader


def load_file_preprocess_parallel(local_paths):

    def process_file(local_path):
        if os.path.exists(local_path):
            loader = PyPDFLoader(local_path)
            print(f"Loading PDF from {local_path}...")
            try:
                data = loader.load_and_split()
                text_splitter = RecursiveCharacterTextSplitter(chunk_size=7500, chunk_overlap=350)
                chunks = text_splitter.split_documents(data)
                print(f"PDF from {local_path} loaded successfully.")
                return chunks
            except Exception as e:
                print(f"Error loading PDF from {local_path}: {e}")
                return []
        else:
            print(f"File not found: {local_path}")
            return []

    with ThreadPoolExecutor() as executor:
        all_chunks = executor.map(process_file, local_paths)
    # Flatten list of lists
    return [chunk for sublist in all_chunks for chunk in sublist]

# Vector Embeddings

In [2]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
import os

def load_embedding_model(embedding_model = "nomic-embed-text:latest",file_path = ['IT445BOOKEDIT.pdf']):
    try:
        file_path = [
            'IT445BOOKEDIT.pdf', '21AI641 MOD 1.pdf', '21AI641 MOD 2.pdf',
            '21AI641 MOD 3.pdf', '746127128-BI-module-4-notes-1.pdf', 'module 5 21ai641.pdf'
        ]
        if not os.path.exists('vector_data'):
            chunks = load_file_preprocess_parallel(file_path)
            # Create a new vector database if it doesn't exist
            vector_db = Chroma.from_documents(
                documents=chunks, 
                embedding=OllamaEmbeddings(model=embedding_model, show_progress=True),
                collection_name="local-rag",
                persist_directory="vector_data"
            )
            vector_db.persist()
            print("New vector database created.")
        else:
            embedding = OllamaEmbeddings(model=embedding_model,show_progress=True)
            # Load the existing vector database
            vector_db = Chroma(
                collection_name="local-rag",
                persist_directory="vector_data",
                embedding_function=embedding,
            )
            vector_db.min_results=10
            vector_db.get()
            print("Loaded existing Chroma database from disk.")
    except Exception as e:
        print(f"Error initializing vector database or embeddings: {e}")
    return vector_db

# Retrieval

In [3]:
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_core.runnables import RunnablePassthrough
vector_db = load_embedding_model()
# Initialize the LLM
local_model = "gemma2:2b"
prompt_local_model = "llama3.1:latest"

llm = ChatOllama(model=local_model)
prompt_llm = ChatOllama(model=prompt_local_model,temperature=0)

# Multi-query prompt for better retrieval
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from
    a vector database. While generating new question the meaning of original question should not change. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}""",
)

try:
    # Set up the retriever
    retriever = MultiQueryRetriever.from_llm(
        vector_db.as_retriever(), 
        prompt_llm,
        prompt=QUERY_PROMPT,
    )
    print("Retriever initialized successfully.")
except Exception as e:
    print(f"Error initializing retriever: {e}")


  warn_deprecated(


Loaded existing Chroma database from disk.
Retriever initialized successfully.


In [5]:
# RAG prompt template
template = """The answer should satisfy a 10-mark question typically asked in a university exam. The answer you provide should contain at least 500 words, using simple language, and should not omit key points or important technical terms. 

Please answer the question based ONLY on the following context:
{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

try:
    # Create the RAG chain
    chain = (
        {"context": retriever, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )

    # Get user input for the question
    question = '''What information is provided by the descriptive snalytics employed at Mapie.
What type of support is provided by the predictive anlaytics employed at Mapie 
sensing?'''
    result = chain.stream(question)
    for chunk in result:
        print(chunk,end='',flush=True)
except Exception as e:
    print(f"Error during RAG chain execution: {e}")


OllamaEmbeddings: 100%|██████████| 1/1 [00:01<00:00,  1.25s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 10.50it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 16.55it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  7.08it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 18.60it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 10.63it/s]


The provided text focuses on general dashboards and their structure, not specifically on a company called "Mapie."  

Here's what we can infer about descriptive analytics from the text:

* **Descriptive Analytics:** This type of analysis focuses on summarizing past data to understand current trends. The document mentions these types of insights are often presented in dashboards, with the information categorized into three layers:
    * **Monitoring:**  Visually showing key performance metrics to track overall health (e.g., sales revenue).
    * **Analysis:** Summarizing dimensional data for deeper insights into problem causes (e.g., identifying why sales are declining). 
    * **Management:** Detailed operational data leading to actions like resolving issues or making strategic decisions (e.g., how to adjust inventory levels based on recent sales).

**Predictive Analytics Support**:  The text does *not* mention predictive analytics in relation to a company named "Mapie". To understand 

In [6]:

# Get user input for the question
question = '''What information is provided by the descriptive snalytics employed at Mapie.
What type of support is provided by the predictive anlaytics employed at Mapie 
sensing?'''
result = chain.stream(question)
for chunk in result:
    print(chunk,end='',flush=True)


OllamaEmbeddings: 100%|██████████| 1/1 [00:01<00:00,  1.25s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 10.21it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 19.64it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  7.01it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 19.07it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 10.42it/s]


This document does not provide specific information about **Mapie's descriptive or predictive analytics capabilities**.  

Here's why:

* **Focus on Dashboard Design:** The text primarily focuses on describing dashboard design and its components, particularly in the context of business intelligence (BI) systems. 
* **Lack of Specific Case Studies:** It mentions a "Application Case" but doesn't delve into details about any specific company or Mapie's analytics solutions.
* **Limited Scope of Text:** This excerpt is only a portion of a larger document dealing with BI and dashboard design; it doesn't provide the context needed for mapping out Mapie's analytics capabilities.

**To find this information, you'd likely need to explore these sources:**

1. **Mapie's Website/Documentation:** Check if they have a dedicated section on their website or product documentation that details their analytics offerings and case studies.
2. **Research Papers & Publications:** Explore academic papers, whit

In [34]:

# Get user input for the question
question = '''What information is provided by the descriptive snalytics employed at Mapie.
What type of support is provided by the predictive anlaytics employed at Mapie 
sensing?'''
result = chain.stream(question)
for chunk in result:
    print(chunk,end='',flush=True)

OllamaEmbeddings: 100%|██████████| 1/1 [00:01<00:00,  1.24s/it]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 10.62it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 18.49it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  7.03it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 17.36it/s]


The text you've provided doesn't give specific examples or details about Mapie's analytics practices, so I can only offer general information and insights based on what the excerpt suggests:

**Descriptive Analytics at Mapie:**

* **Overall Performance Visualization:**  Mapie likely uses descriptive analytics to provide a consolidated view of the organization's performance. This involves presenting data in a clear and concise format, often using dashboards or reports to show key performance indicators (KPIs) across different areas of the business. 
* **Historical & Predictive Data:** The text mentions "current and forecasted values" which suggests that Mapie utilizes both historical data for trends analysis as well as forecasting capabilities to predict future trends.

**Predictive Analytics at Mapie:**

* **Forecasting Capabilities:**  The text highlights predictive analytics' role in forecasting, implying Mapie is capable of using statistical models and machine learning to predict fu