# Ollama PDF RAG Notebook

## Import Libraries


In [9]:
# Set environment variable for protobuf.
# This is done before the third-party imports below: protobuf chooses its
# implementation when it is first imported, so setting the variable afterwards
# could be too late if any of these libraries pulls protobuf in at import time.
import os
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

# Imports
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_ollama import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_ollama.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# Jupyter-specific imports
from IPython.display import display, Markdown

## Load PDF

In [10]:
# Load PDF
# Path of the PDF to index. Later cells need `data`, so this cell stops with a
# clear error instead of continuing when the path is empty or the file is missing.
local_path = "/mnt/c/Users/schaduvu/Documents/0. AI/development/llama/ollama_pdf_rag/data/pdfs/sample/devNext.pdf"
if not local_path:
    raise ValueError("Upload a PDF file")
if not os.path.isfile(local_path):
    raise FileNotFoundError(f"PDF not found: {local_path}")

loader = UnstructuredPDFLoader(file_path=local_path)
data = loader.load()
print(f"PDF loaded successfully: {local_path}")

Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBB

PDF loaded successfully: /mnt/c/Users/schaduvu/Documents/0. AI/development/llama/ollama_pdf_rag/data/pdfs/sample/devNext.pdf


## Split text into chunks

In [11]:
# Split text into chunks
# Chunking parameters, named so they can be tuned in one place.
CHUNK_SIZE = 1000     # maximum size of each chunk passed to the splitter
CHUNK_OVERLAP = 200   # overlap between consecutive chunks passed to the splitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunks = text_splitter.split_documents(data)
print(f"Text split into {len(chunks)} chunks")

Text split into 215 chunks


## Create vector database

In [12]:
# Create vector database
# Builds a Chroma store from the chunks, embedding them with the
# "nomic-embed-text" Ollama model, in a collection named "local-rag".
# NOTE(review): no persist directory is passed here, so presumably the
# collection only lives for this kernel session — confirm.
vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=OllamaEmbeddings(model="nomic-embed-text"),
    collection_name="local-rag"
)
print("Vector database created successfully")

Vector database created successfully


## Set up LLM and Retrieval

In [13]:
# Set up LLM and retrieval
# Name of the model handed to ChatOllama below.
local_model = "llama3.2"  # or whichever model you prefer
# Chat model instance; later cells pass it to MultiQueryRetriever.from_llm and
# use it as the generation step of the chain.
llm = ChatOllama(model=local_model)

In [6]:
# Prompt that asks the LLM to rephrase the user's question into alternative
# queries, one per line, for the multi-query retriever.
QUERY_PROMPT = PromptTemplate(
    template="""You are an AI language model assistant. Your task is to generate 2
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}""",
    input_variables=["question"],
)

# Multi-query retriever: wraps the vector store's retriever and uses the LLM
# with QUERY_PROMPT to produce the alternative questions.
retriever = MultiQueryRetriever.from_llm(
    retriever=vector_db.as_retriever(),
    llm=llm,
    prompt=QUERY_PROMPT,
)

## Create chain

In [14]:
# RAG prompt template
template = """Answer the question based ONLY on the following context:
{context}
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

In [15]:
# Create chain
def _format_docs(docs):
    """Join the text of the retrieved documents into one context string.

    Without this step the list of Document objects returned by the retriever is
    interpolated into the prompt as its repr, so the model sees
    ``Document(page_content=...)`` wrappers and metadata instead of plain text.

    Args:
        docs: iterable of retrieved documents exposing ``page_content``.

    Returns:
        The documents' ``page_content`` values separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# The question is sent to the retriever (whose documents are flattened to text)
# and also passed through unchanged; both fill the prompt, which goes to the
# LLM, and the reply is parsed to a plain string.
chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

NameError: name 'retriever' is not defined

## Chat with PDF

In [9]:
def chat_with_pdf(question):
    """
    Ask the RAG chain a question about the PDF and render the answer.

    The question is passed to ``chain.invoke``; the result is wrapped in
    ``Markdown`` and shown with ``display``. The function returns whatever
    ``display`` returns.
    """
    answer = chain.invoke(question)
    rendered = Markdown(answer)
    return display(rendered)

In [60]:
# Example 1: ask the chain for the main idea of the indexed PDF
chat_with_pdf("What is the main idea of this context?")

The main idea of this context appears to be documentation or guidelines for using devNext, a development platform, specifically for tasks related to artifact management, such as uploading and managing artifacts in an Artifactory repository. The context includes information on configuring profiles, setting up access control, and authenticating with GitLab using Personal Access Tokens (Pats).

In [None]:
# Repeats the Example 1 question unchanged
chat_with_pdf("What is the main idea of this context?")

'The document appears to be related to DevNext, an integrated development environment (IDE), specifically its features and configuration options for uploading and managing build artifacts to Artifactory repositories.'

In [10]:
# Example 3: ask for the devNext installation steps
chat_with_pdf("How to install devNext?")

According to the provided context, there are multiple ways to install devNext:

1. Using the devNextv3-install.sh script:
   - Download the script.
   - Run `chmod +x devNextv3-install.sh` to make it executable.
   - Run `source devNextv3-install.sh` to start the installation process.

2. Manually following the setup instructions in the devNext Wiki.

3. Using pip:
   - On Linux, run `python3 -m pip install devNext --extra-index-url https://jfrog.sofia.visteon.com/artifactory/api/pypi/pypi-virtual/simpl`
   - On Windows, run `python3 -m pip install devNext`

In [64]:
# Example 4: ask for supported versions; the request for a table is part of
# the question text itself
chat_with_pdf("which versions support devNext? tabular format")

Here are the supported versions of OS, Python, PIP, Repo Tool, and Windows for devNext:

| OS | Python Version | PIP Version | Repo Tool | Windows |
| --- | --- | --- | --- | --- |
| Ubuntu 20.04 | v3.8.10 | v23.3.2 | Repo_Launcher 2.29.9 | v3.12.8 |
| Ubuntu 22.04 | v3.10.6 | v23.3.2 | Repo_Launcher 2.29.9 | v3.12.8 |
| Ubuntu 24.04 | v3.12.3 | v23.3.2 | Repo_Launcher 2.29.9 | v3.12.8 |
| Windows 10 | v3.12.8 | v23.3.2 | Repo_Launcher 2.29.9 | v3.12.8 |
| Windows 11 | v3.12.8 | v23.3.2 | Repo_Launcher 2.29.9 | v3.12.8 |

Note: Older versions (Ubuntu 18.04) have backward compatibility support for devNext, but the specific versions are:

| OS | Python Version | PIP Version | Repo Tool |
| --- | --- | --- | --- |
| Ubuntu 18.04 | v3.6.9 | v21.3.x | Repo 2.8 |

In [65]:
# Example 5: ask what "dn" means
chat_with_pdf("what is dn?")

The text doesn't explicitly define what "dn" is, but based on the context of the provided documents, it appears that "dn" is an alias for "devNext", a Workspace Management Tool.

## More questions (optional clean-up at the end)

In [11]:
# Ask for links to devNext learning videos
chat_with_pdf("devnext learning video links")

According to the provided context, the correct answer is:

* Gitlab Presentation Deck (https://eu.git.visteon.com/devnext/devnext_v3/-/wikis/home)
* devNext Training Video (location not specified)

These are mentioned in the Document's page_content as learning resources for devNext.

In [12]:
# Ask about frequent devNext issues
chat_with_pdf("frequent issues from devNext")

Based on the provided context, some potential frequent issues from devNext could be:

1. Issues with license requests being denied (as seen in the sample profile file).
2. Problems with usernames and CDSID (as mentioned in the sample profile file).
3. Incompatibility with older versions of OS and software.
4. Issues with WSL version compatibility (as mentioned in the table of contents).
5. Errors related to devNext installation, configuration, or workspace setup.
6. Compatibility issues between different versions of Python, PIP, and Repo tools.
7. Problems with Conan management of third-party softwares.

Please note that these are just potential issues based on the provided context and may not be an exhaustive list of frequent issues from devNext.

In [1]:
# Ask what a profile file is
chat_with_pdf("what is profile file")

NameError: name 'chat_with_pdf' is not defined

In [None]:
# NOTE(review): the question text ends mid-sentence ("... in brief with").
# chat_with_pdf sends only this string to the chain, so "the above issues" is
# not linked to any earlier answer; a self-contained question would be clearer.
chat_with_pdf("can you explain the above issues in brief with")

Based on the provided context, here is a brief explanation of the issues:

1. **WSL (Windows Subsystem for Linux) Issues**: The user needs to enable Hyper-V and Windows Subsystem for Linux to resolve network issues.

2. **Tasking Compiler License Configuration**: The user needs to configure the license keys based on their location using the `TSK_LICENSE_KEY_SW160800` and `TSK_LICENSE_SERVER` variables.

3. **Static Code Analysis (Klocwork)**: A new subcommand has been introduced to execute FDA for easier developer flow, which is part of the static code analysis process.

4. **devNext Command Line Interface**: The user needs to define various options such as `-type`, `-env`, and `-location` when running the `build` command.

5. **devNext Analyze Command**: A new subcommand has been introduced to execute FDA for easier developer flow, which is part of the static code analysis process.

6. **pip Issues**: The user needs to specify the Python version (e.g., `python3`) and the location of the pip package repository when installing packages using pip.

7. **Artifact Management**: The user needs to configure the artifact server, repositories, output path, and excluded files when uploading artifacts.

8. **Git Issues**: The user needs to set up their Git environment correctly, including enabling Hyper-V and Windows Subsystem for Linux, and configuring the Git repository for a project.

9. **devNext Videos**: A link to the devNext videos page is provided.

10. **Q2A Forum and FAQs**: Links to the Q2A forum and FAQs are provided for users to find answers to their questions.

In [16]:
# NOTE(review): chat_with_pdf sends only this string to the chain, so
# "the above issues" is not linked to any earlier answer; consider naming the
# issues explicitly in the question.
chat_with_pdf("can you explain the above issues in brief with solutions in tabular format")

Based on the provided context, here are the issues explained briefly along with potential solutions in a tabular format:

| Issue | Description | Solution |
| --- | --- | --- |
| WSL 1 not compatible with devNext | The default WSL version is 1, which is not compatible with devNext. | Upgrade to WSL 2 and convert WSL 1 to WSL 2. Run the command `wsl --set-version <distro_name> 2` to make the future distro default to v2. |
| Artifact upload and retention policy | The feature is available only in devNext V4.0.0, and it supports currently Jenkins CI. | Check if your version of devNext supports this feature. If not, upgrade or use an alternative method. Ensure compliance with the repository's configured retention policy by passing the required information in the profile file (.ini file). |
| Multiple profiles for license configuration | License configuration requires multiple profiles. | Use the Tasking Compiler License Compliance feature to streamline the license configuration process and eliminate the need for multiple profiles. Configure license keys based on location using dynamic configuration. |
| Long argument list in the command line | The command line argument list is too long, causing issues. | Consider breaking down the command into smaller sub-commands or optimizing the current command structure. Use alternatives like `dn ci build --type <build_type> --profile <path_to_profile_file> --env TSK_LICENSE_KEY_PROF=<PROF_LICENSE_KEY>` to pass arguments more efficiently. |
| Incorrect Python version being accessed | Using Python or Py instead of python3 accesses python version-3.x or above. | Replace Python or py with python3 in the commands to ensure the correct Python version is accessed.

In [17]:
# Asks for a brief, tabular explanation of "the above issues" with solutions.
# NOTE(review): only this string reaches the chain, so "the above issues" has
# no earlier answer to refer to.
chat_with_pdf("can you explain the above issues in brief with solutions in tabular format")

Here is a brief explanation of the issues and their solutions in a table format:

| Issue # | Description | Solution |
| --- | --- | --- |
| 1 | WSL2 Network Issue | Restart computer, Enable Hyper-V, Virtual Machine platform, Windows Subsystem for Linux |
| 2 | WSL2 Not releasing disk space to host OS (WINDOWS) | WSL ShutDown, Optimize-vhd, Disk part |
| 3 | devNext Pre requisites | WSL2 setup is mandatory, Python version >= 3.8.10 (recommended: v3.9.5), Set default python user site in Environment Variables |

Note:

* The "Restart computer" solution applies to both WSL2 Network Issue and WSL2 Not releasing disk space to host OS.
* The "Enable Hyper-V" solution is a general step that needs to be taken before proceeding with other steps for the WSL2 Network Issue.
* The "WSL Distro Configuration" table is not explicitly listed as an issue, but it is mentioned in the documentation as part of the devNext installation process. It includes setting up WSL2, Python user home, Git configuration, QNX license management, and PIP version upgrade.

Please note that these solutions are based on the provided context and may require additional steps or variations depending on individual system configurations.

In [27]:
# Optional: Clean up when done
# Deletes the collection behind vector_db.
# NOTE(review): cells that query the retriever presumably stop working after
# this until the vector database cell is run again — confirm.
vector_db.delete_collection()
print("Vector database deleted successfully")

Vector database deleted successfully


In [18]:

!pip show pdfminer


Name: pdfminer
Version: 20191125
Summary: PDF parser and analyzer
Home-page: http://github.com/euske/pdfminer
Author: Yusuke Shinyama
Author-email: yusuke@shinyama.jp
License: MIT
Location: /mnt/c/Users/schaduvu/Documents/0. AI/development/ai/lib/python3.12/site-packages
Requires: pycryptodome
Required-by: 


In [19]:

# Debugging aid: print the names available in pdfminer.pdfinterp
import pdfminer.pdfinterp
print(dir(pdfminer.pdfinterp))


['BytesIO', 'CMap', 'CMapBase', 'CMapDB', 'Color', 'Dict', 'KWD', 'LIT', 'LITERALS_ASCII85_DECODE', 'LITERAL_FONT', 'LITERAL_FORM', 'LITERAL_IMAGE', 'LITERAL_PDF', 'LITERAL_TEXT', 'List', 'MATRIX_IDENTITY', 'Mapping', 'Matrix', 'Optional', 'PDFCIDFont', 'PDFColorSpace', 'PDFContentParser', 'PDFDevice', 'PDFException', 'PDFFont', 'PDFFontError', 'PDFGraphicState', 'PDFInterpreterError', 'PDFObjRef', 'PDFPage', 'PDFPageInterpreter', 'PDFResourceError', 'PDFResourceManager', 'PDFStackT', 'PDFStream', 'PDFTextSeq', 'PDFTextState', 'PDFTrueTypeFont', 'PDFType1Font', 'PDFType3Font', 'PDFValueError', 'PREDEFINED_COLORSPACE', 'PSEOF', 'PSKeyword', 'PSLiteral', 'PSStackParser', 'PSStackType', 'PSTypeError', 'PathSegment', 'Point', 'Rect', 'Sequence', 'Tuple', 'Union', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'cast', 'choplist', 'dict_value', 'keyword_name', 'list_value', 'literal_name', 'log', 'logging', 'mult_matrix', 're', 'reso

In [20]:

from pdfminer.pdfinterp import PDFStackT


In [21]:

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
