In [None]:
# Import necessary packages
from langchain_chroma import Chroma
from langchain_ollama import OllamaLLM, OllamaEmbeddings
# Document processing and retrieval  
from langchain.text_splitter import RecursiveCharacterTextSplitter  # Splits text into smaller chunks for better embedding and retrieval

import json
from langchain.schema import Document

import re  # Provides tools for working with regular expressions, useful for text cleaning and pattern matching

data = [...]: This is a list comprehension, which iterates through each item in json_data.
Document(...): For each item, a new Document is being created (likely a class from LangChain or a custom class designed for this purpose). Each document will contain:
-page_content: A formatted string that combines several fields from the item (e.g., section, question, subtopic, and answer).
-metadata: This is a dictionary containing the section and subtopic from the current item. This metadata can be used later for filtering or querying

In [13]:
with open('data.json', 'r', encoding='utf-8') as file:
    json_data = json.load(file)
    # Convert JSON into LangChain Documents
    data = [
        Document(
            page_content=f"Section: {item['section']}\n"
                        f"Question: {item['question']}\n"
                        f"Subtopic: {item['subtopic']}\n"
                        f"Answer: {' '.join(item['answer'])}",
            metadata={"section": item['section'], "subtopic": item['subtopic']}
        )
        for item in json_data
        ]
second_docuemnt = data[1]
print("Content: \n",second_docuemnt.page_content)
print("Metadata: ",second_docuemnt.metadata)

Content: 
 Section: Real Estate
Question: Why Investing In Real Estate In Tunisia? A Promising Opportunity
Subtopic: Economic Growth And Stability
Answer: Tunisia boasts a growing economy, offering a favorable environment for real estate investment. Economic reforms and improved infrastructure enhance investor confidence.
Metadata:  {'section': 'Real Estate', 'subtopic': 'Economic Growth And Stability'}


In [14]:
# RecursiveCharacterTextSplitter splits the PDF into chunks of 500 characters, keeping 100 characters overlap to keep context 
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
# Splits the documents into chunks and stores them in chunks object
chunks = text_splitter.split_documents(data)
print(chunks)
print([len(chunks) for chunk in chunks])

[3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 3202, 320

In [15]:
for i,chunk in enumerate(chunks[:3]):
    print(f"Chunk {i+1}:\n{chunk.page_content}\n")

Chunk 1:
Section: Real Estate
Question: Why Investing In Real Estate In Tunisia? A Promising Opportunity
Subtopic: intrdocution
Answer: I n v e s t i n g   i n   h o m e s   f o r   p e r s o n a l   o r   f a m i l y   u s e   i s   i d e a l   f o r   t h o s e   c o n s i d e r i n g   e s t a b l i s h i n g   t h e m s e l v e s   i n   T u n i s i a .

Chunk 2:
Section: Real Estate
Question: Why Investing In Real Estate In Tunisia? A Promising Opportunity
Subtopic: Economic Growth And Stability
Answer: Tunisia boasts a growing economy, offering a favorable environment for real estate investment. Economic reforms and improved infrastructure enhance investor confidence.

Chunk 3:
Section: Real Estate
Question: Why Investing In Real Estate In Tunisia? A Promising Opportunity
Subtopic: Increasing Demand For Housing
Answer: With a rising population and rapid urbanization, the demand for housing continues to grow. Urban areas like Tunis, Sfax, and Sousse are in high demand.



In [16]:
# Create embeddings using OllamaEmbeddings 
embeddings = OllamaEmbeddings(model="deepseek-r1:1.5b")
# Create a vector database which allows us to store the chunks and their embeddings
vectorstore=Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory="./chroma_db")  # Example directory

In [17]:
retriever = vectorstore.as_retriever(search_type="similarity",search_kwargs={"k":2})

In [18]:
from langchain_core.prompts import ChatPromptTemplate
prompt=ChatPromptTemplate.from_template("""Uses the following piece of context to answer the question at the end.
                                        If you don't know the answer,say that you don't know.
                                        Context : {context}
                                        Question : {question}
                                        """)

In [None]:
llm = OllamaLLM(model="deepseek-r1:1.5b")

In [20]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
chain = (
    {"context":retriever,"question":RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [22]:
result = chain.invoke("Legal Procedures To Consider When Buying A Property")

What is re.DOTALL?
In Python, the dot (.) in regular expressions usually matches any character except for newlines (i.e., \n). However,
if you use the re.DOTALL flag, the dot (.) will match any character, including newlines. This is particularly useful when you're working with multi-line text and want to match across line breaks.
without dot :
<think>Here is some text
that spans multiple lines</think>
In this case, the .*? does not match across newlines (i.e., \n). So, the text inside <think>...</think> is not removed properly. The . only matches characters that are not newlines, and it stops when it reaches a newline.

In [23]:
# Remove the content between <think> and </think>
clean_text = re.sub(r"<think>.*?</think>", "", result, flags=re.DOTALL)
print(clean_text)



When purchasing real estate in Tunisia, the legal procedures typically include:

1. **Zoning and Land Use:** Obtain zoning approvals for the property location, which determines what uses are permitted (residential, commercial, industrial).

2. **Permits:**
   - Real Estate Permits: Purchase may require building, utilities, and sometimes environmental permits.
   - Utility Permits: Separate from real estate, often required separately.

3. **Building and Construction:** Obtain a building plan from an architect for construction. Property agents can facilitate this process.

4. **Legal Process:** Go through a formal appraisal to determine property value before signing contracts or agreements.

5. **Environmental Assessments:** Some regions require assessments, especially for industrial properties.

6. **Legal Consultation:** Seek legal advice from an attorney specializing in real estate transactions.

These steps ensure compliance with local regulations and facilitate smooth property acq