In [1]:
import ollama
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [2]:
# Load biography_sample.csv with a comma delimiter and read it into documents.
# NOTE(review): "D:Transaction_Monitoring/..." has no separator after the drive
# letter, so on Windows it resolves relative to the current directory on D: --
# confirm this is intended rather than "D:/Transaction_Monitoring/...".
loader = CSVLoader(
    file_path="D:Transaction_Monitoring/biography_sample.csv",
    csv_args={"delimiter": ","},
)
data = loader.load()

In [3]:
# Keep only the first loaded document for this experiment.
# This overwrites `data`; re-run the load cell to get the full list back.
data = data[:1]

In [4]:
# Display the remaining document to inspect its page content and metadata.
data

[Document(page_content='bef_rep: [USA SANCTIONS - OFAC] SDN Ref No 16409 - SDNTK (Specially Designated Narcotics Trafficker Kingpin) (Jul 2014 - addition). PRIMARY NAME: MORENO TUBERQUIA, Carlos Antonio (a.k.a. "NICOLAS"); DOB 30 Apr 1977; POB Monteria, Cordoba, Colombia; citizen Colombia; Cedula No. 11002975 (Colombia). [BIOGRAPHY] High-ranking member of the Los Urabenos criminal organisation (Jun 2003 - Dec 2014). [IDENTIFICATION] ID No: 11002975.', metadata={'source': 'D:Transaction_Monitoring/biography_sample.csv', 'row': 0})]

In [4]:

# Break the selected documents into chunks of up to 1000 characters,
# with 200 characters of overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(data)

In [5]:
# Display the chunks produced by the splitter.
splits

[Document(page_content='bef_rep: [USA SANCTIONS - OFAC] SDN Ref No 16409 - SDNTK (Specially Designated Narcotics Trafficker Kingpin) (Jul 2014 - addition). PRIMARY NAME: MORENO TUBERQUIA, Carlos Antonio (a.k.a. "NICOLAS"); DOB 30 Apr 1977; POB Monteria, Cordoba, Colombia; citizen Colombia; Cedula No. 11002975 (Colombia). [BIOGRAPHY] High-ranking member of the Los Urabenos criminal organisation (Jun 2003 - Dec 2014). [IDENTIFICATION] ID No: 11002975.', metadata={'source': 'D:Transaction_Monitoring/biography_sample.csv', 'row': 0})]

In [6]:
# 2. Create Ollama embeddings and vector store
# The chunks are embedded with the "llama3" model and indexed in Chroma.
embeddings = OllamaEmbeddings(model="llama3")
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
)

In [7]:

def ollama_llm(question, context, model='llama3'):
    """Ask an Ollama chat model a question about the supplied context.

    Args:
        question: The user's question; it is placed first in the prompt.
        context: Text the model is given alongside the question.
        model: Name of the Ollama model to query. Defaults to 'llama3',
            the value that used to be hard-coded here, so existing
            two-argument calls behave exactly as before.

    Returns:
        The content of the message in the chat response.
    """
    formatted_prompt = f"Question: {question}\n\nContext: {context}"
    response = ollama.chat(
        model=model,
        messages=[{'role': 'user', 'content': formatted_prompt}],
    )
    return response['message']['content']

In [8]:
# 4. RAG Setup
# Retriever over the vector store. rag_chain reads this module-level name each
# time it is called. No search arguments are passed here; the logged
# "Number of requested results 4 ..." message in later cells shows that four
# results are requested per query.
retriever = vectorstore.as_retriever()
def combine_docs(docs):
    """Merge the page contents of the given documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The page contents in their original order, separated by a blank line.
    """
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)
    
def rag_chain(question):
    """Answer a question using retrieved documents as context.

    The question is sent to the notebook-level ``retriever``; the documents
    it returns are merged with ``combine_docs`` and handed, together with the
    question, to ``ollama_llm``.

    Args:
        question: The question to look up and answer.

    Returns:
        Whatever ``ollama_llm`` returns for the question and merged context.
    """
    context_text = combine_docs(retriever.invoke(question))
    return ollama_llm(question, context_text)

In [9]:
# 5. Use the RAG App: ask the model to list the entities in the indexed record.

result = rag_chain("list out all the entities present")
print(result)

Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1


Here are all the entities mentioned:

1. **USA SANCTIONS - OFAC** (Office of Foreign Assets Control)
2. **SDN Ref No 16409** (Specially Designated Narcotics Trafficker Kingpin)
3. **SDNTK** (Specially Designated Narcotics Trafficker Kingpin)
4. **MORENO TUBERQUIA, Carlos Antonio** (a.k.a. "NICOLAS")
5. **Los Urabenos** (criminal organisation)
6. **Colombia** (country and citizen of Carlos Moreno Tuburquia)
7. **Monteria, Cordoba** (city and department in Colombia)


In [10]:
# 5. Use the RAG App: ask for the date of birth in the indexed record.
# The question previously read "data of birth"; the typo is corrected so the
# prompt matches the "date of birth" wording used in the later query cell.

result = rag_chain("what is the date of birth")
print(result)

Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1


The data of birth mentioned in the text is:

* Date of Birth: April 30, 1977
* Place of Birth: Monteria, Cordoba, Colombia


In [11]:
# Ask whether the indexed record mentions an identification number.
result = rag_chain("is there any ID present")
print(result)

Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1


Yes, there is an ID number mentioned in the text:

* Cedula No. 11 002975 (Colombia)

This refers to a Colombian identification card, also known as a cédula.


In [None]:
# NOTE(review): same query as the cell above; this cell has no execution count
# and no output, so it looks like a leftover duplicate -- confirm and remove.
result = rag_chain("is there any ID present")
print(result)

In [13]:
# Index a second record (the document at position 4 of the loaded CSV) so that
# rag_chain can answer questions about it. The loader and splitter built in
# the earlier cells are reused instead of being constructed again with
# identical arguments.
data = loader.load()  # reload: `data` was cut down to one document earlier
data = data[4:5]

print(data)

splits = text_splitter.split_documents(data)

# 2. Add the new chunks to the existing vector store
# `retriever` (and therefore rag_chain) is bound to this `vectorstore` object.
# Adding to it makes that dependency explicit, instead of building a second
# Chroma store and relying on both stores sharing the same default collection.
# The store now holds both records, so answers may cover both of them.
# NOTE: re-running this cell adds the same chunks to the index again.
vectorstore.add_documents(splits)



[Document(page_content="bef_rep: [USA SANCTIONS - OFAC] SDN Ref No 8884 - SDGT (Specially Designated Global Terrorist) (Apr 2005 - addition). PRIMARY NAME: AL-HIYARI,Bilal Mansur. Alias: AL-KHAYARI,Bilal Mansur Mahmud. Address - Suwaylah, Jordan. DOB circa 1969. POB al-Salt, Jordan. Nationality - Jordanian. Suwaylah, Jordan. [BIOGRAPHY] Suspected terrorism financier. Believed to be Al-Qaida's chief money man in Iraq. [IDENTIFICATION] To be determined. [FUNDING] Reportedly provided financial support to Jam'at al Tawhid wa'al-Jihad, also known as the Zarqawi network in Iraq.", metadata={'source': 'D:Transaction_Monitoring/biography_sample.csv', 'row': 4})]


In [15]:
# Ask for the date of birth after the second record has been indexed.
result = rag_chain("what is the date of birth")
print(result)

Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


The dates of birth are:

* Carlos Antonio Moreno Tuburquia (SDN Ref No 16409): April 30, 1977
* Bilal Mansur Al-Hiyari (SDN Ref No 8884): circa 1969


In [16]:
# Ask for the place of birth.
result = rag_chain("what is the place of birth")
print(result)

Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


The place of birth for:

1. Carlos Antonio Moreno Tuburquia (SDN Ref No 16409) is Monteria, Cordoba, Colombia.
2. Bilal Mansur Al-Hiyari (SDGT, SDN Ref No 8884) is al-Salt, Jordan.
