In [4]:
# -----------------------------
# 1Ô∏è‚É£ Import libraries
# -----------------------------
import pandas as pd
from langchain_core.documents import Document
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.llms import Ollama
from langchain_core.prompts import PromptTemplate

# -----------------------------
# 2Ô∏è‚É£ Load CSV
# -----------------------------
csv_path = "../assets/filtered_data.csv"

print(f"üìÅ Loading CSV file: {csv_path}")
df = pd.read_csv(csv_path)

print(f"üìä Rows: {len(df)}, Columns: {len(df.columns)}")

# -----------------------------
# 3Ô∏è‚É£ Create Documents (handle NaN)
# -----------------------------
documents = []
for idx, row in df.iterrows():
    # Replace NaN with "N/A"
    values = [str(v) if pd.notna(v) else "N/A" for v in row.values]
    content = " | ".join(values)
    documents.append(
        Document(
            page_content=content,
            metadata={"row_id": idx}
        )
    )

print(f"üìö Created {len(documents)} documents")

# -----------------------------
# 4Ô∏è‚É£ Embedding model
# -----------------------------
embeddings = OllamaEmbeddings(model="nomic-embed-text")

# -----------------------------
# 5Ô∏è‚É£ Create FAISS vector store
# -----------------------------
vectorstore = FAISS.from_texts(
    texts=[doc.page_content for doc in documents],
    embedding=embeddings,
    metadatas=[doc.metadata for doc in documents]
)
print("‚úÖ Vector store created successfully")

# -----------------------------
# 6Ô∏è‚É£ Create Retriever
# -----------------------------
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

# -----------------------------
# 7Ô∏è‚É£ Setup LLM
# -----------------------------
llm = Ollama(model="llama3.2:1b")

# -----------------------------
# 8Ô∏è‚É£ Define Prompt Template
# -----------------------------
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are an assistant that answers questions using ONLY the CSV data.

CSV DATA:
{context}

Question:
{question}

If the answer is not in the CSV, say:
"I cannot find this information in the CSV."
"""
)

# -----------------------------
# 9Ô∏è‚É£ Define RAG function
# -----------------------------
def ask(question: str):
    # Use get_relevant_texts to avoid AttributeError
    docs = retriever(question)  # bu bir liste d√∂nd√ºr√ºyor
    context = "\n".join(d.page_content for d in docs)
    return llm.invoke(prompt.format(context=context, question=question))

# ------------------------


üìÅ Loading CSV file: ../assets/filtered_data.csv
üìä Rows: 2302, Columns: 9
üìö Created 2302 documents


  embeddings = OllamaEmbeddings(model="nomic-embed-text")


‚úÖ Vector store created successfully


  llm = Ollama(model="llama3.2:1b")


In [6]:
# Use the vectorstore directly
def ask(question: str):
    # FAISS object has a method: similarity_search
    docs = vectorstore.similarity_search(question, k=5)  # top 5 most similar docs
    context = "\n".join(d.page_content for d in docs)
    return llm.invoke(prompt.format(context=context, question=question))


In [7]:
# Test RAG system
answer = ask("What information is stored in this dataset?")
print(f"Answer:\n{answer}")


Answer:
Based on the provided CSV data, it appears that "information" can be broken down into several categories:

1. **Location**: The columns "Natural", "Sint Maarten (Dutch part)", "SXM", and "IDN" contain location names or codes for various geographic regions.
2. **Event/Catagory**: Columns like "Storm", "Volcanic activity", and "Flood" store information about different types of events or phenomena.
3. **Year/Date**: The columns "2017", "2018", "2022", and "2014" indicate the year or date for each occurrence.

These categories can be grouped into three main areas:

1. Geographic Information (Location)
2. Event/Catagory
3. Date/Time (Year or Date)


In [31]:
answer = ask("Which country suffered the most damage?")
print(f"Answer:\n{answer}")


Answer:
To determine which country suffered the most damage, I will calculate the total "Damage" for each country based on the given data.

First, I'll sum up all the values for "Natural" in the columns with unique countries:

- India: 226806 + 197798 = 424604
- Brazil: 197798 + 17897 = 216985
- China: 99168 + 1018663 = 1029921
 

Now, I'll sum up all the values for "Mass movement (wet)" in the columns with unique countries:

- India: 8569.902547 + 3529081.3 = 3604170.802847
- Brazil: 2824.715413 + 317756.5 = 306080.215913
- China: 1053.112314 + 8474922.7 = 8485766.923

Next, I'll compare the total "Damage" for each country to find out which one suffered the most damage:

India had the highest total damage at 424604 units.
Brazil and China also had a significant amount of damage but are tied as they have higher totals than India, with Brazil having 3604170.80 more units in damages than India.

Therefore, I cannot find this information in the CSV.


In [35]:
answer = ask("How much damage did Switzerland suffer?")
print(f"Answer:\n{answer}")


Answer:
To answer this question, we need to look for a column with 'Switzerland' or any reference to Switzerland.

Looking at the data:

- Natural | Glacial lake outburst flood | India | IND | 234 | 226806 | 2021 | 2238.127142 | N/A
- Mass movement (wet) | Indonesia | Sierra Leone | SLE | 1102 | 35818 | 2017 | 484.4561288 | 1120.0

There is no 'Switzerland' or any direct reference to Switzerland in the provided CSV data.

Therefore, I cannot find this information in the CSV.


In [36]:
answer = ask("Summarize the total damage for all countries.")
print(f"Answer:\n{answer}")

Answer:
SELECT SUM(Total) FROM Natural


In [9]:
# Create vector_db folder to store databases
import os
from pathlib import Path

vector_db_dir = "./vector_db"
Path(vector_db_dir).mkdir(exist_ok=True)
print(f"üìÅ Vector database folder: {vector_db_dir}")

# Alternative: Using Chroma instead of FAISS
from langchain_community.vectorstores import Chroma

# Create Chroma vector store with persistent storage
chroma_path = os.path.join(vector_db_dir, "chroma_db")
chroma_vectorstore = Chroma.from_texts(
    texts=[doc.page_content for doc in documents],
    embedding=embeddings,
    metadatas=[doc.metadata for doc in documents],
    persist_directory=chroma_path
)
print(f"‚úÖ Chroma vector store created at: {chroma_path}")

# Define RAG function using Chroma
def ask_with_chroma(question: str):
    docs = chroma_vectorstore.similarity_search(question, k=5)
    context = "\n".join(d.page_content for d in docs)
    return llm.invoke(prompt.format(context=context, question=question))

# Test with Chroma
answer_chroma = ask_with_chroma("What information is stored in this dataset?")
print(f"Answer (Chroma):\n{answer_chroma}")


üìÅ Vector database folder: ./vector_db
‚úÖ Chroma vector store created at: ./vector_db/chroma_db
Answer (Chroma):
From the provided CSV data:

- The first column 'Natural' likely represents a natural disaster type (e.g., Flood, Earthquake, Storm).
- The second column 'Flood' contains flood-related data.
- The third column 'Viet Nam' appears to be related to Vietnam or Southeast Asia.
- The fourth column 'VNM' is the Vietnamese acronym for Vietnam.
- The fifth column '1' indicates a single entry in each row.
- The sixth column '424' seems to hold some value, but its meaning isn't clear without additional context.
- The seventh column '2017' contains information about the year of data collection or the event's occurrence in 2017.
- The eighth column '2992.071532' likely holds a specific measurement related to events in Vietnam.
- The ninth column '229877.4' is also related to Vietnam and seems to hold some numerical value, possibly indicating area measurements.
- The tenth column '1' a

In [11]:
answer = ask_with_chroma("Which country suffered the most damage?")
print(f"Answer:\n{answer}")


Answer:
Let's analyze the data.

According to the CSV data, China suffered the most damage with a total mass movement of 1018663. This corresponds to the entry in Brazil, where the mass movement was 17897 (as given by the question).

Therefore, I can confidently answer that:
"I cannot find this information in the CSV."


In [12]:
answer = ask_with_chroma("What are the top 3 countries with the lowest total damage?")
print(f"Answer:\n{answer}")

Answer:
To find the top 3 countries with the lowest total damage, we need to calculate the total damage for each country and then rank them. 

Here are the calculations:

1. Natural | Earthquake | Japan | JPN | 
2. Natural | Earthquake | China | CHN |
3. Natural | Flood | Viet Nam | VNM |

Total Damage:
1. 752000 + 12720.21632 = 862220.21632
2. 42000 + 10359.14986 = 52459.14986
3. 0 + 3703.649059 = 3703.649059

Ranking by Total Damage:
1. Japan - 862220.21632
2. China - 52459.14986
3. Viet Nam - 3703.649059

Top 3 countries with the lowest total damage are: Japan, China, and Viet Nam.

However, I couldn't find any information about a drought in Marshall Islands. The CSV data doesn't include that event, so it's not included in our calculation.

Therefore, the answer is:
I cannot find this information in the CSV.
