## Imports

In [1]:
import pandas as pd
import os
from dotenv import load_dotenv
import psycopg
from supabase import create_client, Client
from urllib.parse import unquote

## NDMA

In [2]:
# Load the scraped NDMA advisories and reshape each row into a document record.
ndma_csv_df = pd.read_csv("../Scrapers/ndma_advisories_bulk.csv")


def _ndma_record(row):
    """Build one document record (dict) from a row of the NDMA advisories CSV."""
    record = {
        "source": row["source_agency"],
        "posted_date": row["date"],
        "title": row["title"],
        "url": row["url"],
    }
    # The text after "file=" is percent-encoded: decode it, keep only the
    # base name, then separate the stem from the extension.
    encoded_path = row["filename"].split("file=")[1]
    stem, extension = os.path.splitext(os.path.basename(unquote(encoded_path)))
    record["filename"] = stem
    record["filetype"] = extension.lstrip('.')  # "pdf" rather than ".pdf"
    return record


data_ndma = [_ndma_record(csv_row) for _, csv_row in ndma_csv_df.iterrows()]

ndma_df = pd.DataFrame(data_ndma)
ndma_df.head(1)

Unnamed: 0,source,posted_date,title,url,filename,filetype
0,NDMA,03-11-2025,Rain-Wind / Thunderstorm (with snowfall over m...,https://www.ndma.gov.pk/secure-viewer?file=%2F...,8io64Uj5383VJ1tv9ZcW,pdf


## NEOC

In [3]:
# Load the scraped NEOC projections and reshape each row into a document record.
neoc_csv_df = pd.read_csv("../Scrapers/ndma_neoc_projections_bulk.csv")


def _neoc_record(row):
    """Build one document record (dict) from a row of the NEOC projections CSV."""
    record = {
        # Keep the part after the first "-" in source_agency.
        "source": row["source_agency"].split("-")[1],
        "posted_date": row["date"],
        "title": row["title"],
        "url": row["url"],
    }
    # Take whatever follows the last "?file=" (or the whole value when the
    # marker is absent), percent-decode it, keep only the base name, then
    # separate the stem from the extension.
    encoded_path = row["filename"].split("?file=")[-1]
    stem, extension = os.path.splitext(os.path.basename(unquote(encoded_path)))
    record["filename"] = stem
    record["filetype"] = extension.lstrip('.')  # "pdf" rather than ".pdf"
    return record


data_neoc = [_neoc_record(csv_row) for _, csv_row in neoc_csv_df.iterrows()]

neoc_df = pd.DataFrame(data_neoc)
neoc_df.head(1)

Unnamed: 0,source,posted_date,title,url,filename,filetype
0,NEOC,03-11-2025,"Weather Advisory 3rd to 8th Nov, 2025",https://www.ndma.gov.pk/secure-viewer?file=%2F...,AOejghW4I10mJs48wUWw,pdf


## Combination

In [4]:
# Stack both sources, sort newest-first, and render dates as ISO strings.
documents = (
    pd.concat([ndma_df, neoc_df])
    # Parse day-first date strings so the sort is chronological, not lexical.
    .assign(posted_date=lambda frame: pd.to_datetime(frame['posted_date'], dayfirst=True))
    .sort_values(by='posted_date', ascending=False)
    .reset_index(drop=True)
    .assign(posted_date=lambda frame: frame['posted_date'].dt.strftime('%Y-%m-%d'))
)
# Number rows from 1 instead of 0.
documents.index = documents.index + 1

print(len(documents))
print(documents.head())

511
  source posted_date                                              title  \
1   NDMA  2025-11-03  Rain-Wind / Thunderstorm (with snowfall over m...   
2   NEOC  2025-11-03              Weather Advisory 3rd to 8th Nov, 2025   
3   NDMA  2025-10-31            Depression over Eastcentral Arabian Sea   
4   NDMA  2025-10-31                                      Drought Watch   
5   NDMA  2025-10-30            Depression Over Eastcentral Arabian Sea   

                                                 url              filename  \
1  https://www.ndma.gov.pk/secure-viewer?file=%2F...  8io64Uj5383VJ1tv9ZcW   
2  https://www.ndma.gov.pk/secure-viewer?file=%2F...  AOejghW4I10mJs48wUWw   
3  https://www.ndma.gov.pk/secure-viewer?file=%2F...  Q2YLBuxvLxLO4GDha2wQ   
4  https://www.ndma.gov.pk/secure-viewer?file=%2F...  K6y19XCyAM7nXz8wVUyL   
5  https://www.ndma.gov.pk/secure-viewer?file=%2F...  GunJIH11BbS0vuehIHZa   

  filetype  
1      pdf  
2      pdf  
3      pdf  
4      pdf  
5      pdf 

In [16]:
# Write the combined table to disk; the DataFrame index is saved as the "id" column.
combined_csv_path = "../Scrapers/ndma_neoc_combined_bulk.csv"
documents.to_csv(combined_csv_path, index_label="id")

## Upload

### Psycopg

#### Connection

In [None]:
# Load credentials from the local env file, then open both the Supabase client
# and a direct psycopg connection (with a cursor) for the upload step.
load_dotenv('../local.env', override=True)

SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_KEY = os.getenv("SUPABASE_KEY")
SUPABASE_DB_URL = os.getenv("SUPABASE_DB_URL")

# os.getenv returns None for an unset variable; fail fast with the variable
# names instead of handing None/empty values to the clients below.
_required_env = {
    "SUPABASE_URL": SUPABASE_URL,
    "SUPABASE_KEY": SUPABASE_KEY,
    "SUPABASE_DB_URL": SUPABASE_DB_URL,
}
_missing_env = [name for name, value in _required_env.items() if not value]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env)
    )

supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
conn = psycopg.connect(SUPABASE_DB_URL)
cur = conn.cursor()
print("Connected to Supabase")

#### Upload

In [None]:
# Bulk-insert the combined documents through the psycopg connection.
#
# The conflict target is `filename`, the same key the Supabase upsert in this
# notebook uses. `id` is not among the inserted columns, so a conflict target
# of `id` would not deduplicate rows when this cell is re-run.
INSERT_QUERY = """
    INSERT INTO documents (source, posted_date, title, url, filename, filetype)
    VALUES (%s, %s, %s, %s, %s, %s)
    ON CONFLICT (filename) DO NOTHING
"""

# Column order must match the placeholder order in INSERT_QUERY.
INSERT_COLUMNS = ["source", "posted_date", "title", "url", "filename", "filetype"]

# One plain tuple per row, without the DataFrame index.
values = list(documents[INSERT_COLUMNS].itertuples(index=False, name=None))

print(f"Inserting {len(values)} records...")
try:
    cur.executemany(INSERT_QUERY, values)
    conn.commit()
    print("Insertion successful")
finally:
    # Release the cursor and connection even when the insert fails, so a
    # failed run does not leave an open connection behind.
    cur.close()
    conn.close()

### Supabase

In [8]:
# Reload the env file (values in the file override any already set) and build
# the Supabase client from SUPABASE_URL and SUPABASE_SERVICE_KEY.
load_dotenv('../local.env', override=True)

SUPABASE_URL = os.environ.get("SUPABASE_URL")
SUPABASE_SERVICE_KEY = os.environ.get("SUPABASE_SERVICE_KEY")

supabase: Client = create_client(
    SUPABASE_URL,
    SUPABASE_SERVICE_KEY,
)

In [9]:
# One dict per DataFrame row; the index is not part of the records.
doc_records = documents.to_dict('records')

# Upsert every record into the 'documents' table, using 'filename' as the
# conflict key.
documents_table = supabase.table('documents')
response = documents_table.upsert(doc_records, on_conflict='filename').execute()

print(f"Upserted {len(response.data)} rows")

Upserted 511 rows
