## Imports

In [None]:
import pandas as pd
import os
from dotenv import load_dotenv
import psycopg
from supabase import create_client, Client

## NDMA

In [2]:
# Read the scraped NDMA advisories and reshape every row into the common
# document layout (source, posted_date, title, url, filename, filetype).
ndma_csv_df = pd.read_csv("../Scrapers/ndma_advisories_bulk.csv")


def _ndma_record(advisory):
    """Map one row of the NDMA advisories CSV onto the shared document fields."""
    record = {
        "source": advisory["source_agency"],
        "posted_date": advisory["date"],
        "title": advisory["title"],
        "url": advisory["url"],
    }
    # splitext keeps the leading dot on the extension; store it without the dot.
    stem, dotted_extension = os.path.splitext(advisory["filename"])
    record["filename"] = stem
    record["filetype"] = dotted_extension.lstrip('.')
    return record


data_ndma = [_ndma_record(advisory) for _, advisory in ndma_csv_df.iterrows()]
ndma_df = pd.DataFrame(data_ndma)

## NEOC

In [3]:
# Read the scraped NEOC projections and reshape every row into the common
# document layout (source, posted_date, title, url, filename, filetype).
neoc_csv_df = pd.read_csv("../Scrapers/ndma_neoc_projections_bulk.csv")

data_neoc = []
for _, projection in neoc_csv_df.iterrows():
    # source_agency is split on "-" and the second segment becomes the source.
    agency_segments = projection["source_agency"].split("-")
    entry = dict(
        source=agency_segments[1],
        posted_date=projection["date"],
        title=projection["title"],
        url=projection["url"],
    )

    # Separate the base name from the extension and drop the extension's dot.
    base_name, dotted_extension = os.path.splitext(projection["filename"])
    entry.update(filename=base_name, filetype=dotted_extension.lstrip('.'))

    data_neoc.append(entry)

neoc_df = pd.DataFrame(data_neoc)

## Combination

In [8]:
# Stack both sources, parse posted_date as day-first dates, order newest-first,
# and renumber the rows.
documents = (
    pd.concat([ndma_df, neoc_df])
    .assign(posted_date=lambda frame: pd.to_datetime(frame['posted_date'], dayfirst=True))
    .sort_values(by='posted_date', ascending=False)
    .reset_index(drop=True)
)
# Shift the fresh 0-based index so that numbering starts at 1.
documents.index = documents.index + 1

print(len(documents))
print(documents.head())

507
  source posted_date                                    title  \
1   NDMA  2025-10-30  Depression Over Eastcentral Arabian Sea   
2   NDMA  2025-10-29  Depression over Eastcentral Arabian Sea   
3   NEOC  2025-10-28              Smog advisory November 2025   
4   NDMA  2025-10-28  Depression Over Eastcentral Arabian Sea   
5   NDMA  2025-10-27    Depression Over Southeast Arabian Sea   

                                                 url              filename  \
1  https://www.ndma.gov.pk/storage/advisories/Oct...  GunJIH11BbS0vuehIHZa   
2  https://www.ndma.gov.pk/storage/advisories/Oct...  EzVU14xQb58TstnYb0yV   
3  https://www.ndma.gov.pk/storage/projection-imp...  InXkmGJqQbCXx7aRyjXA   
4  https://www.ndma.gov.pk/storage/advisories/Oct...  OQHWo7p3Rv9xxbbgZ9BM   
5  https://www.ndma.gov.pk/storage/advisories/Oct...  mgmlxxEXovz0ajEhV77A   

  filetype  
1      pdf  
2      pdf  
3      pdf  
4      pdf  
5      pdf  


In [9]:
# Write the combined table to disk; the DataFrame index is written out under
# the column header "id".
documents.to_csv(
    "../Scrapers/ndma_neoc_combined_bulk.csv",
    index_label="id",
)

## Upload

### Connection

In [None]:
# Load credentials from the local env file; override=True lets values in the
# file replace variables that are already set in the process environment.
load_dotenv('../local.env', override=True)

SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_KEY = os.getenv("SUPABASE_KEY")
SUPABASE_DB_URL = os.getenv("SUPABASE_DB_URL")

# os.getenv returns None for unset variables. Fail fast with a message that
# names the missing settings instead of handing None/empty values to the
# client constructors below.
# NOTE(review): an empty conninfo presumably makes psycopg fall back to libpq
# defaults (i.e. a different database) - confirm; either way it is not wanted here.
_missing_settings = [
    name
    for name, value in (
        ("SUPABASE_URL", SUPABASE_URL),
        ("SUPABASE_KEY", SUPABASE_KEY),
        ("SUPABASE_DB_URL", SUPABASE_DB_URL),
    )
    if not value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_settings)
        + " (expected in ../local.env)"
    )

# REST client plus a direct Postgres connection/cursor; `conn` and `cur` are
# used by the upload cell.
supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
conn = psycopg.connect(SUPABASE_DB_URL)
cur = conn.cursor()
print("Connected to Supabase")

### Upload

In [None]:
# Bulk-insert every row of `documents` into the `documents` table using the
# connection and cursor opened in the connection cell.
#
# NOTE(review): the conflict target is `id`, but `id` is not in the insert
# column list. If the table generates `id` itself (TODO confirm against the
# schema), this clause can never fire and re-running this cell inserts
# duplicate rows; a unique key on e.g. `url` would be needed for real dedup.
INSERT_QUERY = """
    INSERT INTO documents (source, posted_date, title, url, filename, filetype)
    VALUES (%s, %s, %s, %s, %s, %s)
    ON CONFLICT(id) DO NOTHING
"""

# Column order must match the placeholder order in INSERT_QUERY.
INSERT_COLUMNS = ("source", "posted_date", "title", "url", "filename", "filetype")

# Missing cells come out of pandas as NaN/NaT; send them as None so they are
# stored as SQL NULL rather than being passed to the driver as-is.
values = [
    tuple(None if pd.isna(row[column]) else row[column] for column in INSERT_COLUMNS)
    for _, row in documents.iterrows()
]

print(f"Inserting {len(values)} records...")
try:
    cur.executemany(INSERT_QUERY, values)
    conn.commit()
    print("Insertion successful")
except Exception:
    # Undo the partial batch so nothing half-written is left pending, then
    # surface the original error to the notebook.
    conn.rollback()
    raise
finally:
    # Release the cursor and connection whether or not the insert succeeded;
    # re-run the connection cell before running this cell again.
    cur.close()
    conn.close()