In [None]:
import requests
from bs4 import BeautifulSoup
import os
from concurrent.futures import ThreadPoolExecutor

url = "https://www.ncei.noaa.gov/data/oceans/argo/gadr/data/indian/2019/01/"
folder = "./data"
os.makedirs(folder, exist_ok=True)

# Get list of .nc files
r = requests.get(url)
from bs4 import BeautifulSoup
soup = BeautifulSoup(r.text, "html.parser")
files = [node.get("href") for node in soup.find_all("a") if node.get("href").endswith(".nc")]

def download_file(f):
    r_file = requests.get(url + f)
    with open(os.path.join(folder, f), "wb") as file:
        file.write(r_file.content)
    print("Downloaded:", f)

# Download in parallel using 5 threads
with ThreadPoolExecutor(max_workers=5) as executor:
    executor.map(download_file, files)

print("Done")


In [None]:
import xarray as xr
import pandas as pd
import glob
import os

input_folder = r"C:\Users\shubh\Downloads\Classroom\Hackathons\SIH 2025\ARGO\Test\data"
output_folder = r"C:\Users\shubh\Downloads\Classroom\Hackathons\SIH 2025\ARGO\Test\processed"
os.makedirs(output_folder, exist_ok=True)
output_file = os.path.join(output_folder, "argo_indian_2019_01.parquet")

def process_nc(file_path):
    ds = xr.open_dataset(file_path)
    dfs = []

    n_levels = ds.sizes['n_levels']
    n_prof = ds.sizes['n_prof']

    for p in range(n_prof):
        # Get profile-level info
        lat = float(ds['latitude'].values[p])
        lon = float(ds['longitude'].values[p])
        juld = pd.Timestamp(ds['juld'].values[p])
        float_id = str(ds['platform_number'].values[p])
        cycle = int(ds['cycle_number'].values[p])
        
        # Profile data per level
        pres = ds['pres'].values[p, :].flatten()
        temp = ds['temp'].values[p, :].flatten()
        psal = ds['psal'].values[p, :].flatten()

        df = pd.DataFrame({
            "float_id": [float_id]*n_levels,
            "cycle": [cycle]*n_levels,
            "latitude": [lat]*n_levels,
            "longitude": [lon]*n_levels,
            "time": [juld]*n_levels,
            "pressure": pres,
            "temperature": temp,
            "salinity": psal
        })
        dfs.append(df)
    
    return pd.concat(dfs, ignore_index=True)

# ----------------------------
# Process all files
# ----------------------------
all_files = glob.glob(os.path.join(input_folder, "*.nc"))
all_dfs = []

print(f"Found {len(all_files)} NetCDF files. Processing...")

for f in all_files:
    try:
        df = process_nc(f)
        all_dfs.append(df)
        print(f"Processed: {os.path.basename(f)}")
    except Exception as e:
        print(f"Error processing {f}: {e}")

if all_dfs:
    argo_df = pd.concat(all_dfs, ignore_index=True)
    argo_df.to_parquet(output_file, index=False)
    print(f"Done! Combined data saved to: {output_file}")
else:
    print("No data processed. Check your files.")


In [1]:
import pandas as pd

df = pd.read_parquet(r"C:\Users\shubh\Downloads\Classroom\Hackathons\SIH 2025\ARGO\Test\processed\argo_indian_2019_01.parquet")
print(df.head())


      float_id  cycle  latitude  longitude                          time  \
0  b'1900975 '    339   -48.136    128.389 2019-01-06 19:10:56.002052864   
1  b'1900975 '    339   -48.136    128.389 2019-01-06 19:10:56.002052864   
2  b'1900975 '    339   -48.136    128.389 2019-01-06 19:10:56.002052864   
3  b'1900975 '    339   -48.136    128.389 2019-01-06 19:10:56.002052864   
4  b'1900975 '    339   -48.136    128.389 2019-01-06 19:10:56.002052864   

   pressure  temperature   salinity  
0       4.2        9.198  34.480000  
1      10.2        9.192  34.478001  
2      17.4        9.206  34.481998  
3      24.6        9.211  34.483002  
4      31.1        9.194  34.478001  


In [2]:
len(df)

1374813

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1374813 entries, 0 to 1374812
Data columns (total 8 columns):
 #   Column       Non-Null Count    Dtype         
---  ------       --------------    -----         
 0   float_id     1374813 non-null  object        
 1   cycle        1374813 non-null  int64         
 2   latitude     1374813 non-null  float64       
 3   longitude    1374813 non-null  float64       
 4   time         1374813 non-null  datetime64[ns]
 5   pressure     1089903 non-null  float32       
 6   temperature  1085000 non-null  float32       
 7   salinity     1084049 non-null  float32       
dtypes: datetime64[ns](1), float32(3), float64(2), int64(1), object(1)
memory usage: 68.2+ MB


In [4]:
df.describe()

Unnamed: 0,cycle,latitude,longitude,time,pressure,temperature,salinity
count,1374813.0,1374813.0,1374813.0,1374813,1089903.0,1085000.0,1084049.0
mean,116.4047,-30.15141,92.98543,2019-01-16 12:07:59.426492928,811.0287,8.489975,34.59978
min,1.0,-64.434,20.5816,2019-01-01 00:41:32.640000,-3171.2,-1.709,-4.01
25%,80.0,-42.44269,70.108,2019-01-08 22:04:32.999994368,240.0,3.298,34.5762
50%,104.0,-31.8438,95.36275,2019-01-16 11:50:15.999989248,724.0,6.586,34.708
75%,157.0,-18.219,113.849,2019-01-24 01:18:39.168000,1312.0,11.386,34.964
max,492.0,18.601,144.876,2019-01-31 23:46:43.000017920,5692.4,59.253,53.042
std,67.82093,17.32301,28.65404,,610.3844,6.417424,2.13145


In [5]:
df.isnull().sum()

float_id            0
cycle               0
latitude            0
longitude           0
time                0
pressure       284910
temperature    289813
salinity       290764
dtype: int64

In [6]:
import pandas as pd
import numpy as np

# assume df already loaded from parquet (like you showed)
# df columns: float_id, cycle, latitude, longitude, time, pressure, temperature, salinity

# clean float_id bytes-looking strings
df['float_id'] = df['float_id'].astype(str).str.strip().str.strip("b'").str.strip("'")

# Create profile-level summary doc
grouped = df.groupby(['float_id', 'cycle'], as_index=False)

def make_profile_doc(g):
    n_levels = len(g)
    lat = float(g['latitude'].iloc[0])
    lon = float(g['longitude'].iloc[0])
    time = pd.to_datetime(g['time'].iloc[0])
    p_min = float(g['pressure'].min()) if 'pressure' in g and g['pressure'].notnull().any() else None
    p_max = float(g['pressure'].max()) if 'pressure' in g and g['pressure'].notnull().any() else None
    t_min = float(g['temperature'].min()) if 'temperature' in g and g['temperature'].notnull().any() else None
    t_max = float(g['temperature'].max()) if 'temperature' in g and g['temperature'].notnull().any() else None
    s_min = float(g['salinity'].min()) if 'salinity' in g and g['salinity'].notnull().any() else None
    s_max = float(g['salinity'].max()) if 'salinity' in g and g['salinity'].notnull().any() else None
    # textual summary: short, factual
    summary = (
        f"Float {g['float_id'].iloc[0]} cycle {g['cycle'].iloc[0]} at ({lat:.3f},{lon:.3f}) "
        f"on {time.date()} with {n_levels} levels; p_range={p_min}-{p_max}, "
        f"temp={t_min}-{t_max}, sal={s_min}-{s_max}."
    )
    return {
        "float_id": g['float_id'].iloc[0],
        "cycle": int(g['cycle'].iloc[0]),
        "latitude": lat,
        "longitude": lon,
        "time": pd.to_datetime(time),
        "n_levels": int(n_levels),
        "p_min": p_min,
        "p_max": p_max,
        "t_min": t_min,
        "t_max": t_max,
        "s_min": s_min,
        "s_max": s_max,
        "summary": summary
    }

# Build docs list (this is memory-sensitive for huge data — see scaling below)
docs = []
for (fid, cyc), grp in df.groupby(['float_id', 'cycle']):
    docs.append(make_profile_doc(grp))

meta_df = pd.DataFrame(docs)
meta_df.to_parquet("argo_profiles_metadata.parquet", index=False)
print("Wrote argo_profiles_metadata.parquet, entries:", len(meta_df))


Wrote argo_profiles_metadata.parquet, entries: 1459


In [7]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pandas as pd
import json
import os

meta_df = pd.read_parquet("argo_profiles_metadata.parquet")

# choose embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")  # small & fast

# prepare texts (summary or combined fields)
texts = meta_df['summary'].astype(str).tolist()

# compute embeddings in batches
batch_size = 256
embs = []
for i in range(0, len(texts), batch_size):
    emb_batch = model.encode(texts[i:i+batch_size], show_progress_bar=False, convert_to_numpy=True)
    embs.append(emb_batch)
embs = np.vstack(embs).astype('float32')  # FAISS uses float32

# Build FAISS index (L2 or inner product; we'll normalize and use inner product)
d = embs.shape[1]
index = faiss.IndexFlatIP(d)  # inner product
# normalize vectors to unit length for cosine similarity with IP
faiss.normalize_L2(embs)
index.add(embs)

# persist index and metadata easily
faiss.write_index(index, "faiss_argo_profiles.index")
meta_df.to_parquet("argo_profiles_metadata_with_index.parquet", index=False)
# also save original summaries in a jsonl for quick lookup if desired
meta_df.to_json("argo_profiles_metadata.jsonl", orient="records", lines=True)
print("Saved FAISS index and metadata. vectors:", embs.shape)



Saved FAISS index and metadata. vectors: (1459, 384)


In [8]:
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

index = faiss.read_index("faiss_argo_profiles.index")
meta_df = pd.read_parquet("argo_profiles_metadata_with_index.parquet")
model = SentenceTransformer("all-MiniLM-L6-v2")

def retrieve(query, top_k=5):
    q_emb = model.encode([query], convert_to_numpy=True).astype('float32')
    faiss.normalize_L2(q_emb)
    D, I = index.search(q_emb, top_k)
    results = []
    for idx, score in zip(I[0], D[0]):
        rec = meta_df.iloc[idx].to_dict()
        rec['score'] = float(score)
        results.append(rec)
    return results
# example query
query = "Find profiles with temperature above 20 degrees and salinity below 35"
results = retrieve(query, top_k=5)
for r in results:
    print(r)


{'float_id': '3901958 ', 'cycle': 40, 'latitude': -58.18794682152788, 'longitude': 46.62004344838868, 'time': Timestamp('2019-01-30 21:00:30'), 'n_levels': 1208, 'p_min': 0.20000000298023224, 'p_max': 2025.699951171875, 't_min': -0.7950000166893005, 't_max': 2.0309998989105225, 's_min': 33.78900146484375, 's_max': 34.74100112915039, 'summary': 'Float 3901958  cycle 40 at (-58.188,46.620) on 2019-01-30 with 1208 levels; p_range=0.20000000298023224-2025.699951171875, temp=-0.7950000166893005-2.0309998989105225, sal=33.78900146484375-34.74100112915039.', 'score': 0.4799206852912903}
{'float_id': '3901649 ', 'cycle': 37, 'latitude': -44.489, 'longitude': 25.753, 'time': Timestamp('2019-01-17 05:54:00'), 'n_levels': 192, 'p_min': 0.0, 'p_max': 1985.0, 't_min': 2.5280001163482666, 't_max': 8.894000053405762, 's_min': 33.81999969482422, 's_max': 34.755001068115234, 'summary': 'Float 3901649  cycle 37 at (-44.489,25.753) on 2019-01-17 with 192 levels; p_range=0.0-1985.0, temp=2.528000116348266

In [9]:
from openai import OpenAI
import json
from dotenv import load_dotenv
load_dotenv()
import os

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

system_prompt = """
You are an assistant that extracts a structured query object (JSON) from a user's request.
Output only valid JSON matching this schema:
{
  "intent": "<one of: filter_profiles, compare_regions, nearest_floats, summary>",
  "filters": {
    "lat_range": [min_lat, max_lat] or null,
    "lon_range": [min_lon, max_lon] or null,
    "start_date": "YYYY-MM-DD" or null,
    "end_date": "YYYY-MM-DD" or null,
    "variables": ["temperature","salinity","pressure","oxygen"] or null,
    "depth_range": [min_m, max_m] or null,
    "float_ids": ["1900975", ...] or null
  },
  "visualization": "<one of: profile_plot, map, timeseries, table>",
  "k": 50
}
"""

user_query = "Show salinity profiles near the equator in March 2023"

response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_query},
    ]
)

# Extract JSON string
mcp_json_str = response.choices[0].message.content
mcp = json.loads(mcp_json_str)
print(mcp)


{'intent': 'filter_profiles', 'filters': {'lat_range': [-2.0, 2.0], 'lon_range': None, 'start_date': '2023-03-01', 'end_date': '2023-03-31', 'variables': ['salinity'], 'depth_range': None, 'float_ids': None}, 'visualization': 'profile_plot', 'k': 50}


In [12]:
import pandas as pd
import pyarrow.parquet as pq
import json
from datetime import datetime

# def execute_structured_query(qobj, parquet_path="argo_indian_2019_01.parquet"):
def execute_structured_query(qobj, parquet_path=r"C:\Users\shubh\Downloads\Classroom\Hackathons\SIH 2025\ARGO\Test\argo_profiles_metadata_with_index.parquet"):
    # For efficiency, load only metadata or use pyarrow dataset scanner + filters
    # We'll show an in-memory example for clarity: read profile metadata, filter, then read full levels as needed.
    meta = pd.read_parquet("argo_profiles_metadata_with_index.parquet")
    f = qobj.get("filters", {})
    # apply lat/lon filters
    if f.get("lat_range"):
        minlat, maxlat = f["lat_range"]
        meta = meta[(meta.latitude >= minlat) & (meta.latitude <= maxlat)]
    if f.get("lon_range"):
        minlon, maxlon = f["lon_range"]
        meta = meta[(meta.longitude >= minlon) & (meta.longitude <= maxlon)]
    if f.get("start_date"):
        start = pd.to_datetime(f["start_date"])
        meta = meta[meta.time >= start]
    if f.get("end_date"):
        end = pd.to_datetime(f["end_date"])
        meta = meta[meta.time <= end]
    if f.get("float_ids"):
        meta = meta[meta.float_id.isin(f["float_ids"])]
    # limit number of profiles
    k = qobj.get("k", 50)
    sel = meta.sort_values("time").head(k)
    # now read the raw levels for these profiles from the full parquet (df)
    full = pd.read_parquet(parquet_path)
    merged = full.merge(sel[['float_id','cycle']], on=['float_id','cycle'], how='inner')
    # apply depth filter to pressure if provided
    if f.get("depth_range") and 'pressure' in merged.columns:
        pmin, pmax = f['depth_range']
        merged = merged[(merged.pressure >= pmin) & (merged.pressure <= pmax)]
    return merged
# example usage
qobj = {
  "intent": "filter_profiles",
  "filters": {
    "lat_range": [-5, 5],
    "lon_range": [30, 60],
    "start_date": "2019-03-01",
    "end_date": "2019-03-31",
    "variables": ["salinity"],
    "depth_range": [0, 200],
    "float_ids": None
  },
  "visualization": "profile_plot",
  "k": 10
}
df = execute_structured_query(qobj)
print(df.head())

Empty DataFrame
Columns: [float_id, cycle, latitude, longitude, time, n_levels, p_min, p_max, t_min, t_max, s_min, s_max, summary]
Index: []


In [14]:
# Using OpenAI (pseudo)
from openai import OpenAI
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def llm_get_structured_json(prompt_text):
    resp = client.chat.create(model="gpt-4o-mini", messages=[{"role":"system","content":"...instructions..."},
                                                           {"role":"user","content":prompt_text}])
    text = resp.choices[0].message.content
    # If the model returned extra text, try to extract JSON
    import re, json
    m = re.search(r'\{.*\}$', text, re.S)
    raw = m.group(0) if m else text
    return json.loads(raw)


In [15]:
# assume retrieve() defined earlier, and you have an LLM client function llm_get_structured_json(query_text)
# llm_get_structured_json calls your LLM and returns parsed json

def rag_query_and_run(user_query):
    # 1) retrieval to give context
    docs = retrieve(user_query, top_k=6)
    # combine top docs as context for LLM (short)
    context = "\n".join([d['summary'] for d in docs])
    # 2) build prompt with context + user query
    prompt = (
        f"Context documents:\n{context}\n\n"
        f"User request: {user_query}\n\n"
        "Extract a JSON structured query object (see schema). Output only JSON."
    )
    # 3) call LLM to get JSON (you can use OpenAI or local)
    qobj = llm_get_structured_json(prompt)  # you must implement this using your LLM provider
    # 4) execute structured query
    df_result = execute_structured_query(qobj, parquet_path="argo_indian_2019_01.parquet")
    return qobj, df_result, docs

# example usage
rag_query_and_run("Show salinity profiles near the equator in March 2023")

AttributeError: 'Chat' object has no attribute 'create'

In [16]:
import streamlit as st
import plotly.express as px

st.title("FloatChat — ARGO RAG Demo (Parquet-only)")

user_q = st.text_input("Ask about ARGO data (e.g., 'Show salinity near equator March 2019')")

if st.button("Run"):
    with st.spinner("Running RAG..."):
        qobj, df_res, docs = rag_query_and_run(user_q)
    st.subheader("Structured Query")
    st.json(qobj)
    if df_res.empty:
        st.info("No matching profiles found.")
    else:
        st.subheader("Sample rows")
        st.dataframe(df_res.head(200))

        # profile plot example: salinity vs pressure per profile
        if 'salinity' in df_res.columns:
            fig = px.line(df_res, x='salinity', y='pressure', color='float_id', line_group=['float_id','cycle'],
                          labels={'pressure':'Pressure (dbar)', 'salinity':'Salinity PSU'},
                          title='Salinity profiles (pressure increasing downward)')
            fig.update_yaxes(autorange='reversed')  # pressure downwards
            st.plotly_chart(fig, use_container_width=True)

        # map of profile locations
        map_df = df_res.groupby(['float_id','cycle','latitude','longitude']).size().reset_index()
        st.map(map_df.rename(columns={'latitude':'lat','longitude':'lon'}).loc[:,['lat','lon']])


2025-09-23 01:53:39.246 
  command:

    streamlit run c:\Users\shubh\Downloads\AI ML\venv\lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
2025-09-23 01:53:39.251 Session state does not function when running a script without `streamlit run`
