Project: Nexus Phase 1 Ingest

In [7]:
!pip install requests pandas pyarrow fastparquet --quiet

import json, time, os
import requests
import pandas as pd
import numpy as np
from pyspark.sql import functions as F
from datetime import datetime, timedelta


StatementMeta(, 8c232123-2c0d-438f-8bd9-a6d57996b0fd, 9, Finished, Available, Finished)

In [8]:
# Choose a station ID (KSEA = Seattle-Tacoma Intl)
station_id = "KSEA"  
url = f"https://api.weather.gov/stations/{station_id}/observations?limit=100"  # small limit

resp = requests.get(url, headers={"User-Agent":"fabric-nexus-demo"})
resp.raise_for_status()
j = resp.json()
rows = []
for item in j.get("features", []):
    props = item.get("properties", {})
    rows.append({
        "source":"noaa",
        "station": station_id,
        "ts": props.get("timestamp"),
        "temp_C": props.get("temperature", {}).get("value"),
        "wind_speed_mps": props.get("windSpeed", {}).get("value"),
        "text_description": props.get("textDescription"),
    })

noaa_df = pd.DataFrame(rows)
noaa_df['ts'] = pd.to_datetime(noaa_df['ts'])
noaa_df.head()


StatementMeta(, 8c232123-2c0d-438f-8bd9-a6d57996b0fd, 10, Finished, Available, Finished)

Unnamed: 0,source,station,ts,temp_C,wind_speed_mps,text_description
0,noaa,KSEA,2025-09-18 11:15:00+00:00,13.0,0.0,Clear
1,noaa,KSEA,2025-09-18 11:10:00+00:00,13.0,0.0,Clear
2,noaa,KSEA,2025-09-18 11:05:00+00:00,13.0,0.0,Clear
3,noaa,KSEA,2025-09-18 11:00:00+00:00,13.0,5.544,Clear
4,noaa,KSEA,2025-09-18 10:55:00+00:00,13.0,5.544,Clear


In [9]:
owner = "AnswerDotAI"  # example owner
repo = "ghapi"       # example repo replace with a repo you prefer
per_page = 100
page = 1

events = []
for p in range(1, 4):  # 3 pages * 100 = 300 events max 
    url = f"https://api.github.com/repos/{owner}/{repo}/events?per_page={per_page}&page={p}"
    r = requests.get(url, headers={"User-Agent":"fabric-nexus-demo"})
    if r.status_code != 200:
        break
    page_events = r.json()
    if not page_events:
        break
    for e in page_events:
        events.append({
            "source":"github",
            "repo": f"{owner}/{repo}",
            "event_type": e.get("type"),
            "actor": e.get("actor", {}).get("login"),
            "created_at": e.get("created_at"),
            "payload": json.dumps(e.get("payload", {}))[:2000]  # truncate payload for demo
        })
    time.sleep(0.2) 
github_df = pd.DataFrame(events)
github_df['created_at'] = pd.to_datetime(github_df['created_at'])
github_df.head()


StatementMeta(, 8c232123-2c0d-438f-8bd9-a6d57996b0fd, 11, Finished, Available, Finished)

Unnamed: 0,source,repo,event_type,actor,created_at,payload
0,github,AnswerDotAI/ghapi,IssueCommentEvent,supakeen,2025-09-17 07:56:29+00:00,"{""action"": ""created"", ""issue"": {""url"": ""https:..."
1,github,AnswerDotAI/ghapi,IssuesEvent,supakeen,2025-09-17 07:56:28+00:00,"{""action"": ""closed"", ""issue"": {""url"": ""https:/..."
2,github,AnswerDotAI/ghapi,IssueCommentEvent,supakeen,2025-09-17 07:55:49+00:00,"{""action"": ""created"", ""issue"": {""url"": ""https:..."
3,github,AnswerDotAI/ghapi,PullRequestEvent,supakeen,2025-09-17 07:55:49+00:00,"{""action"": ""closed"", ""number"": 198, ""pull_requ..."
4,github,AnswerDotAI/ghapi,IssueCommentEvent,KeremTurgutlu,2025-09-17 06:05:25+00:00,"{""action"": ""created"", ""issue"": {""url"": ""https:..."


In [10]:
noaa_df = noaa_df.rename(columns={"ts":"event_ts"})
noaa_df['ingest_ts'] = pd.Timestamp.utcnow()
github_df = github_df.rename(columns={"created_at":"event_ts"})
github_df['ingest_ts'] = pd.Timestamp.utcnow()

# small sample up to 2000 rows total
events_df = pd.concat([noaa_df, github_df], ignore_index=True, sort=False)
events_df = events_df.sample(n=min(len(events_df), 2000), random_state=42).reset_index(drop=True)
events_df.shape

StatementMeta(, 8c232123-2c0d-438f-8bd9-a6d57996b0fd, 12, Finished, Available, Finished)

(139, 11)

In [11]:
spark_df = spark.createDataFrame(events_df)
spark_df = spark_df.withColumn("ingest_ts", F.current_timestamp())
spark_df.write.format("delta").mode("overwrite").saveAsTable("raw_events_demo")

StatementMeta(, 8c232123-2c0d-438f-8bd9-a6d57996b0fd, 13, Finished, Available, Finished)