In [1]:
!pip install boto3 tqdm --quiet
import requests
import pandas as pd
import os
import json
import boto3
import time
from datetime import datetime, timedelta
from botocore.exceptions import NoCredentialsError



[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.0/14.0 MB[0m [31m103.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# --------------------------
# 1. Setup directories
# --------------------------
RAW_DIR = "staging/raw/api/"
CSV_FILE = "staging/processed/covid_pakistan.csv"

os.makedirs(RAW_DIR, exist_ok=True)
os.makedirs(os.path.dirname(CSV_FILE), exist_ok=True)


In [None]:

# --------------------------
# 2. AWS S3 Setup
# --------------------------
AWS_ACCESS_KEY = ""
AWS_SECRET_KEY = ""
REGION = ""
BUCKET_NAME = "dataingestionandstoring"

s3 = boto3.client(
    "s3",
    aws_access_key_id=AWS_ACCESS_KEY,
    aws_secret_access_key=AWS_SECRET_KEY,
    region_name=REGION
)


In [4]:
# --------------------------
# 3. Function to fetch & save one day's data
# --------------------------
def fetch_and_save(date_str):
    url = f"https://covid-api.com/api/reports?date={date_str}&iso=PAK&region_name=Pakistan"
    try:
        response = requests.get(url)
        response.raise_for_status()
        data = response.json()

        # Save raw JSON
        raw_file = os.path.join(RAW_DIR, f"pakistan_{date_str}.json")
        with open(raw_file, "w") as f:
            json.dump(data, f, indent=2)

        # Flatten JSON → DataFrame
        if "data" in data and data["data"]:
            df = pd.json_normalize(data["data"])
            df["report_date"] = date_str
            return df
    except Exception as e:
        print(f"❌ Failed for {date_str}: {e}")
    return pd.DataFrame()

In [5]:
# --------------------------
# 4. Historical load (2020 → today)
# --------------------------
def initial_load():
    start_date = datetime(2020, 1, 1)
    end_date = datetime.utcnow()
    all_dfs = []

    current_date = start_date
    while current_date <= end_date:
        date_str = current_date.strftime("%Y-%m-%d")
        df = fetch_and_save(date_str)
        if not df.empty:
            all_dfs.append(df)
            print(f"✅ Got data for {date_str}")
        current_date += timedelta(days=1)

    if all_dfs:
        final_df = pd.concat(all_dfs, ignore_index=True)
        final_df.to_csv(CSV_FILE, index=False)
        print(f"\n✅ Initial CSV saved: {CSV_FILE} with {len(final_df)} rows")
        upload_to_s3(CSV_FILE)
    else:
        print("\n⚠️ No data fetched in initial load!")


In [10]:
# --------------------------
# 5. Upload to S3
# --------------------------
def upload_to_s3(local_file, s3_filename="covid_pakistan.csv"):
    """
    Uploads a file to S3 inside the raw/ folder of your bucket.
    Example: s3://dataingestionandstoring/raw/covid_pakistan.csv
    """
    try:
        s3_key = f"raw/{s3_filename}"   # upload into raw/ folder
        s3.upload_file(local_file, BUCKET_NAME, s3_key)
        print(f"☁️ Uploaded to s3://{BUCKET_NAME}/{s3_key}")
    except FileNotFoundError:
        print("❌ Local file not found:", local_file)
    except NoCredentialsError:
        print("❌ AWS credentials not available")


In [7]:

# --------------------------
# 6. Incremental updates (every 1 min)
# --------------------------
def incremental_updates():
    while True:
        today = datetime.utcnow().strftime("%Y-%m-%d")
        print(f"\n🔄 Checking API for {today} at {datetime.utcnow()}")

        df_new = fetch_and_save(today)
        if not df_new.empty:
            if os.path.exists(CSV_FILE):
                df_old = pd.read_csv(CSV_FILE)
                combined = pd.concat([df_old, df_new], ignore_index=True)
                combined = combined.drop_duplicates().reset_index(drop=True)
            else:
                combined = df_new

            combined.to_csv(CSV_FILE, index=False)
            print(f"✅ Updated CSV with {len(combined)} rows")

            upload_to_s3(CSV_FILE)
        else:
            print("ℹ️ No new data available")

        time.sleep(60)  # wait 1 min


In [11]:
upload_to_s3(CSV_FILE)

☁️ Uploaded to s3://dataingestionandstoring/raw/covid_pakistan.csv


In [None]:
incremental_updates()

  today = datetime.utcnow().strftime("%Y-%m-%d")
  print(f"\n🔄 Checking API for {today} at {datetime.utcnow()}")



🔄 Checking API for 2025-09-20 at 2025-09-20 21:28:04.893686
ℹ️ No new data available
