In [70]:
import requests
import pandas as pd
import os
from datetime import datetime, timedelta
from tqdm.notebook import tqdm  # Use tqdm from the notebook module
import asyncio
import aiohttp
import re
import calendar
import pandas as pd


In [71]:
def check_missing_data(csv_file_path, year, month):
    # Load the CSV into a DataFrame
    df = pd.read_csv(csv_file_path)
    month = int(month)

    # Convert the 'Timestamp' column to datetime
    df['Timestamp'] = pd.to_datetime(df['Timestamp'])

    # Define date bounds
    lower_bound = pd.to_datetime("2020-04-20").date()
    upper_bound = pd.to_datetime("2025-02-17").date()

    # Generate full date range for the month
    _, num_days = calendar.monthrange(year, month)
    full_range = pd.date_range(start=f"{year}-{month:02d}-01", 
                               end=f"{year}-{month:02d}-{num_days}", 
                               freq='D').date

    # Filter the range within bounds
    full_range = [d for d in full_range if lower_bound <= d <= upper_bound]

    # Create a new column for the date part of the timestamp
    df['Date'] = df['Timestamp'].dt.date

    # Group by the date and count the number of rows per day
    daily_counts = df.groupby('Date').size()

    # Identify days with fewer than 288 entries
    partial_days = daily_counts[daily_counts < 288].index.tolist()

    # Identify days completely missing
    missing_days = sorted(set(full_range) - set(daily_counts.index))

    # Combine and return all missing or partial days
    return sorted(partial_days + missing_days)


In [72]:
check_missing_data(r"D:\Documents\Term 8\Deep Learning\Lightning_Risk_Prediction\data\data_gov_sg\rainfall_data\rainfall_2020-06.csv", 2020, "06")

[datetime.date(2020, 6, 8),
 datetime.date(2020, 6, 9),
 datetime.date(2020, 6, 10),
 datetime.date(2020, 6, 11)]

In [73]:
async def get_data_async(date_time, data_set_name, session):
    base_url = f"https://api-open.data.gov.sg/v2/real-time/api/{data_set_name}?date={date_time}"
    all_data = []

    while True:
        # Asynchronously fetch the data
        async with session.get(base_url) as response:
            data = await response.json()
            readings = data.get('data', {}).get('readings', [])

            if readings:
                for reading in readings:
                    reading_data = reading.get('data', [])
                    for entry in reading_data:
                        entry['Timestamp'] = reading.get('timestamp')  # Add Timestamp for each entry
                        all_data.append(entry)
            else:
                break  # No readings, exit loop

            # Check for paginationToken and update URL for next request
            pagination_token = data.get('data', {}).get('paginationToken')
            if pagination_token:
                base_url = f"https://api-open.data.gov.sg/v2/real-time/api/{data_set_name}?date={date_time}&paginationToken={pagination_token}"
            else:
                break  # No more pages

    # Convert the collected data into a DataFrame
    df = pd.DataFrame(all_data)

    # Pivot table if data is available
    if not df.empty:
        pivot_table = df.pivot_table(index='Timestamp', columns='stationId', values='value', aggfunc='first')
        return pivot_table
    else:
        return pd.DataFrame()  # Return empty DataFrame if no data

In [None]:
async def check_missing_month(data_set_name, year, month):
    # Format the CSV file path properly
    csv_file_path = f"../data/data_gov_sg/{data_set_name}_data/{data_set_name}_{year}-{month}.csv"

    # Check for missing data days
    missing_days = check_missing_data(csv_file_path, year, month)

    # Load the existing CSV data
    old_data = pd.read_csv(csv_file_path)

    # Total number of days with missing data
    total_days = len(missing_days)
    new_data = []

    if total_days == 0:
        print(f"Data for {year}-{month}. Already Clean!")
        return None
    
    async with aiohttp.ClientSession() as session:
        with tqdm(total=total_days) as pbar:
            # Iterate over each missing day and attempt to fetch the data
            for missing_day in missing_days:
                timestamp_str = missing_day.strftime("%Y-%m-%d")
                try:
                    # Fetch the data asynchronously
                    df = await get_data_async(timestamp_str, data_set_name, session)
                    if not df.empty:
                        new_data.append(df)
                    pbar.update(1)  # Update progress bar for each day

                except Exception as e:
                    print(f"Error fetching {timestamp_str}: {e}. Retrying...")
                    await asyncio.sleep(5)  # Wait before retrying
                    try:
                        df = await get_data_async(timestamp_str, data_set_name, session)
                        if not df.empty:
                            new_data.append(df)
                            if(len(df)!=288):
                                print(timestamp_str,"only have",len(df),"datapoints")
                        pbar.update(1)
                    except Exception as e2:
                        print(f"Failed to fetch {timestamp_str}: {e2}")
                        await asyncio.sleep(8)  # Longer delay for retry failure
                        pbar.update(1)
                await asyncio.sleep(2)  # Introduce a 2-second delay between requests

        # If new data was successfully fetched, concatenate it with old data and save
        if new_data:
            # Concatenate old and new data
            combined = pd.concat([old_data] + new_data)
            
            # Remove duplicates based on the 'Timestamp' and 'stationId' columns
            combined = combined.drop_duplicates(subset=['Timestamp'])
            
            # Sort by 'Timestamp' in ascending order
            combined = combined.sort_values(by='Timestamp', ascending=True)
            
            # Save the combined data to CSV
            combined.to_csv(csv_file_path, index=False)

    print(f"Finished processing missing data for {year}-{month}.")


In [80]:
async def check_dataset(data_set_name):
    folder_path = f"../data/data_gov_sg/{data_set_name}_data/"

    # Get all CSV files in the folder
    try:
        files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
    except Exception as e:
        print(f"Error reading directory {folder_path}: {e}")
        return

    # List to hold tasks for concurrent execution
    tasks = []

    # Extract year and month from each filename and check for missing data
    for file in files:
        # Regex to strictly match filenames like "rainfall_2020-01.csv", "rainfall_2020-12.csv"
        match = re.match(rf"{data_set_name}_(\d{{4}})-(\d{{2}})\.csv", file)
        if match:
            year = int(match.group(1))
            month = match.group(2)  # Keep the month as a two-digit string
            
            # Add the task to the list (running check_missing_month concurrently)
            tasks.append(check_missing_month(data_set_name, year, month))
        else:
            print(f"Skipping file with invalid format: {file}")
    
    # Run all tasks concurrently
    if tasks:
        await asyncio.gather(*tasks)
    else:
        print("No valid files to process")

In [81]:
await check_dataset("relative-humidity")

Data for 2020-04. Already Clean!
Data for 2020-05. Already Clean!


  0%|          | 0/2 [00:00<?, ?it/s]

Data for 2020-07. Already Clean!
Data for 2020-08. Already Clean!
Data for 2020-09. Already Clean!
Data for 2020-10. Already Clean!
Data for 2020-11. Already Clean!
Data for 2020-12. Already Clean!


  0%|          | 0/4 [00:00<?, ?it/s]

Data for 2021-02. Already Clean!
Data for 2021-03. Already Clean!
Data for 2021-04. Already Clean!
Data for 2021-05. Already Clean!
Data for 2021-06. Already Clean!
Data for 2021-07. Already Clean!
Data for 2021-08. Already Clean!
Data for 2021-09. Already Clean!
Data for 2021-10. Already Clean!
Data for 2021-11. Already Clean!
Data for 2021-12. Already Clean!
Data for 2022-01. Already Clean!
Data for 2022-02. Already Clean!
Data for 2022-03. Already Clean!
Data for 2022-04. Already Clean!
Data for 2022-05. Already Clean!
Data for 2022-06. Already Clean!
Data for 2022-07. Already Clean!
Data for 2022-08. Already Clean!
Data for 2022-09. Already Clean!
Data for 2022-10. Already Clean!
Data for 2022-11. Already Clean!
Data for 2022-12. Already Clean!


  0%|          | 0/8 [00:00<?, ?it/s]

Data for 2023-02. Already Clean!
Data for 2023-03. Already Clean!
Data for 2023-04. Already Clean!
Data for 2023-05. Already Clean!
Data for 2023-06. Already Clean!
Data for 2023-07. Already Clean!
Data for 2023-08. Already Clean!
Data for 2023-09. Already Clean!
Data for 2023-10. Already Clean!
Data for 2023-11. Already Clean!
Data for 2023-12. Already Clean!
Data for 2024-01. Already Clean!
Data for 2024-02. Already Clean!
Data for 2024-03. Already Clean!
Data for 2024-04. Already Clean!
Data for 2024-05. Already Clean!
Data for 2024-06. Already Clean!
Data for 2024-07. Already Clean!
Data for 2024-08. Already Clean!
Data for 2024-09. Already Clean!
Data for 2024-10. Already Clean!
Data for 2024-11. Already Clean!
Data for 2024-12. Already Clean!
Data for 2025-01. Already Clean!
Data for 2025-02. Already Clean!
Data for 2025-03. Already Clean!
Data for 2025-04. Already Clean!
Error fetching 2021-01-02: 'NoneType' object has no attribute 'get'. Retrying...
Error fetching 2023-01-02: '