## Large Dataset: Integrated Surface Dataset (Global)

**Dataset source:** https://www.ncei.noaa.gov/metadata/geoportal/rest/metadata/item/gov.noaa.ncdc:C00532/html

**S3 URL:** The data are stored on S3 in the bucket s3://noaa-isd-pds/ as fixed-width text files (https://noaa-isd-pds.s3.amazonaws.com/index.html).

**Dataset description:** The Integrated Surface Dataset (**ISD**) is composed of worldwide surface weather observations from over 35,000 stations, though the best spatial coverage is evident in North America, Europe, Australia, and parts of Asia. Parameters included are: air quality, atmospheric pressure, atmospheric temperature/dew point, atmospheric winds, clouds, precipitation, ocean waves, tides and more. ISD refers to the data contained within the digital database as well as the format in which the hourly, synoptic (3-hourly), and daily weather observations are stored. The format conforms to Federal Information Processing Standards (FIPS). ISD provides hourly data that can be used in a wide range of climatological applications. For some stations, data may go as far back as 1901, though most data show a substantial increase in volume in the 1940s and again in the early 1970s. Currently, there are over 14,000 "active" stations updated daily in the database.

**Dataset Identifiers:** NCEI DSI 3505_03 

--------------------------------------------------------
--------------------------------------------------------

## Small Dataset: Integrated Surface Data - “Lite”
**Dataset description:** The ISD-Lite data contain a fixed-width formatted subset of the complete Integrated Surface Data (ISD) for a select number of observational elements. The data are typically stored in a single file corresponding to the ISD data, i.e. one file per station per year. 

ISD-Lite contains eight common hourly time-series climatological variables represented in a fixed-width format. The elements extracted are:
1. Air temperature (degrees Celsius * 10)
2. Dew point temperature (degrees Celsius * 10)
3. Sea level pressure (hectopascals)
4. Wind direction (angular degrees)
5. Wind speed (meters per second * 10)
6. Total cloud cover (coded, see format documentation)
7. One-hour accumulated liquid precipitation (millimeters)
8. Six-hour accumulated liquid precipitation (millimeters)

The ISD-Lite data are represented with a modified time stamp which corresponds to the nearest hour of actual observation. Sub-hourly observations were removed. Duplicate observations were resolved according to a ranking system.

Station information can be downloaded from: https://www.ncei.noaa.gov/maps/alltimes/

In [None]:
'''Extract data by NOAA FTP server and Transform'''
import os
import csv
import gzip
import json
from ftplib import FTP

def read_station_ids(file_path):
    """Read station identifiers from a CSV file.

    The first row is treated as a header and skipped. For every remaining
    non-empty row, the first column is taken and truncated to its first six
    characters.

    Args:
        file_path: Path to the stations CSV file.

    Returns:
        A list of station ID strings (at most six characters each) in file
        order. An empty or header-only file yields an empty list.
    """
    # newline='' is what the csv module expects so that quoted fields
    # containing line breaks are parsed correctly.
    with open(file_path, 'r', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # Skip header; tolerate a completely empty file
        # csv.reader yields [] for blank lines, so guard before indexing.
        station_ids = [row[0][:6] for row in reader if row]

    print(f"Read {len(station_ids)} station IDs")
    return station_ids

def download_isd_lite_data(ftp, station_id, year, output_dir, wban='99999'):
    """Fetch one station-year ISD-Lite archive over FTP and convert it to JSON.

    The archive ``{station_id}-{wban}-{year}.gz`` is retrieved from
    ``/pub/data/noaa/isd-lite/{year}/`` into *output_dir*. A non-empty
    download is handed to ``convert_gz_to_json`` (output goes to
    ``<output_dir>/json``). The local ``.gz`` file is always removed
    afterwards, whether the download succeeded, was empty, or failed.

    Failures are reported on stdout and never propagated, so one missing
    station-year does not abort a bulk download.

    Args:
        ftp: A logged-in FTP client exposing ``retrbinary(cmd, callback)``.
        station_id: First component of the remote file name.
        year: Year sub-directory / last component of the remote file name.
        output_dir: Local working directory (created when missing).
        wban: Second component of the remote file name; defaults to the
            value that was previously hard-coded.
    """
    from ftplib import error_perm  # file only imports FTP at top level

    file_name = f'{station_id}-{wban}-{year}.gz'
    remote_path = f'/pub/data/noaa/isd-lite/{year}/{file_name}'
    local_path = os.path.join(output_dir, file_name)

    try:
        os.makedirs(output_dir, exist_ok=True)
        with open(local_path, 'wb') as f:
            ftp.retrbinary(f'RETR {remote_path}', f.write)

        if os.path.getsize(local_path) == 0:
            print(f"No data available for {file_name}")
        else:
            print(f'Downloaded {file_name}')
            # Convert to JSON and save to the json directory
            json_dir = os.path.join(output_dir, 'json')
            convert_gz_to_json(local_path, json_dir)
    except error_perm:
        # Permanent (5xx) FTP reply, e.g. the file does not exist on the server.
        print(f'no data available for {file_name} in selected time')
    except Exception as e:
        # Anything else (network drop, disk error) is a real failure; say so
        # instead of misreporting it as missing data.
        print(f'Error downloading {file_name}: {e}')
    finally:
        # open(..., 'wb') creates the file before RETR runs, so a failed
        # transfer would otherwise leave a zero-byte .gz behind.
        try:
            if os.path.exists(local_path):
                os.remove(local_path)
        except OSError as e:
            print(f'Could not remove temporary file {local_path}: {e}')

def convert_gz_to_json(gz_path, json_dir):
    """Decompress one ``.gz`` text file and store its parsed records as JSON.

    The output is written into *json_dir* (created when it does not exist)
    under the archive's base name with its last extension replaced by
    ``.json``. Any exception raised while reading, parsing or writing is
    reported on stdout and not propagated.

    Args:
        gz_path: Path of the gzip-compressed text file to convert.
        json_dir: Directory that receives the resulting JSON file.
    """
    if not os.path.exists(json_dir):
        os.makedirs(json_dir)

    archive_name = os.path.basename(gz_path)
    stem, _extension = os.path.splitext(archive_name)
    target_path = os.path.join(json_dir, stem + '.json')

    try:
        # Text mode ('rt') lets gzip hand back decoded str instead of bytes.
        with gzip.open(gz_path, 'rt') as archive:
            # The whole file is read into memory in one go.
            records = parse_gz_data_to_json(archive.read())

            with open(target_path, 'w') as out:
                json.dump(records, out, indent=4)

        print(f'Converted {archive_name} to JSON and stored in {json_dir}')
    except Exception as err:
        print(f'Error converting {gz_path} to JSON: {err}')

def parse_gz_data_to_json(data):
    """Turn whitespace-separated observation lines into a list of dicts.

    Each line of *data* is split on whitespace. Only lines that yield exactly
    twelve tokens are kept, and every token is converted with ``int``; lines
    with any other token count are skipped.

    Args:
        data: The full text content, one observation per line.

    Returns:
        A list of dicts keyed, in column order, by Year, Month, Day, Hour,
        Temperature, Td, Pressure, WindDirection, WindSpeed, CloudCover,
        Rain1h and Rain6h.

    Raises:
        ValueError: If a twelve-token line contains a non-integer token.
    """
    column_names = (
        "Year",
        "Month",
        "Day",
        "Hour",
        "Temperature",
        "Td",
        "Pressure",
        "WindDirection",
        "WindSpeed",
        "CloudCover",
        "Rain1h",
        "Rain6h",
    )

    records = []
    for raw_line in data.splitlines():
        tokens = raw_line.split()
        if len(tokens) != len(column_names):
            continue  # Not a complete twelve-column record
        records.append(
            {name: int(token) for name, token in zip(column_names, tokens)}
        )
    return records

def download_contry_isd_lite_data(stations_file, start_year, end_year, output_dir):
    """Download ISD-Lite data for every listed station and year.

    Station IDs are read from *stations_file* with ``read_station_ids``;
    then, over a single anonymous FTP session to ``ftp.ncdc.noaa.gov``,
    ``download_isd_lite_data`` is called once per station for each year from
    *start_year* to *end_year* inclusive.

    Args:
        stations_file: Path to the CSV file listing the stations.
        start_year: First year to download (inclusive).
        end_year: Last year to download (inclusive).
        output_dir: Directory passed through to ``download_isd_lite_data``.
    """
    # Read the station list before connecting so that a missing or malformed
    # CSV does not leave an FTP session open.
    station_ids = read_station_ids(stations_file)

    # The context manager ends the session (QUIT, then socket close) even if
    # something inside the loop raises.
    with FTP('ftp.ncdc.noaa.gov') as ftp:
        ftp.login()

        for station_id in station_ids:
            for year in range(start_year, end_year + 1):
                download_isd_lite_data(ftp, station_id, year, output_dir)

def _station_file_year(file_name):
    """Return the year token of an ``<a>-<b>-<year>.json`` name, or None."""
    parts = file_name.split('-')
    if len(parts) < 3:
        return None
    year = parts[2].split('.')[0]
    return year if year.isdecimal() else None


def combine_json_files(json_dir, combined_json_dir):
    """Merge per-year JSON files into one file per station prefix.

    Files in *json_dir* ending in ``.json`` are grouped by the first two
    dash-separated parts of their name, ordered by the numeric year in the
    third part, and their list contents concatenated. Each group is written
    to ``<prefix>-<first year>-<last year>.json`` in *combined_json_dir*.
    The source files of a group are deleted only after its combined file has
    been written. Files whose name carries no numeric year are left alone.

    Args:
        json_dir: Directory holding the per-year JSON files. If it does not
            exist, a message is printed and nothing else happens.
        combined_json_dir: Output directory (created when missing).
    """
    os.makedirs(combined_json_dir, exist_ok=True)

    # json_dir is only created once a conversion succeeds, so it can be absent.
    if not os.path.isdir(json_dir):
        print(f"No JSON directory found at {json_dir}; nothing to combine")
        return

    # Group files by their two-part prefix, ignoring names without a year
    file_groups = {}
    for json_file in os.listdir(json_dir):
        if not json_file.endswith('.json'):
            continue
        if _station_file_year(json_file) is None:
            continue
        prefix = '-'.join(json_file.split('-')[:2])
        file_groups.setdefault(prefix, []).append(json_file)

    # Sort files by year and combine
    for prefix, files in file_groups.items():
        sorted_files = sorted(files, key=lambda name: int(_station_file_year(name)))

        combined_data = []
        for file_name in sorted_files:
            with open(os.path.join(json_dir, file_name), 'r') as f:
                combined_data.extend(json.load(f))

        start_year1 = _station_file_year(sorted_files[0])
        end_year1 = _station_file_year(sorted_files[-1])

        combined_file_name = f"{prefix}-{start_year1}-{end_year1}.json"
        with open(os.path.join(combined_json_dir, combined_file_name), 'w') as f:
            json.dump(combined_data, f, indent=4)

        # Delete the sources only now: removing them before the combined file
        # exists would lose data if reading or writing failed part-way.
        for file_name in sorted_files:
            os.remove(os.path.join(json_dir, file_name))

        print(f"Combined {len(sorted_files)} files into {combined_file_name}")

if __name__ == '__main__':
    # CSV listing the stations to download; its base name (without the
    # extension) is also used to label the output folder.
    stations_file = 'US.csv'
    # US: United States
    # Massachusetts: Massachusetts in US
    start_year = 2020
    end_year = 2024
    base_directory = '/Users/a1234/Desktop/workspace/CS779/WeatherDB/dataset'
    # basename() keeps the folder name valid even if stations_file is given
    # with a directory component.
    stations_label = os.path.splitext(os.path.basename(stations_file))[0]
    folder_name = f"{start_year}_{end_year}_{stations_label}"
    folder_path = os.path.join(base_directory, folder_name)

    # Single idempotent creation; no separate exists() check is needed.
    os.makedirs(folder_path, exist_ok=True)

    output_dir = folder_path

    download_contry_isd_lite_data(stations_file, start_year, end_year, output_dir)

    # Directory that receives the combined per-station JSON files
    combined_json_dir = os.path.join(output_dir, 'conjson')
    # Merge the per-year JSON files written under <output_dir>/json
    combine_json_files(os.path.join(output_dir, 'json'), combined_json_dir)

    print('------------------Download Over-----------------')

Read 1000 station IDs
no data available for 999999-99999-2020.gz in selected time
no data available for 999999-99999-2021.gz in selected time
no data available for 999999-99999-2022.gz in selected time
no data available for 999999-99999-2023.gz in selected time
no data available for 999999-99999-2024.gz in selected time
no data available for 698414-99999-2020.gz in selected time
no data available for 698414-99999-2021.gz in selected time
no data available for 698414-99999-2022.gz in selected time
no data available for 698414-99999-2023.gz in selected time
no data available for 698414-99999-2024.gz in selected time
no data available for 725771-99999-2020.gz in selected time
no data available for 725771-99999-2021.gz in selected time
no data available for 725771-99999-2022.gz in selected time
no data available for 725771-99999-2023.gz in selected time
no data available for 725771-99999-2024.gz in selected time
Downloaded 997258-99999-2020.gz
Converted 997258-99999-2020.gz to JSON and sto