In [1]:
import pandas as pd
import numpy as np
import os
# import wget
import requests
from bs4 import BeautifulSoup
import random
import re

In [2]:
year = 2008

url = f"https://www.ncei.noaa.gov/data/local-climatological-data/access/{year}/"
response = requests.get(url)
html_content = response.text

# Step 2: Parse the HTML and extract links
soup = BeautifulSoup(html_content, 'html.parser')
# links = soup.find_all('a', href=True, attrs={'href': lambda href: href.endswith('.csv')})
links = soup.find_all('a', href=True)

In [3]:
csv_hrefs = [link['href'] for link in links if re.match(r".+\.csv$", link['href'])]


In [4]:
csv_hrefs[:10]

['01001099999.csv',
 '01001499999.csv',
 '01003099999.csv',
 '01007099999.csv',
 '01008099999.csv',
 '01010099999.csv',
 '01014099999.csv',
 '01015099999.csv',
 '01023099999.csv',
 '01023199999.csv']

In [5]:
download_urls = [url + csv_name for csv_name in csv_hrefs]

In [6]:
len(download_urls)

11676

In [7]:
download_urls[:10]

['https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/01001099999.csv',
 'https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/01001499999.csv',
 'https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/01003099999.csv',
 'https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/01007099999.csv',
 'https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/01008099999.csv',
 'https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/01010099999.csv',
 'https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/01014099999.csv',
 'https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/01015099999.csv',
 'https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/01023099999.csv',
 'https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/01023199999.csv']

In [8]:
# basic for loop
# useful_df = []
# n_locs = 5

# for idx, url in enumerate(download_urls):
#     df = pd.read_csv(url)
#     monthly_agg_cols = [i for i in df.columns if i.startswith('Monthly')]
#     df_agg = df[ monthly_agg_cols]
#     print(f'read {idx+1} csv')
#     if df_agg.isnull().all().all():
#         print('not useful df')
#     else:
#         useful_df.append(url)
#         print(f'useful df found')

#     if len(useful_df) == n_locs:
#         print(f'found enough urls, breaking...')
#         break
        

In [9]:
# not for jupyter
# import concurrent.futures
# import pandas as pd

# useful_df = []
# n_locs = 5

# # Define the download_urls
# # download_urls = [...]  # Your list of download URLs

# def process_url(url):
#     df = pd.read_csv(url)
#     monthly_agg_cols = [i for i in df.columns if i.startswith('Monthly')]
#     df_agg = df[monthly_agg_cols]
    
#     if not df_agg.isnull().all().all():
#         useful_df.append(url)
#         print(f'***** useful df found for {url}')
#         if len(useful_df) == n_locs:
#             print('Found enough URLs, breaking...')

# with concurrent.futures.ProcessPoolExecutor() as executor:
#     executor.map(process_url, download_urls)

In [10]:
import concurrent.futures
import threading
import pandas as pd
import time


useful_df = []
n_locs = 2
lock = threading.Lock()

# Define the download_urls
files_read = 0  # Initialize files_read

def process_url(url):
    global files_read

    # cols_to_read = ['MonthlyAverageRH', 'MonthlyDewpointTemperature', 'MonthlyMeanTemperature',
    #                 'MonthlySeaLevelPressure', 'MonthlyStationPressure', 'MonthlyTotalLiquidPrecipitation',
    #                 'MonthlyTotalSnowfall','MonthlyWetBulb']
    cols_to_read = ['MonthlyMeanTemperature']

    df = pd.read_csv(url, usecols = cols_to_read)
    # monthly_agg_cols = [i for i in df.columns if i.startswith('Monthly')]
    # df_agg = df[monthly_agg_cols]

    # df_agg = df.copy()

    with lock:
        files_read += 1
        print(f'Read {files_read} csv', flush=True)

    if df.isnull().all().all(): # df_agg
        print(f'not useful df {url}')
    else:
        useful_df.append(url)
        print(f'***** useful df found for {url}')
        
        with lock:
            if len(useful_df) == n_locs:
                print('Found enough URLs, breaking...')
                return False
            
    return True

with concurrent.futures.ThreadPoolExecutor() as executor:
    num_files_to_read = n_locs * 10
    executor.map(process_url, download_urls[::-1][:num_files_to_read])

Read 1 csv
***** useful df found for https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/A0000953862.csv
Read 2 csv
not useful df https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/A0003225715.csv
Read 3 csv
***** useful df found for https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/A0002363890.csv
Found enough URLs, breaking...
Read 4 csv
not useful df https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/A0001263879.csv
Read 5 csv
not useful df https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/A0001163848.csv
Read 6 csv
not useful df https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/A0001704868.csv
Read 7 csv
not useful df https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/A0002163884.csv
Read 8 csv
not useful df https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/A0001953969.csv
Read 9 csv
not useful df https://www.ncei.noaa.gov/data/local-cli

In [11]:
useful_df

['https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/A0000953862.csv',
 'https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/A0002363890.csv',
 'https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/A0002453848.csv',
 'https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/99999994290.csv',
 'https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/99999994645.csv',
 'https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/99999994995.csv',
 'https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/99999994084.csv',
 'https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/99999994082.csv',
 'https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/99999994088.csv',
 'https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/99999994644.csv',
 'https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/99999994996.csv',
 'https://www.ncei.no

In [16]:
df_default_links = ['https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/A0000953862.csv',
 'https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/A0002363890.csv',
 'https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/A0002453848.csv',
 'https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/99999994290.csv',
 'https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/99999994645.csv',
 'https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/99999994995.csv',
 'https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/99999994084.csv',
 'https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/99999994082.csv',
 'https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/99999994088.csv',
 'https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/99999994644.csv',
 'https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/99999994996.csv',
 'https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/99999994080.csv',
 'https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/99999994085.csv',
 'https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/99999994078.csv',
 'https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/99999994079.csv',
 'https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/99999992811.csv',
 'https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/99999973803.csv',
 'https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/99999994081.csv',
 'https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/99999993245.csv',
 'https://www.ncei.noaa.gov/data/local-climatological-data/access/2008/99999994075.csv',
 ]

In [18]:
import wget
data_dir = "../data/default"
for idx, link in enumerate(df_default_links):
    csv_name = link.split("/")[-1]
    filename = os.path.join(data_dir, csv_name)

    df = pd.read_csv(link)
    df.head()
    # wget.download(link, filename)

    if idx == 5: break

  df = pd.read_csv(link)
  df = pd.read_csv(link)
  df = pd.read_csv(link)


KeyboardInterrupt: 

In [2]:
dfs = ['https://www.ncei.noaa.gov/data/local-climatological-data/access/2012/99999994728.csv',
       'https://www.ncei.noaa.gov/data/local-climatological-data/access/2012/99999994290.csv']

In [None]:
for link in dfs:
    df = pd.read_csv()

In [12]:
cols_to_read = ['MonthlyAverageRH', 'MonthlyDewpointTemperature', 'MonthlyMeanTemperature',
                'MonthlySeaLevelPressure', 'MonthlyStationPressure', 'MonthlyTotalLiquidPrecipitation',
                'MonthlyTotalSnowfall','MonthlyWetBulb']
df = pd.read_csv("https://www.ncei.noaa.gov/data/local-climatological-data/access/2010/01088699999.csv",
                  usecols = cols_to_read)
df.head()

Unnamed: 0,MonthlyAverageRH,MonthlyDewpointTemperature,MonthlyMeanTemperature,MonthlySeaLevelPressure,MonthlyStationPressure,MonthlyTotalLiquidPrecipitation,MonthlyTotalSnowfall,MonthlyWetBulb
0,,,,,,,,
1,,,,,,,,
2,,,,,,,,
3,,,,,,,,
4,,,,,,,,


In [14]:
df.isnull().all()

MonthlyAverageRH                   True
MonthlyDewpointTemperature         True
MonthlyMeanTemperature             True
MonthlySeaLevelPressure            True
MonthlyStationPressure             True
MonthlyTotalLiquidPrecipitation    True
MonthlyTotalSnowfall               True
MonthlyWetBulb                     True
dtype: bool

In [None]:

# Step 3: Select the desired number of files
selected_links = random.sample(links, n_locs)
download_urls = [url + link['href'] for link in selected_links]
# a03_logger.info(f'download_urls are {download_urls}' )

In [1]:
input_dir = "../data/raw"
# output_file = "data/processed/ground_truth.csv"
output_file = "../outputs/processed/ground_truth.csv"
input_files = [os.path.join(input_dir, f) for f in os.listdir(input_dir) if f.endswith(".csv")]

NameError: name 'os' is not defined

In [None]:
input_files

['../data/raw\\48381099999.csv',
 '../data/raw\\54337099999.csv',
 '../data/raw\\57483099999.csv']

In [None]:
dfs = []
for file in input_files:
    df = pd.read_csv(file)
    # print(df.head(1))
    print(df.columns)
    # monthly_agg = df.groupby(pd.Grouper(freq='M')).mean()
    # dfs.append(monthly_agg)

       STATION                 DATE   LATITUDE   LONGITUDE  ELEVATION  \
0  48381099999  1960-01-01T07:00:00  16.466628  102.783661     204.21   

            NAME REPORT_TYPE  SOURCE  HourlyAltimeterSetting  \
0  KHON KAEN, TH       FM-12       4                     NaN   

   HourlyDewPointTemperature  ... BackupDirection  BackupDistance  \
0                         54  ...             NaN             NaN   

  BackupDistanceUnit  BackupElements  BackupElevation  BackupEquipment  \
0                NaN             NaN              NaN              NaN   

   BackupLatitude  BackupLongitude  BackupName  WindEquipmentChangeDate  
0             NaN              NaN         NaN                      NaN  

[1 rows x 125 columns]
Index(['STATION', 'DATE', 'LATITUDE', 'LONGITUDE', 'ELEVATION', 'NAME',
       'REPORT_TYPE', 'SOURCE', 'HourlyAltimeterSetting',
       'HourlyDewPointTemperature',
       ...
       'BackupDirection', 'BackupDistance', 'BackupDistanceUnit',
       'BackupElement

In [None]:
[i for i in df.columns]

['STATION',
 'DATE',
 'LATITUDE',
 'LONGITUDE',
 'ELEVATION',
 'NAME',
 'REPORT_TYPE',
 'SOURCE',
 'HourlyAltimeterSetting',
 'HourlyDewPointTemperature',
 'HourlyDryBulbTemperature',
 'HourlyPrecipitation',
 'HourlyPresentWeatherType',
 'HourlyPressureChange',
 'HourlyPressureTendency',
 'HourlyRelativeHumidity',
 'HourlySkyConditions',
 'HourlySeaLevelPressure',
 'HourlyStationPressure',
 'HourlyVisibility',
 'HourlyWetBulbTemperature',
 'HourlyWindDirection',
 'HourlyWindGustSpeed',
 'HourlyWindSpeed',
 'Sunrise',
 'Sunset',
 'DailyAverageDewPointTemperature',
 'DailyAverageDryBulbTemperature',
 'DailyAverageRelativeHumidity',
 'DailyAverageSeaLevelPressure',
 'DailyAverageStationPressure',
 'DailyAverageWetBulbTemperature',
 'DailyAverageWindSpeed',
 'DailyCoolingDegreeDays',
 'DailyDepartureFromNormalAverageTemperature',
 'DailyHeatingDegreeDays',
 'DailyMaximumDryBulbTemperature',
 'DailyMinimumDryBulbTemperature',
 'DailyPeakWindDirection',
 'DailyPeakWindSpeed',
 'DailyPrecipit

In [None]:
combined_df = pd.concat(dfs)
combined_df.to_csv(output_file, index=True)

ValueError: No objects to concatenate