In [2]:
### Md Shadman Sakib
### Downloads USGS data and saves it as a text file.
### to be done: create a system to take in and update the extracting based on the dates, just like NOAA

import os, sys, stat
import pandas as pd
import shutil
from datetime import datetime
from datetime import date
import numpy as np
import requests

'''
CPBModel:
James River Near Richmond, VA - 02037500
Mattaponi River Near Beulahville, VA - 01674500
Pamunkey River Near Hanover, VA - 01673000
Rappahannock River Near Fredericksburg, VA - 01668000
Potomac River Near Wash, DC Little Falls Pump Sta - 01646500
Patuxent River Near Bowie, MD - 01594440
Susquehanna River at Conowingo, MD - 01578310
Choptank River Near Greensboro, MD - 01491000
Dragon Swamp at Mascot, VA - 01669520
Northeast Branch Anacostia River at Riverdale, MD - 01649500
'''



# Specify the USGS IDs to be downloaded
USGS_ID_list = ["02037500", "01674500", "01673000", "01668000", "01646500", "01594440", "01578310", "01491000", "01669520", "01649500"]

# Specify the begin and end date of data
begin_date = "2001-01-01" # yyyy-mm-day
end_date = "2021-01-01"

# Specify download directory
download_dir = "../USGS_2012_Daily/"

if not os.path.exists(download_dir):
    os.makedirs(download_dir)

for usgs_site_no in USGS_ID_list:
    print("\nDownloading USGS Observation Data for {}".format(usgs_site_no))
    site_no = "{}".format(usgs_site_no+'.txt', "txt")
    url = "https://waterdata.usgs.gov/nwis/dv?cb_00060=on&format=rdb&site_no={}&referred_module=sw&period=&begin_date={}&end_date={}".format(usgs_site_no, begin_date, end_date)
    r = requests.get(url, allow_redirects=True)
    open(os.path.join(download_dir, site_no), 'wb').write(r.content)
    print("Downloaded Successfully and saved as {}".format(site_no))
    #print("\nNow Copying the files to a Temporary Directory!\n")



Downloading USGS Observation Data for 02037500
Downloaded Successfully and saved as 02037500.txt

Downloading USGS Observation Data for 01674500
Downloaded Successfully and saved as 01674500.txt

Downloading USGS Observation Data for 01673000
Downloaded Successfully and saved as 01673000.txt

Downloading USGS Observation Data for 01668000
Downloaded Successfully and saved as 01668000.txt

Downloading USGS Observation Data for 01646500
Downloaded Successfully and saved as 01646500.txt

Downloading USGS Observation Data for 01594440
Downloaded Successfully and saved as 01594440.txt

Downloading USGS Observation Data for 01578310
Downloaded Successfully and saved as 01578310.txt

Downloading USGS Observation Data for 01491000
Downloaded Successfully and saved as 01491000.txt

Downloading USGS Observation Data for 01669520
Downloaded Successfully and saved as 01669520.txt

Downloading USGS Observation Data for 01649500
Downloaded Successfully and saved as 01649500.txt


In [5]:
import numpy as np
import pandas as pd
import os
import sys
from datetime import datetime, date
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter

# Get the current working directory
pwd = os.getcwd()
print("Current working directory:", pwd)

# Specify the location of input USGS text file that contains the data
path_inputs = "../USGS_2012_Daily/"
dir_out = "../USGS_2012_Daily_CSV/"

# Check if the input directory exists
if not os.path.exists(path_inputs):
    raise FileNotFoundError(f"Input directory {path_inputs} does not exist")

# Create output directory if it does not exist
if not os.path.exists(dir_out):
    os.makedirs(dir_out)

# Defining a function to get the starting line number of USGS data
lookup_start = "WARNING"
lookup_end = "agency_cd"

def get_line_number(filename):
    """In a file, get the line number based on a given string"""
    with open(filename) as myFile:
        for num, line in enumerate(myFile, 1):
            if lookup_start in line:
                starting_line = num
            if lookup_end in line:
                ending_line = num
    return starting_line, ending_line

# Change to the input directory
os.chdir(path_inputs)
print("Changed directory to:", os.getcwd())

for filename in os.listdir():
    if filename.endswith('.txt'):
        # Check if file exists before processing
        file_path = os.path.join(path_inputs, filename)
        if not os.path.isfile(file_path):
            print(f"File {file_path} not found. Skipping.")
            continue

        # Reading the text files using pandas
        print("\nReading ", filename, " File")
        start_line, End_line = get_line_number(file_path)
        print("Starting line for this file is from line number #", int(End_line + 1))

        df_txt = pd.read_csv(file_path, sep="\t", header=None, skiprows=list(range(End_line + 1)))
        df_txt.fillna(method='ffill', inplace=True)

        # getting the value corresponding to the Datetime
        date_list = list()
        flow_list = list()

        for key, value in df_txt.iterrows():
            file_name = "{}{}".format("0", str(value[1]))
            file_ext = ".csv"
            new_name = "{}{}".format(file_name, file_ext)
            Title = "Plot of Observed Data for USGS ID {}".format(file_name)
            fig_name = '{}{}'.format(file_name, ".png")
            date_list.append(str(value[2]))
            if str(value[4]) == "nan":
                flow_list.append(value[7])
            else:
                flow_list.append(value[4])
        print("Created the flow data list for ", new_name)

        # Creating the dataframe
        df = pd.DataFrame(date_list, columns=['Date'])
        df = df.set_index('Date')
        start_date = date_list[0]
        end_date = date_list[-1]
        print("Created the date dataframe from ", start_date, " to ", end_date)
        print("Checking the length of the flow data list and date window!")
        print("The length of flow data list is ", len(flow_list))
        print("The length of date window is ", len(date_list))

        if len(date_list) != len(flow_list):
            raise ValueError("Mismatch found! Stopping the program! Please take care of missing data first")
            sys.exit()
        else:
            df["USGS_Flow(cfs)"] = flow_list
            df["USGS_Flow(cms)"] = np.round(np.array(flow_list) * 0.028316831998814504, 2)
            print("----> " + file_name, np.round(np.average(np.array(flow_list) * 0.028316831998814504)), 2)
            del flow_list
            df.to_csv(os.path.join(dir_out, new_name), index=True)
        print("The CSV Time Series for ", file_name, " has been created!\n")

# Change back to the original directory
os.chdir(pwd)
print("Successfully Created All Time Series as '.csv' Files From The '.txt' Files:")
print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
print(df.describe())
print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")


Current working directory: G:\My Drive\Deflt3D FM Codes - Vtech\DataDownloadAPIs\USGS_2012_Daily
Changed directory to: G:\My Drive\Deflt3D FM Codes - Vtech\DataDownloadAPIs\USGS_2012_Daily

Reading  02037500.txt  File
Starting line for this file is from line number # 26
Created the flow data list for  02037500.csv
Created the date dataframe from  2001-01-01  to  2021-01-01
Checking the length of the flow data list and date window!
The length of flow data list is  7306
The length of date window is  7306
----> 02037500 214.0 2
The CSV Time Series for  02037500  has been created!


Reading  01674500.txt  File
Starting line for this file is from line number # 26
Created the flow data list for  01674500.csv
Created the date dataframe from  2001-01-01  to  2021-01-01
Checking the length of the flow data list and date window!
The length of flow data list is  7306
The length of date window is  7306
----> 01674500 15.0 2
The CSV Time Series for  01674500  has been created!


Reading  01673000.t

In [30]:
np.average(flow_list)

94.84934706267484