## Step 10: Gather Source Datasets

Purpose: Download and stage required datasets into the project **source** geodatabase.  

Current list of resources:

- [CRWU_CREAT_Grid_Projections](https://services.arcgis.com/cJ9YHowT8TU7DUyn/ArcGIS/rest/services/CRWU_CREAT_Grid_Projections/FeatureServer/0) from EPA Geoplatform
- [CRWU_CREAT_Historic_Climate_Stations](https://services.arcgis.com/cJ9YHowT8TU7DUyn/ArcGIS/rest/services/CRWU_CREAT_Historic_Climate_Stations/FeatureServer/0) from EPA Geoplatform
- [COOP_STATIONS_TO_USE](https://github.com/barrc/get_ncei/blob/main/src/coop_stations_to_use.csv) from barrc GitHub
- [ISD_STATIONS_TO_USE](https://github.com/barrc/get_ncei/blob/main/src/isd_stations_to_use.csv) from barrc GitHub
- [TEMPORAL_DIST_FILE](https://github.com/barrc/extreme_events/blob/main/temporal_dist_file.txt) from barrc GitHub
- [Census States](https://tigerweb.geo.census.gov/arcgis/rest/services/TIGERweb/State_County/MapServer/0) from US Census Tigerweb


In [10]:
import csv
import datetime
import importlib
import os
import sys

import arcpy

# Announce the step so the cell output shows which stage of the workflow ran.
print("Executing Step 10: Gather Source Datasets")

# Load the project helper module and reload it so that edits made to
# swc_resources after the kernel first imported it take effect.
import swc_resources
importlib.reload(swc_resources)

# rez is indexed by the cells below with the keys 'source' and 'qa'.
rez = swc_resources.rez()


Executing Step 10: Gather Source Datasets


### 10.010: Download CRWU_CREAT_Grid_Projections from EPA Geoplatform

In [11]:
%%time

# Service host, layer path, and the name the layer is stored under in the
# source workspace.
host = "services.arcgis.com"
path = "/cJ9YHowT8TU7DUyn/ArcGIS/rest/services/CRWU_CREAT_Grid_Projections/FeatureServer/0"
fc = "CRWU_CREAT_Grid_Projections"
fc_path = rez['source'] + os.sep + fc

# Remove the output of any earlier run before downloading again.
if arcpy.Exists(fc_path):
    arcpy.Delete_management(fc_path)

print("  Downloading " + fc)
z = swc_resources.scrape_ags(host, path, rez['source'], fc)

print("  Adding indexes")
# One attribute index per field; index names follow the <FIELD>_IDX pattern.
for idx_field in ('CREAT_ID', 'GRIDCODE'):
    z = arcpy.management.AddIndex(
        in_table=fc_path,
        fields=idx_field,
        index_name=idx_field + '_IDX',
    )

print("  Complete.")


  Downloading CRWU_CREAT_Grid_Projections
  Adding indexes
  Complete.
Wall time: 4min 47s


### 10.020: Download CRWU_CREAT_Historic_Climate_Stations from EPA Geoplatform

In [12]:
%%time

# Service host, layer path, and the name the layer is stored under in the
# source workspace.
host = "services.arcgis.com"
path = "/cJ9YHowT8TU7DUyn/ArcGIS/rest/services/CRWU_CREAT_Historic_Climate_Stations/FeatureServer/0"
fc = "CRWU_CREAT_Historic_Climate_Stations"
fc_path = rez['source'] + os.sep + fc

# Remove the output of any earlier run before downloading again.
if arcpy.Exists(fc_path):
    arcpy.Delete_management(fc_path)

print("  Downloading " + fc)
z = swc_resources.scrape_ags(host, path, rez['source'], fc)

print("  Adding indexes")
# One attribute index per field; index names follow the <FIELD>_IDX pattern.
for idx_field in ('CLIMATE_STATION_PK_ID', 'NOAA_STATION_ID'):
    z = arcpy.management.AddIndex(
        in_table=fc_path,
        fields=idx_field,
        index_name=idx_field + '_IDX',
    )

print("  Complete.")


  Downloading CRWU_CREAT_Historic_Climate_Stations
  Adding indexes
  Complete.
Wall time: 9.92 s


### 10.030: Download COOP_STATIONS_TO_USE dataset from barrc GitHub repository

In [13]:
%%time

# NOTE(review): the resource list at the top of this notebook links to the
# `main` branch of this repository while this raw URL uses `master` --
# confirm the branch name still resolves.
url = "https://raw.githubusercontent.com/barrc/get_ncei/master/src/coop_stations_to_use.csv"
fc = 'COOP_STATIONS_TO_USE'

# Keep the downloaded csv in the qa location and load it from there.
tmptab = rez['qa'] + os.sep + 'coop_stations_to_use.csv'
z = swc_resources.downloadtab(url, tmptab)

# Columns to map, in output order. Everything is mapped as 255-character
# text except the two coordinate columns, which are mapped as doubles.
coop_columns = (
    'station_id',
    'station_name',
    'state',
    'start_date',
    'end_date',
    'latitude',
    'longitude',
    'in_basins',
    'break_with_basins',
    'network',
    'start_date_to_use',
    'end_date_to_use',
)
coop_numeric = ('latitude', 'longitude')

fmscoop = arcpy.FieldMappings()
for colname in coop_columns:
    if colname in coop_numeric:
        fieldmap = swc_resources.fmdouble(tmptab, colname)
    else:
        fieldmap = swc_resources.fmtext(tmptab, colname, 255)
    fmscoop.addFieldMap(fieldmap)

# Build the feature class from the table using the longitude/latitude columns.
z = swc_resources.tab2fc(tmptab, rez['source'], fc, 'longitude', 'latitude', fmscoop)

print("  check for missing stationIDs")

# Field-calculator helper: a null station_id is replaced by 'COOP' followed
# by the record's OBJECTID; any other value is returned unchanged.
cb_injectID = """
def injectID(pin,oid):
    if pin is None:
        return 'COOP' + str(oid);
    else:
        return pin;
"""

z = arcpy.management.CalculateField(
    in_table=rez['source'] + os.sep + fc,
    field='station_id',
    expression="injectID(!station_id!,!OBJECTID!)",
    expression_type='PYTHON3',
    code_block=cb_injectID,
)

print("  add quotes to start and end fields")

# Field-calculator helper: splits an m/d/yyyy string on '/', left-pads a
# single-digit month or day with a zero, and returns yyyy/mm/dd wrapped in
# single quotes.
cb_cleanDate = """
def cleanDate(pin):
    (mm,dd,yyyy) = pin.split('/');
    if mm in ['1','2','3','4','5','6','7','8','9']:
       mm = '0' + mm;
    if dd in ['1','2','3','4','5','6','7','8','9']:
       dd = '0' + dd;
    return "'" + yyyy + "/" + mm + "/" + dd + "'";
    
"""

coop_fc = rez['source'] + os.sep + fc

# For each raw date column, add a sibling <name>_clean text column and fill
# it from the raw value.
for rawfld in ('start_date', 'end_date', 'start_date_to_use', 'end_date_to_use'):
    cleanfld = rawfld + '_clean'

    z = arcpy.management.AddField(
        in_table=coop_fc,
        field_name=cleanfld,
        field_type='Text',
        field_length=255,
        field_alias=cleanfld,
    )

    z = arcpy.management.CalculateField(
        in_table=coop_fc,
        field=cleanfld,
        expression="cleanDate(!" + rawfld + "!)",
        expression_type='PYTHON3',
        code_block=cb_cleanDate,
    )

print("  calculating year count")

# Field-calculator helper: parses both dates as m/d/Y, divides the day
# difference by 365, rounds it, and returns the result as a float.
cb_yearCount = """
import datetime;
def yearCount(pstart,pend):
    d1 = datetime.datetime.strptime(pstart,"%m/%d/%Y");
    d2 = datetime.datetime.strptime(pend  ,"%m/%d/%Y");
    yr = round((d2 - d1).days / 365);
    return yr + 0.0;
    
"""

coop_fc = rez['source'] + os.sep + fc

z = arcpy.management.AddField(
    in_table=coop_fc,
    field_name='year_count',
    field_type='Double',
    field_alias='year_count',
)

# The year count is derived from the *_to_use date pair, not the raw pair.
z = arcpy.management.CalculateField(
    in_table=coop_fc,
    field='year_count',
    expression='yearCount(!start_date_to_use!,!end_date_to_use!)',
    expression_type='PYTHON3',
    code_block=cb_yearCount,
)

print("  adding indexes")
z = arcpy.management.AddIndex(
    in_table=coop_fc,
    fields='station_id',
    index_name='station_id_IDX',
)


  downloading file
  loading to table
  converting to NAD83 points
  check for missing stationIDs
  add quotes to start and end fields
  calculating year count
  adding indexes
Wall time: 8.92 s


### 10.040: Download ISD_STATIONS_TO_USE dataset from barrc GitHub repository

Note: this ISD processing step was originally coded like the COOP cell above, loading the csv file with the built-in arcpy TableToTable tools.  Even though that logic supplied an explicit field mapping stating that the station id is textual in nature, __sometimes__ the tool would choke on the final 15 records (those having an ID beginning with 'A').  But just sometimes.  It is not a problem I have been able to reproduce reliably.  The best solution is simply "don't do that": instead, load the csv file using the Python csv library and an arcpy DA cursor.  I remain uncertain as to why this is the case but suspect the UTF-8 characters in the ISD csv file __might__ have something to do with it.  

In [14]:
%%time

# NOTE(review): the resource list at the top of this notebook links to the
# `main` branch of this repository while this raw URL uses `master` --
# confirm the branch name still resolves.
url = "https://raw.githubusercontent.com/barrc/get_ncei/master/src/isd_stations_to_use.csv"
fc = 'ISD_STATIONS_TO_USE'

# Keep the downloaded csv in the qa location; it is read back when the
# feature class is loaded.
outcsv10040 = rez['qa'] + os.sep + 'isd_stations_to_use.csv'
z = swc_resources.downloadtab(url, outcsv10040)

# Start from an empty feature class on every run.
output10040 = rez['source'] + os.sep + fc
if arcpy.Exists(output10040):
    arcpy.Delete_management(output10040)

# Point feature class without M or Z values, created with spatial
# reference 4269.
workspace10040, name10040 = os.path.split(output10040)
arcpy.CreateFeatureclass_management(
    out_path=workspace10040,
    out_name=name10040,
    geometry_type="POINT",
    has_m="DISABLED",
    has_z="DISABLED",
    spatial_reference=arcpy.SpatialReference(4269),
)

# Attribute schema as [name, type, alias, length] entries.
isd_schema = [
    ['station_id',        'TEXT',   'Station ID',        255],
    ['station_name',      'TEXT',   'Station Name',      255],
    ['state',             'TEXT',   'State',             255],
    ['start_date',        'TEXT',   'Start Date',        255],
    ['end_date',          'TEXT',   'End Date',          255],
    ['latitude',          'DOUBLE', 'Latitude',          255],
    ['longitude',         'DOUBLE', 'Longitude',         255],
    ['in_basins',         'TEXT',   'In Basins',         255],
    ['break_with_basins', 'TEXT',   'Break With Basins', 255],
    ['network',           'TEXT',   'Network',           255],
    ['start_date_clean',  'TEXT',   'Start Date Clean',  255],
    ['end_date_clean',    'TEXT',   'End Date Clean',    255],
    ['year_count',        'DOUBLE', 'Year Count'],
]
arcpy.management.AddFields(
    in_table=output10040,
    field_description=isd_schema,
)

print("  Adding indexes")
z = arcpy.management.AddIndex(
    in_table=output10040,
    fields='Station_id',
    index_name='StationId_IDX',
)

# Columns written by the insert cursor, in the order the row tuple below is
# built; 'SHAPE@' receives the point geometry.
fldsout = [
    'station_id',
    'station_name',
    'state',
    'start_date',
    'end_date',
    'latitude',
    'longitude',
    'in_basins',
    'break_with_basins',
    'network',
    'start_date_clean',
    'end_date_clean',
    'year_count',
    'SHAPE@',
]

# Timestamp layout of the start/end date columns in the ISD csv.
isd_date_format = "%Y-%m-%d %H:%M:%S"

print("  Loading table from file")
with arcpy.da.InsertCursor(in_table=output10040, field_names=fldsout) as outcur:

    with open(outcsv10040, newline='', encoding='utf-8') as isd_csv:
        reader = csv.reader(isd_csv, delimiter=',')

        # Skip the header through the csv reader rather than the raw file
        # handle so the header is consumed as one csv record; the None
        # default keeps an empty file from raising StopIteration.
        next(reader, None)

        for row in reader:
            # csv.reader yields an empty list for a blank line.
            if not row:
                continue

            lat = float(row[5])
            lng = float(row[6])

            # Records with a zero latitude or longitude are left out of the
            # feature class; decide that before doing any further work.
            if lat == 0 or lng == 0:
                continue

            start_date = datetime.datetime.strptime(row[3], isd_date_format)
            end_date = datetime.datetime.strptime(row[4], isd_date_format)

            # Day difference divided by 365, rounded, stored as a float to
            # match the DOUBLE year_count column.
            year_count = float(round((end_date - start_date).days / 365))

            outcur.insertRow((
                row[0],
                row[1],
                row[2],
                row[3],
                row[4],
                lat,
                lng,
                row[7],
                row[8],
                row[9],
                # Clean dates are stored as yyyy/mm/dd wrapped in single quotes.
                "'" + start_date.strftime('%Y/%m/%d') + "'",
                "'" + end_date.strftime('%Y/%m/%d') + "'",
                year_count,
                arcpy.PointGeometry(arcpy.Point(lng, lat)),
            ))

print("  Complete")
del outcsv10040, output10040


  downloading file
  Adding indexes
  Loading table from file
  Complete
Wall time: 3.65 s


### 10.050: Download TEMPORAL_DIST_FILE dataset from barrc GitHub repository

In [15]:
%%time

url = "https://raw.githubusercontent.com/barrc/extreme_events/main/temporal_dist_file.txt"
fc = 'TEMPORAL_DIST_FILE'

# Keep the downloaded file in the qa location and load it from there.
tmptab = rez['qa'] + os.sep + 'temporal_dist_file.tab'
z = swc_resources.downloadtab(url, tmptab)

# Every column of the file is mapped as a double, in this order.
tdf_columns = (
    'Time',
    'CA_1', 'CA_2', 'CA_3', 'CA_4', 'CA_5', 'CA_6',
    'MSE_1', 'MSE_2', 'MSE_3', 'MSE_4', 'MSE_5', 'MSE_6',
    'NOAA_A', 'NOAA_B', 'NOAA_C', 'NOAA_D',
    'NRCC_A', 'NRCC_B', 'NRCC_C', 'NRCC_D',
    'NV_N', 'NV_S', 'NV_W',
    'SCS_I', 'SCS_IA', 'SCS_II', 'SCS_III',
)

fms = arcpy.FieldMappings()
for colname in tdf_columns:
    fms.addFieldMap(swc_resources.fmdouble(tmptab, colname))

z = swc_resources.tab2tab(tmptab, rez['source'], fc, fms)

print("  adding indexes")
z = arcpy.management.AddIndex(
    in_table=rez['source'] + os.sep + fc,
    fields='time',
    index_name='time_IDX',
)


  downloading file
  loading to table
  adding indexes
Wall time: 1.91 s


### 10.060: Download US Census Tigerweb 2020 State Coverage

In [16]:
%%time

# Tigerweb times out when every state-equivalent record is requested in a
# single call. Passing a forcelimit of 5 records per request works around it.

host = "tigerweb.geo.census.gov"
path = "/arcgis/rest/services/TIGERweb/State_County/MapServer/0"
fc = "census_states"
fc_path = rez['source'] + os.sep + fc

# Remove the output of any earlier run before downloading again.
if arcpy.Exists(fc_path):
    arcpy.Delete_management(fc_path)

print("  Downloading " + fc)
z = swc_resources.scrape_ags(host, path, rez['source'], fc, 5)

print("  Adding indexes")
z = arcpy.management.AddIndex(
    in_table=fc_path,
    fields='GEOID',
    index_name='GEOID_IDX',
)


  Downloading census_states
  Adding indexes
Wall time: 1min 6s


### 10.070: Review and QA

QA Products:

1. flat files saved to **qa** folder
2. counts saved to **step10qa.txt**


In [17]:
%%time

# Record counts for every dataset staged by this step. GetCount returns a
# result object whose first output is the count.
grid = rez['source'] + os.sep + 'CRWU_CREAT_Grid_Projections'
grid_cnt = arcpy.GetCount_management(grid)[0]
hist = rez['source'] + os.sep + 'CRWU_CREAT_Historic_Climate_Stations'
hist_cnt = arcpy.GetCount_management(hist)[0]
coop = rez['source'] + os.sep + 'COOP_STATIONS_TO_USE'
coop_cnt = arcpy.GetCount_management(coop)[0]
isd = rez['source'] + os.sep + 'ISD_STATIONS_TO_USE'
isd_cnt = arcpy.GetCount_management(isd)[0]
tdf = rez['source'] + os.sep + 'TEMPORAL_DIST_FILE'
tdf_cnt = arcpy.GetCount_management(tdf)[0]
states = rez['source'] + os.sep + 'census_states'
states_cnt = arcpy.GetCount_management(states)[0]

print("  Grid Projections : " + str(grid_cnt))
print("  Historic Stations: " + str(hist_cnt))
print("  COOP Stations    : " + str(coop_cnt))
print("  ISD Stations     : " + str(isd_cnt))
print("  Tigerweb States  : " + str(states_cnt))
print(" ")

# Take the timestamp once and reuse it for the report, instead of assigning
# nw and then calling now() a second time.
nw = datetime.datetime.now()
qa_report = [
    "Step 10 QA Review\n",
    nw.isoformat() + "\n",
    "Grid Projections Loaded: " + str(grid_cnt) + "\n",
    "Historic Stations Loaded: " + str(hist_cnt) + "\n",
    "COOP Stations Loaded: " + str(coop_cnt) + "\n",
    "ISD Stations Loaded: " + str(isd_cnt) + "\n",
    "Temp Dist File Records Loaded: " + str(tdf_cnt) + "\n",
    "Tigerweb States Loaded: " + str(states_cnt) + "\n",
]

# The QA summary is overwritten on every run.
with open(rez['qa'] + os.sep + 'step10qa.txt', "w") as out:
    out.writelines(qa_report)


  Grid Projections : 24743
  Historic Stations: 11165
  COOP Stations    : 1851
  ISD Stations     : 3293
  Tigerweb States  : 56
 
Wall time: 1.07 s
