## Step 10: Gather Source Datasets

In [1]:
import arcpy,os,http.client,json,requests;

# Make sure the source file geodatabase exists in the current working
# directory, creating it on first run.
fgdb = os.sep.join([os.getcwd(),'source.gdb'])

if not arcpy.Exists(fgdb):
    # os.path.split yields the (folder, name) pair CreateFileGDB expects
    arcpy.CreateFileGDB_management(*os.path.split(fgdb))

# Last expression of the cell: shows whether the geodatabase is present
arcpy.Exists(fgdb)


True

In [2]:
def _ags_post_json(host,url,params):
    """POST form-encoded ``params`` to ``https://host + url`` and return the decoded JSON.

    A dedicated connection is opened for the request and always closed again,
    so no sockets are left behind.  Raises ValueError when the server answers
    with a non-200 status, a non-JSON body, or a JSON ``error`` payload.
    """
    from urllib.parse import urlencode

    headers = {"Content-type": "application/x-www-form-urlencoded", "Accept": "text/plain"}
    conn = http.client.HTTPSConnection(host)
    try:
        # urlencode escapes spaces and the <, >, = characters of where clauses
        conn.request("POST",url,urlencode(params),headers)
        response = conn.getresponse()
        data = response.read()
    finally:
        conn.close()

    if response.status != 200:
        raise ValueError("Error, unable to query https://" + host + url)
    json_data = json.loads(data)
    if not isinstance(json_data,dict) or 'error' in json_data:
        raise ValueError("Error, unable to query https://" + host + url)
    return json_data

def scrape_ags(host,path,fgdb,fc):
    """Copy every record of an ArcGIS REST layer into a feature class.

    The layer is read in pages: all object ids are fetched first, sorted, and
    then requested in id ranges holding at most ``maxRecordCount`` ids each.
    The first page creates ``fgdb/fc`` and later pages are appended to it.

    Parameters:
        host: server host name, e.g. "services.arcgis.com".
        path: URL path of the layer endpoint (without "/query").
        fgdb: path of the target file geodatabase.
        fc:   name of the feature class to (re)create inside ``fgdb``.

    Returns:
        True.  When the layer reports no object ids, nothing is downloaded and
        the feature class is not created.

    Raises:
        ValueError: the service could not be queried or reports an unusable
        maxRecordCount.
    """
    target = fgdb + os.sep + fc
    if arcpy.Exists(target):
        arcpy.Delete_management(target)

    # Layer metadata supplies the page size
    layer_info = _ags_post_json(host,path,{"f": "json"})
    if not 'currentVersion' in layer_info:
        raise ValueError("Error, unable to query https://" + host + path)
    extraction_amount = layer_info.get('maxRecordCount')
    if not isinstance(extraction_amount,int) or extraction_amount < 1:
        # a page size of zero would make the paging loop never advance
        raise ValueError("Error, unusable maxRecordCount from https://" + host + path)

    id_info = _ags_post_json(
         host
        ,path + "/query"
        ,{"where": "1=1", "returnIdsOnly": "true", "returnGeometry": "false", "f": "json"}
    )
    # the service may report null instead of an empty list when nothing matches
    ary_oid   = sorted(id_info.get('objectIds') or [])
    oid_count = len(ary_oid)
    if oid_count == 0:
        print("  no records found, feature class not created.")
        oid_name = None
    else:
        oid_name = id_info['objectIdFieldName']

    initial_hit = True
    for counter in range(0,oid_count,extraction_amount):
        # index of the last id in this page, clamped to the end of the list
        int_max = min(counter + extraction_amount,oid_count) - 1
        where = oid_name + ' >= ' + str(ary_oid[counter]) + ' AND ' + oid_name + ' <= ' + str(ary_oid[int_max])
        print("  pulling records where " + where)
        page = _ags_post_json(
             host
            ,path + "/query"
            ,{"where": where, "outFields": "*", "returnGeometry": "true", "f": "json"}
        )
        ef = arcpy.AsShape(page,True)
        if initial_hit:
            # the first page defines the schema of the output feature class
            arcpy.management.CopyFeatures(ef,target)
            initial_hit = False
        else:
            arcpy.Append_management(ef,target,"NO_TEST")

    print("  Scrape complete.")
    return True


### Download CRWU_CREAT_Grid_Projections from EPA Geoplatform

In [3]:
# Layer endpoint on the EPA Geoplatform and the name of the local copy
host = "services.arcgis.com"
path = "/cJ9YHowT8TU7DUyn/ArcGIS/rest/services/CRWU_CREAT_Grid_Projections/FeatureServer/0"
fc   = "CRWU_CREAT_Grid_Projections"

# Start from a clean slate so the cell can be re-run
if arcpy.Exists(fgdb + os.sep + fc):
    arcpy.Delete_management(fgdb + os.sep + fc)

z = scrape_ags(host,path,fgdb,fc)

# Attribute indexes on the fields listed below
for index_field,index_label in (
     ('CREAT_ID','CREAT_ID_IDX')
    ,('GRIDCODE','GRIDCODE_IDX')
):
    arcpy.management.AddIndex(
         in_table   = fgdb + os.sep + fc
        ,fields     = index_field
        ,index_name = index_label
    )

print("DONE")


  pulling records where OBJECTID >= 1 AND OBJECTID <= 2000
  pulling records where OBJECTID >= 2001 AND OBJECTID <= 4000
  pulling records where OBJECTID >= 4001 AND OBJECTID <= 6000
  pulling records where OBJECTID >= 6001 AND OBJECTID <= 8000
  pulling records where OBJECTID >= 8001 AND OBJECTID <= 10000
  pulling records where OBJECTID >= 10001 AND OBJECTID <= 12000
  pulling records where OBJECTID >= 12001 AND OBJECTID <= 14000
  pulling records where OBJECTID >= 14001 AND OBJECTID <= 16000
  pulling records where OBJECTID >= 16001 AND OBJECTID <= 18000
  pulling records where OBJECTID >= 18001 AND OBJECTID <= 20000
  pulling records where OBJECTID >= 20001 AND OBJECTID <= 22000
  pulling records where OBJECTID >= 22001 AND OBJECTID <= 24000
  pulling records where OBJECTID >= 24001 AND OBJECTID <= 24743
  Scrape complete.
DONE


### Download CRWU_CREAT_Historic_Climate_Stations from EPA Geoplatform

In [4]:
# Layer endpoint on the EPA Geoplatform and the name of the local copy
host = "services.arcgis.com"
path = "/cJ9YHowT8TU7DUyn/ArcGIS/rest/services/CRWU_CREAT_Historic_Climate_Stations/FeatureServer/0"
fc   = "CRWU_CREAT_Historic_Climate_Stations"

# Start from a clean slate so the cell can be re-run
if arcpy.Exists(fgdb + os.sep + fc):
    arcpy.Delete_management(fgdb + os.sep + fc)

z = scrape_ags(host,path,fgdb,fc)

# Attribute indexes on the fields listed below
for index_field,index_label in (
     ('CLIMATE_STATION_PK_ID','CLIMATE_STATION_PK_ID_IDX')
    ,('NOAA_STATION_ID','NOAA_STATION_ID_IDX')
):
    arcpy.management.AddIndex(
         in_table   = fgdb + os.sep + fc
        ,fields     = index_field
        ,index_name = index_label
    )

print("DONE")


  pulling records where OBJECTID >= 1 AND OBJECTID <= 2000
  pulling records where OBJECTID >= 2001 AND OBJECTID <= 4000
  pulling records where OBJECTID >= 4001 AND OBJECTID <= 6000
  pulling records where OBJECTID >= 6001 AND OBJECTID <= 8000
  pulling records where OBJECTID >= 8001 AND OBJECTID <= 10000
  pulling records where OBJECTID >= 10001 AND OBJECTID <= 11165
  Scrape complete.
DONE


In [5]:
def downloadtab(url,filename,timeout=60):
    """Download a text file from ``url`` to ``filename``, one line at a time.

    Any existing ``filename`` is deleted first.  Each line received is written
    back followed by a single newline byte.

    Parameters:
        url:      address of the file to fetch.
        filename: local path to write.
        timeout:  value handed to ``requests.get`` as its ``timeout`` argument.

    Returns:
        True once the file has been written.

    Raises:
        Whatever ``raise_for_status`` raises for an HTTP error response; in
        that case ``filename`` is not created.
    """
    if arcpy.Exists(filename):
        arcpy.Delete_management(filename)
    print("  downloading file")
    with requests.get(url,stream=True,timeout=timeout) as r:
        # fail loudly instead of saving an error page as if it were the data
        r.raise_for_status()
        # the file is opened only after the response is known to be good
        with open(filename,'wb') as f:
            for line in r.iter_lines():
                f.write(line + b'\n')
    return True
    
def tab2fc(filename,fgdb,fc,longname,latname,field_mapping=None):
    """Load a table file into ``fgdb/fc`` as a point feature class.

    The table is first imported into the ``memory`` workspace as ``tempTable``
    and then converted to points using spatial reference 4269.  The temporary
    table is removed afterwards, including when one of the steps fails.

    Parameters:
        filename:      path of the table to load.
        fgdb:          path of the target file geodatabase.
        fc:            name of the feature class to (re)create.
        longname:      name of the column used as the x coordinate.
        latname:       name of the column used as the y coordinate.
        field_mapping: optional field mapping handed to the table import.

    Returns:
        True on success.
    """
    temp_table = 'memory' + os.sep + 'tempTable'
    target     = fgdb + os.sep + fc

    # Clear leftovers from an earlier run
    if arcpy.Exists(temp_table):
        arcpy.Delete_management(temp_table)

    if arcpy.Exists(target):
        arcpy.Delete_management(target)

    try:
        print("  loading to table")
        arcpy.TableToTable_conversion(
             in_rows       = filename
            ,out_path      = 'memory'
            ,out_name      = 'tempTable'
            ,field_mapping = field_mapping
        )
        print("  converting to NAD83 points")
        arcpy.management.XYTableToPoint(
             in_table          = temp_table
            ,out_feature_class = target
            ,x_field           = longname
            ,y_field           = latname
            ,coordinate_system = arcpy.SpatialReference(4269)
        )
    finally:
        # the import itself may have failed, so only delete what exists
        if arcpy.Exists(temp_table):
            arcpy.Delete_management(temp_table)
    return True

def fmtext(infc,fieldname,fieldlength):
    """Return a FieldMap that outputs ``fieldname`` of ``infc`` as text.

    The output field type is set to 'Text' and its length to ``fieldlength``.
    """
    field_map = arcpy.FieldMap()
    field_map.addInputField(infc,fieldname)

    # Fetch the output field, adjust it, then assign it back to the map
    out_field = field_map.outputField
    out_field.type = 'Text'
    out_field.length = fieldlength
    field_map.outputField = out_field

    return field_map

def fmint(infc,fieldname):
    """Return a FieldMap that outputs ``fieldname`` of ``infc`` with type 'Integer'."""
    field_map = arcpy.FieldMap()
    field_map.addInputField(infc,fieldname)

    # Fetch the output field, adjust it, then assign it back to the map
    out_field = field_map.outputField
    out_field.type = 'Integer'
    field_map.outputField = out_field

    return field_map

def fmdouble(infc,fieldname):
    """Return a FieldMap that outputs ``fieldname`` of ``infc`` with type 'Double'."""
    field_map = arcpy.FieldMap()
    field_map.addInputField(infc,fieldname)

    # Fetch the output field, adjust it, then assign it back to the map
    out_field = field_map.outputField
    out_field.type = 'Double'
    field_map.outputField = out_field

    return field_map
    

### Download COOP_STATIONS_TO_USE dataset

In [6]:
# Source CSV and the name of the point feature class built from it
url = "https://raw.githubusercontent.com/barrc/get_ncei/master/src/coop_stations_to_use.csv"
fc  = 'COOP_STATIONS_TO_USE'

# Stage the CSV in the arcpy scratch folder
tmptab = arcpy.env.scratchFolder + os.sep + 'tempTable.csv'
z = downloadtab(url,tmptab)

# Every column is mapped as 255-character text except the two coordinates,
# which are mapped as doubles.  Column order is the order of the output fields.
fms = arcpy.FieldMappings()
for column in (
     'station_id'
    ,'station_name'
    ,'state'
    ,'start_date'
    ,'end_date'
    ,'latitude'
    ,'longitude'
    ,'in_basins'
    ,'break_with_basins'
    ,'network'
    ,'start_date_to_use'
    ,'end_date_to_use'
):
    if column in ('latitude','longitude'):
        fms.addFieldMap(fmdouble(tmptab,column))
    else:
        fms.addFieldMap(fmtext(tmptab,column,255))

z = tab2fc(tmptab,fgdb,fc,'longitude','latitude',fms)

print("  add quotes to start and end fields")
# Field-calculator helper: turns m/d/yyyy text into quoted 'yyyy/mm/dd' text
cb_cleanDate = """
def cleanDate(pin):
    (mm,dd,yyyy) = pin.split('/');
    if mm in ['1','2','3','4','5','6','7','8','9']:
       mm = '0' + mm;
    if dd in ['1','2','3','4','5','6','7','8','9']:
       dd = '0' + dd;
    return "'" + yyyy + "/" + mm + "/" + dd + "'";
    
"""

# One new text field per date column, filled by cleanDate
for clean_field,clean_expression in (
     ('start_date_clean'       ,"cleanDate(!start_date!)")
    ,('end_date_clean'         ,"cleanDate(!end_date!)")
    ,('start_date_to_use_clean',"cleanDate(!start_date_to_use!)")
    ,('end_date_to_use_clean'  ,"cleanDate(!end_date_to_use!)")
):
    arcpy.management.AddField(
         in_table     = fgdb + os.sep + fc
        ,field_name   = clean_field
        ,field_type   = 'Text'
        ,field_length = 255
        ,field_alias  = clean_field
    )
    arcpy.management.CalculateField(
         in_table        = fgdb + os.sep + fc
        ,field           = clean_field
        ,expression      = clean_expression
        ,expression_type = 'PYTHON3'
        ,code_block      = cb_cleanDate
    )

print("  calculating year count")
# Field-calculator helper: days between the two dates divided by 365, rounded
cb_yearCount = """
import datetime;
def yearCount(pstart,pend):
    d1 = datetime.datetime.strptime(pstart,"%m/%d/%Y");
    d2 = datetime.datetime.strptime(pend  ,"%m/%d/%Y");
    yr = round((d2 - d1).days / 365);
    return yr + 0.0;
    
"""

arcpy.management.AddField(
     in_table     = fgdb + os.sep + fc
    ,field_name   = 'year_count'
    ,field_type   = 'Double'
    ,field_alias  = 'year_count'
)
arcpy.management.CalculateField(
     in_table        = fgdb + os.sep + fc
    ,field           = 'year_count'
    ,expression      = 'yearCount(!start_date_to_use!,!end_date_to_use!)'
    ,expression_type = 'PYTHON3'
    ,code_block      = cb_yearCount
)

print("  adding indexes")
arcpy.management.AddIndex(
     in_table      = fgdb + os.sep + fc
    ,fields        = 'station_id'
    ,index_name    = 'station_id_IDX'
)

print("DONE")


  downloading file
  loading to table
  converting to NAD83 points
  add quotes to start and end fields
  calculating year count
  adding indexes
DONE


### Download ISD_STATIONS_TO_USE dataset

In [7]:
# Source CSV and the name of the point feature class built from it
url = "https://raw.githubusercontent.com/barrc/get_ncei/master/src/isd_stations_to_use.csv"
fc  = 'ISD_STATIONS_TO_USE'

# Stage the CSV in the arcpy scratch folder
tmptab = arcpy.env.scratchFolder + os.sep + 'tempTable.csv'
z = downloadtab(url,tmptab)

# Every column is mapped as 255-character text except the two coordinates,
# which are mapped as doubles.  Column order is the order of the output fields.
fms = arcpy.FieldMappings()
for column in (
     'station_id'
    ,'station_name'
    ,'state'
    ,'start_date'
    ,'end_date'
    ,'latitude'
    ,'longitude'
    ,'in_basins'
    ,'break_with_basins'
    ,'network'
):
    if column in ('latitude','longitude'):
        fms.addFieldMap(fmdouble(tmptab,column))
    else:
        fms.addFieldMap(fmtext(tmptab,column,255))

z = tab2fc(tmptab,fgdb,fc,'longitude','latitude',fms)

print("  add quotes to start and end fields")
# NOTE: cb_cleanDate and cb_yearCount are the code blocks defined in the
# COOP_STATIONS_TO_USE cell; that cell must have been run first.
for clean_field,clean_expression in (
     ('start_date_clean',"cleanDate(!start_date!)")
    ,('end_date_clean'  ,"cleanDate(!end_date!)")
):
    arcpy.management.AddField(
         in_table     = fgdb + os.sep + fc
        ,field_name   = clean_field
        ,field_type   = 'Text'
        ,field_length = 255
        ,field_alias  = clean_field
    )
    arcpy.management.CalculateField(
         in_table        = fgdb + os.sep + fc
        ,field           = clean_field
        ,expression      = clean_expression
        ,expression_type = 'PYTHON3'
        ,code_block      = cb_cleanDate
    )

print("  calculating year count")
arcpy.management.AddField(
     in_table     = fgdb + os.sep + fc
    ,field_name   = 'year_count'
    ,field_type   = 'Double'
    ,field_alias  = 'year_count'
)
arcpy.management.CalculateField(
     in_table        = fgdb + os.sep + fc
    ,field           = 'year_count'
    ,expression      = 'yearCount(!start_date!,!end_date!)'
    ,expression_type = 'PYTHON3'
    ,code_block      = cb_yearCount
)

print("  adding indexes")
arcpy.management.AddIndex(
     in_table   = fgdb + os.sep + fc
    ,fields     = 'station_id'
    ,index_name = 'station_id_IDX'
)

print("DONE")


  downloading file
  loading to table
  converting to NAD83 points
  add quotes to start and end fields
  calculating year count
  adding indexes
DONE
