In [1]:
#!/usr/bin/env python
import pandas as pd
import numpy as np
import os
from datetime import datetime
from dateutil.parser import parse
import shapefile
import pygeoif

In [2]:
# working directory
# Every file name used in the cells below is relative to this folder.
working_dir = "./ProcessedInputData/"

# os.chdir() with a relative path is not idempotent: re-running this cell would
# resolve "./ProcessedInputData/" against the folder we already moved into and
# fail. Remember where the kernel started (only once per kernel) and always
# resolve working_dir against that location instead.
if "_notebook_start_dir" not in globals():
    _notebook_start_dir = os.getcwd()
os.chdir(os.path.join(_notebook_start_dir, working_dir))

In [3]:
# Columns of the output reporting-units table, listed in output order.
target_columns = [
    "ReportingUnitUUID",
    "ReportingUnitNativeID",
    "ReportingUnitName",
    "ReportingUnitTypeCV",
    "ReportingUnitUpdateDate",
    "ReportingUnitProductVersion",
    "StateCV",
    "EPSGCodeCV",
    "Geometry",
]

In [4]:
# Empty output frame that already carries the target columns in order;
# the cells below fill these columns in.
outdf100 = pd.DataFrame(data=None, columns=target_columns)

In [5]:
# Input files
# Just one year of data is sufficient for reporting units, as this is a dimension table.
fileInput1 = "CA-DWR-WaterBalance-Level2-DP-1000-2015-DAUCO.csv"

In [6]:
print("Reading inputs...")

# The source CSV is decoded with the ISO-8859-1 (Latin-1) codec.
df100 = pd.read_csv(fileInput1, encoding="ISO-8859-1")
print(df100.shape[0])  # number of rows read
df100

Reading inputs...
202329


Unnamed: 0,CategoryA,CategoryC,DAU,DAU_NAME,HR_CODE,HR_NAME,PA,Longitude,Latitude,KAcreFt,Year,CategoryD,CategoryB
0,Agriculture,Applied Water,DAU04827,Pressure,3,Central Coast,301,-121.637711,36.639420,100.5,2015,AG1,1
1,Agriculture,Applied Water - Groundwater Recharge,DAU04827,Pressure,3,Central Coast,301,-121.637711,36.639420,0.0,2015,AG2,2
2,Agriculture,Conveyance Deep Percolation,DAU04827,Pressure,3,Central Coast,301,-121.637711,36.639420,0.0,2015,AG22,22
3,Agriculture,Conveyance Deep Percolation to Mexico,DAU04827,Pressure,3,Central Coast,301,-121.637711,36.639420,0.0,2015,AG18F,18f
4,Agriculture,Conveyance Deep Percolation to Nevada,DAU04827,Pressure,3,Central Coast,301,-121.637711,36.639420,0.0,2015,AG18E,18e
5,Agriculture,Conveyance Deep Percolation to Oregon,DAU04827,Pressure,3,Central Coast,301,-121.637711,36.639420,0.0,2015,AG18D,18d
6,Agriculture,Conveyance Deep Percolation to Salt Sink,DAU04827,Pressure,3,Central Coast,301,-121.637711,36.639420,0.0,2015,AG23,23
7,Agriculture,Conveyance Evaporation and ETAW,DAU04827,Pressure,3,Central Coast,301,-121.637711,36.639420,0.0,2015,AG17,17
8,Agriculture,Conveyance Return Flow for Delta Outflow,DAU04827,Pressure,3,Central Coast,301,-121.637711,36.639420,0.0,2015,AG19B,19b
9,Agriculture,Conveyance Return Flow to Developed Supply (Ot...,DAU04827,Pressure,3,Central Coast,301,-121.637711,36.639420,0.0,2015,AG20A,20a


In [7]:
print("Take unique DAUs...")
# Keep the first row seen for each DAU code, then renumber the index from 0.
df100 = df100.drop_duplicates(subset=['DAU']).reset_index(drop=True)

print(df100.shape[0])
df100

Take unique DAUs...
485


Unnamed: 0,CategoryA,CategoryC,DAU,DAU_NAME,HR_CODE,HR_NAME,PA,Longitude,Latitude,KAcreFt,Year,CategoryD,CategoryB
0,Agriculture,Applied Water,DAU04827,Pressure,3,Central Coast,301,-121.637711,36.639420,100.5,2015,AG1,1
1,Agriculture,Applied Water,DAU01949,Gualala,1,North Coast,103,-123.280350,38.663926,0.4,2015,AG1,1
2,Agriculture,Applied Water,DAU30336,Indian Wells,9,South Lahontan,902,-117.577346,35.706490,0.0,2015,AG1,1
3,Agriculture,Applied Water,DAU25316,Temblor,7,Tulare Lake,701,-120.173356,35.797765,0.0,2015,AG1,1
4,Agriculture,Applied Water,DAU29814,Panamint,9,South Lahontan,903,-117.377794,36.194401,0.0,2015,AG1,1
5,Agriculture,Applied Water,DAU24416,Westlands,7,Tulare Lake,702,-119.954043,36.167506,122.6,2015,AG1,1
6,Agriculture,Applied Water,DAU04241,San Mateo Coast,2,San Francisco Bay,202,-122.346153,37.378035,3.2,2015,AG1,1
7,Agriculture,Applied Water,DAU32736,Dale,10,Colorado River,1001,-115.910590,34.165188,0.2,2015,AG1,1
8,Agriculture,Applied Water,DAU15834,American River,5,Sacramento River,508,-121.134135,38.705049,0.0,2015,AG1,1
9,Agriculture,Applied Water,DAU21650,West Side,6,San Joaquin River,606,-121.141144,37.453185,403.9,2015,AG1,1


In [8]:
print("point shapes from lat lon...")

def createPointWKTfromLatLon(lon1, lat1):
    """Return the WKT text of the point at the given longitude/latitude.

    Args:
        lon1: Longitude (x coordinate); anything accepted by ``float()``.
        lat1: Latitude (y coordinate); anything accepted by ``float()``.

    Returns:
        str: WKT with x (longitude) first, e.g.
        ``"POINT (-121.141144 37.45318454)"``.

    Raises:
        ValueError, TypeError: if either value cannot be converted by
            ``float()``.
    """
    lat = float(lat1)
    lon = float(lon1)
    # Build the geometry directly in memory. The previous implementation wrote
    # a 'dummyShape' shapefile into the working directory, read it back and
    # deleted it on every call: slow, it left files behind if anything raised
    # in between, and it would clobber/delete a real 'dummyShape.*' file. The
    # shapefile round trip stores the coordinates as doubles unchanged, so the
    # WKT text is the same.
    return pygeoif.geometry.Point(lon, lat).wkt


def _row_to_point_wkt(record):
    """Return the point WKT for one df100 row from its Longitude/Latitude."""
    return createPointWKTfromLatLon(record["Longitude"], record["Latitude"])


# Add the Geometry column (blank first), then fill it row by row.
df100 = df100.assign(Geometry='')
df100['Geometry'] = df100.apply(_row_to_point_wkt, axis=1)

df100

point shapes from lat lon...


Unnamed: 0,CategoryA,CategoryC,DAU,DAU_NAME,HR_CODE,HR_NAME,PA,Longitude,Latitude,KAcreFt,Year,CategoryD,CategoryB,Geometry
0,Agriculture,Applied Water,DAU04827,Pressure,3,Central Coast,301,-121.637711,36.639420,100.5,2015,AG1,1,POINT (-121.63771059999999 36.63941973)
1,Agriculture,Applied Water,DAU01949,Gualala,1,North Coast,103,-123.280350,38.663926,0.4,2015,AG1,1,POINT (-123.28035020000002 38.66392589)
2,Agriculture,Applied Water,DAU30336,Indian Wells,9,South Lahontan,902,-117.577346,35.706490,0.0,2015,AG1,1,POINT (-117.57734599999999 35.70648965)
3,Agriculture,Applied Water,DAU25316,Temblor,7,Tulare Lake,701,-120.173356,35.797765,0.0,2015,AG1,1,POINT (-120.1733558 35.79776478)
4,Agriculture,Applied Water,DAU29814,Panamint,9,South Lahontan,903,-117.377794,36.194401,0.0,2015,AG1,1,POINT (-117.37779429999999 36.19440062)
5,Agriculture,Applied Water,DAU24416,Westlands,7,Tulare Lake,702,-119.954043,36.167506,122.6,2015,AG1,1,POINT (-119.9540433 36.16750559)
6,Agriculture,Applied Water,DAU04241,San Mateo Coast,2,San Francisco Bay,202,-122.346153,37.378035,3.2,2015,AG1,1,POINT (-122.34615259999998 37.37803468)
7,Agriculture,Applied Water,DAU32736,Dale,10,Colorado River,1001,-115.910590,34.165188,0.2,2015,AG1,1,POINT (-115.9105895 34.16518813)
8,Agriculture,Applied Water,DAU15834,American River,5,Sacramento River,508,-121.134135,38.705049,0.0,2015,AG1,1,POINT (-121.13413529999998 38.70504926)
9,Agriculture,Applied Water,DAU21650,West Side,6,San Joaquin River,606,-121.141144,37.453185,403.9,2015,AG1,1,POINT (-121.141144 37.45318454)


In [9]:
print("Copying Columns...")

# Source columns in df100 and their destination columns in outdf100,
# paired by position.
srsCols = ["DAU", "DAU_NAME", "Geometry"]
destCols = ["ReportingUnitNativeID", "ReportingUnitName", "Geometry"]

for srcCol, destCol in zip(srsCols, destCols):
    outdf100[destCol] = df100[srcCol]

Copying Columns...


In [10]:
print("ReportingUnitUUID...")

# ReportingUnitUUID = "CA_" + ReportingUnitNativeID
# Series.map works element by element on the one column that is needed.
# Row-wise DataFrame.apply(axis=1) builds a Series for every row and, on an
# empty frame, returns a DataFrame instead of a Series, which breaks the
# single-column assignment.
outdf100['ReportingUnitUUID'] = outdf100['ReportingUnitNativeID'].map(
    lambda native_id: "_".join(["CA", str(native_id)])
)


ReportingUnitUUID...


In [11]:
# hardcoded
# Constant values written to every row of the (already existing) columns.
hardcoded_values = {
    "ReportingUnitTypeCV": "Detailed Analysis Unit by County (DAUCO)",
    "StateCV": "CA",
    "EPSGCodeCV": "EPSG:4326",
    "ReportingUnitUpdateDate": datetime.now().strftime('%m/%d/%Y'),
}
for column_name, constant in hardcoded_values.items():
    outdf100[column_name] = constant

# replace NaN with blank cells
outdf100 = outdf100.replace(np.nan, '')
outdf100

Unnamed: 0,ReportingUnitUUID,ReportingUnitNativeID,ReportingUnitName,ReportingUnitTypeCV,ReportingUnitUpdateDate,ReportingUnitProductVersion,StateCV,EPSGCodeCV,Geometry
0,CA_DAU04827,DAU04827,Pressure,Detailed Analysis Unit by County (DAUCO),01/02/2020,,CA,EPSG:4326,POINT (-121.63771059999999 36.63941973)
1,CA_DAU01949,DAU01949,Gualala,Detailed Analysis Unit by County (DAUCO),01/02/2020,,CA,EPSG:4326,POINT (-123.28035020000002 38.66392589)
2,CA_DAU30336,DAU30336,Indian Wells,Detailed Analysis Unit by County (DAUCO),01/02/2020,,CA,EPSG:4326,POINT (-117.57734599999999 35.70648965)
3,CA_DAU25316,DAU25316,Temblor,Detailed Analysis Unit by County (DAUCO),01/02/2020,,CA,EPSG:4326,POINT (-120.1733558 35.79776478)
4,CA_DAU29814,DAU29814,Panamint,Detailed Analysis Unit by County (DAUCO),01/02/2020,,CA,EPSG:4326,POINT (-117.37779429999999 36.19440062)
5,CA_DAU24416,DAU24416,Westlands,Detailed Analysis Unit by County (DAUCO),01/02/2020,,CA,EPSG:4326,POINT (-119.9540433 36.16750559)
6,CA_DAU04241,DAU04241,San Mateo Coast,Detailed Analysis Unit by County (DAUCO),01/02/2020,,CA,EPSG:4326,POINT (-122.34615259999998 37.37803468)
7,CA_DAU32736,DAU32736,Dale,Detailed Analysis Unit by County (DAUCO),01/02/2020,,CA,EPSG:4326,POINT (-115.9105895 34.16518813)
8,CA_DAU15834,DAU15834,American River,Detailed Analysis Unit by County (DAUCO),01/02/2020,,CA,EPSG:4326,POINT (-121.13413529999998 38.70504926)
9,CA_DAU21650,DAU21650,West Side,Detailed Analysis Unit by County (DAUCO),01/02/2020,,CA,EPSG:4326,POINT (-121.141144 37.45318454)


In [12]:
print("Droping duplicates...")
# Rows that exactly repeat an earlier row (all columns equal).
is_repeat = outdf100.duplicated()
outdf100Duplicated = outdf100.loc[is_repeat]
if outdf100Duplicated.shape[0] > 0:
    # Save the removed rows (index column included) for inspection, then
    # de-duplicate the output table and renumber its index from 0.
    outdf100Duplicated.to_csv("reportingunits_duplicaterows.csv")
    outdf100 = outdf100.drop_duplicates().reset_index(drop=True)

Droping duplicates...


In [14]:
print("Writing out...")

# Write the reporting-units table as UTF-8 CSV without the index column.
out_repunit = 'reportingunits.csv'
outdf100.to_csv(path_or_buf=out_repunit, encoding="utf-8", index=False)

print("Done Reporting units")

Writing out...
Done Reporting units
