# Combine data

In [1]:
import pandas as pd
import os
import numpy as np
from itertools import product
from shapely.geometry import LinearRing, Polygon, Point, LineString
import geopandas as gpd
import matplotlib.pyplot as plt
from math import radians, sin, cos, asin, sqrt, atan2
from time import time

%matplotlib inline

pickles = '../data/pickles/'

# Read Pickle Files

In [2]:
os.listdir(pickles)

['bathymetry.pkl',
 'ca_counties.pkl',
 'crit_species.pkl',
 'kelp_biomass.pkl',
 'mpa.pkl',
 'nes_occupation.pkl',
 'qcew_occupation.pkl',
 'sst_ca.pkl']

In [3]:
#we will need to convert some geometries from lat, lon to lon,lat
def flip_coords(geom):
    if type(geom) == Polygon:
        coords = geom.exterior.coords.xy 
        geomtype = 'Polygon'
    elif type(geom) == LineString:
        coords = geom.coords.xy
        geomtype = 'LineString'
    elif type(geom) == Point:
        coords = geom.coords.xy
        geomtype = 'Point'
    else:
        print("not a known geom type")
        return 
    flipped_coords = list(zip(coords[1],coords[0]))
    if geomtype == 'Polygon':
        return Polygon(flipped_coords) 
    elif geomtype == 'LineString':
        return LineString(flipped_coords)
    elif geomtype == 'Point':
        return Point(flipped_coords)

In [4]:
#Pickle files come from data/read_data.ipynb and
#bathymetry is not a gpd df
bathymetry = pd.read_pickle(pickles+'bathymetry.pkl')
bathymetry.head()

Unnamed: 0,x,y,z
0,-128.0,37.0,
1,-128.0,37.000833,
2,-128.0,37.001667,
3,-128.0,37.0025,
4,-128.0,37.003333,


In [36]:
#in lon, lat format
counties = pd.read_pickle(pickles+'ca_counties.pkl')
counties.head()

Unnamed: 0,STATEFP,COUNTYFP,COUNTYNS,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER,geometry
0,6,75,277302,0500000US06075,6075,San Francisco,6,121485107,479107241,"POLYGON ((-122.511983 37.77113, -122.465396 37..."
705,6,87,277308,0500000US06087,6087,Santa Cruz,6,1152967426,419570389,"POLYGON ((-122.317682 37.186945, -122.152774 3..."
713,6,31,277280,0500000US06031,6031,Kings,6,3598593366,5453080,"POLYGON ((-120.315068 35.907186, -119.959058 3..."
899,6,85,277307,0500000US06085,6085,Santa Clara,6,3344209919,33276429,"POLYGON ((-122.190402 37.431472, -122.144933 3..."
915,6,21,277275,0500000US06021,6021,Glenn,6,3403149421,33704225,"POLYGON ((-122.911794 39.799485, -122.046471 3..."


In [45]:
#in lon lat format except for leatherbacks which is all messed up
crit_species = pd.read_pickle(pickles+'crit_species.pkl')
crit_species.head()

Unnamed: 0,species,geometry
0,black abalone,"POLYGON ((-123.5053264870277 38.7405412355896,..."
1,black abalone,POLYGON ((-123.0726524775298 38.31072869617242...
2,black abalone,POLYGON ((-122.5006060204515 37.50355915762907...
3,black abalone,POLYGON ((-122.5097393606829 37.50326748590578...
4,black abalone,POLYGON ((-122.4114904047584 37.26741997175435...


In [7]:
#in lat lon format
protected_areas = pd.read_pickle(pickles+'mpa.pkl')
protected_areas.Geometry = protected_areas.Geometry.apply(flip_coords)
protected_areas = gpd.GeoDataFrame(protected_areas, crs = {'init' :'epsg:4326'}, geometry='Geometry')

In [8]:
#does not have geom. just county names (and fips state and county codes)
occupation_nes = pd.read_pickle(pickles+'nes_occupation.pkl')
occupation_nes.head()

Unnamed: 0,estab,estab_f,naics,year,State,fips_state,fips_county,name,STATE,COUNTY,STNAME,CTYNAME,population,pct
0,42,,11411,2010,CA,6,1,Alameda County,6,1,California,Alameda County,1513402,2.8e-05
1,71,,11411,2011,CA,6,1,Alameda County,6,1,California,Alameda County,1532215,4.6e-05
2,63,,11411,2012,CA,6,1,Alameda County,6,1,California,Alameda County,1556648,4e-05
3,65,,11411,2013,CA,6,1,Alameda County,6,1,California,Alameda County,1582936,4.1e-05
4,64,,11411,2014,CA,6,1,Alameda County,6,1,California,Alameda County,1611572,4e-05


In [9]:
#does not have geom. just county names (and fips state and county codes)
occupation_qcew = pd.read_pickle(pickles+'qcew_occupation.pkl')
occupation_qcew.head()

Unnamed: 0,area_fips,industry_code,agglvl_code,size_code,year,disclosure_code,annual_avg_estabs,annual_avg_emplvl,fips_county,fips_state,State,name,fips_class_code,STATE,COUNTY,STNAME,CTYNAME,population,estab_pct,emp_pct
0,6007,11251,77,0,2013,N,1,0,7,6,CA,Butte County,H1,6,7,California,Butte County,221768,5e-06,0.0
1,6015,11251,77,0,2013,N,1,0,15,6,CA,Del Norte County,H1,6,15,California,Del Norte County,27830,3.6e-05,0.0
2,6015,11411,77,0,2013,,39,81,15,6,CA,Del Norte County,H1,6,15,California,Del Norte County,27830,0.001401,0.002911
3,6019,11251,77,0,2013,N,4,0,19,6,CA,Fresno County,H1,6,19,California,Fresno County,953787,4e-06,0.0
4,6023,11251,77,0,2013,,6,19,23,6,CA,Humboldt County,H1,6,23,California,Humboldt County,134444,4.5e-05,0.000141


In [10]:
#in lat lon format
kelp_biomass =pd.read_pickle(pickles+'kelp_biomass.pkl')
kelp_biomass.head()

Unnamed: 0,lat,lon,biomass,geometry
0,32.519277,-117.142836,265.0,POINT (32.51927713 -117.1428356)
1,32.519547,-117.143155,635.0,POINT (32.5195474 -117.1431554)
2,32.519548,-117.142836,275.0,POINT (32.51954776 -117.142836)
3,32.519548,-117.142197,272.0,POINT (32.51954849 -117.1421972)
4,32.519818,-117.143156,565.0,POINT (32.51981803 -117.1431559)


In [11]:
#will be converted to lon lat format
sst = pd.read_pickle(pickles+'sst_ca.pkl')
lons = list(sst.lon)
lats = list(sst.lat)
coords = list(zip(lons,lats))
sst['geom'] = [Point(i) for i in coords]
sst = gpd.GeoDataFrame(sst, crs = {'init' :'epsg:4326'}, geometry='geom')

# Create grid of California

In [12]:
def squarify(point, increment):
    return Polygon([(point[0], point[1]),
     (point[0] + increment, point[1]), 
     (point[0] + increment, point[1] + increment), 
     (point[0], point[1] + increment)])

def desquarify(left_vertices, right_vertices, square_length):
    x_num = round((right_vertices[0] - left_vertices[0])/square_length, 0)
    y_num = round((right_vertices[1] - left_vertices[1])/square_length, 0)
    x_lengths = np.linspace(left_vertices[0], right_vertices[0]-square_length, x_num)
    y_lengths = np.linspace(left_vertices[1], right_vertices[1]-square_length, y_num)
    return [p for p in product(x_lengths, y_lengths)]

In [53]:
#create vertices for California
#roughly the bounds of california going out 8km past cape mendocino
width = .1
#(32.534343,-124.511849),(42.00,-117.123885)
grid_bottomleft_vertices = desquarify((-124.511849,32.534343),(-117.123885,42), width)
grid = [squarify(i, width) for i in grid_bottomleft_vertices]
grid_df = gpd.GeoDataFrame(geometry=grid, crs = {'init' : 'epsg:4326'})
grid_df["polygon_id"] = range(len(grid))
print(len(grid))
grid_df.head()

7030


  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,geometry,polygon_id
0,"POLYGON ((-124.511849 32.534343, -124.411849 3...",0
1,"POLYGON ((-124.511849 32.63397764893617, -124....",1
2,"POLYGON ((-124.511849 32.73361229787234, -124....",2
3,"POLYGON ((-124.511849 32.83324694680851, -124....",3
4,"POLYGON ((-124.511849 32.93288159574468, -124....",4


# Combine Data

In [16]:
def haversine(p1,p2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees).
    Source: https://gis.stackexchange.com/a/56589/15183
    """
    lon1 = p1[0]
    lat1 = p1[1]
    lon2 = p2[0]
    lat2 = p2[1]
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    km = 6373 * c
    return km

In [23]:
#just get the base distance for one point b/c this will be close to the the right scaling
#add half a degee in lat and lon to get the second point
def get_distance_ratio(shape):
    p1 = list(shape.representative_point().coords)[0]
    p2 = (p1[0]+0.5,p1[1]+0.5)
    d_km = haversine(p1,p2)
    #just get the base distance for one point b/c this will be close to the the right scaling for all p2s
    base_coord_d=Point(p1).distance(Point(p2))
    return d_km/base_coord_d

In [55]:
def get_closest_areas(shape,areas, area_names,max_distance, min_returned=3):
    distance_ratio = get_distance_ratio(shape)
    distances = [shape.distance(i)*distance_ratio for i in areas]
    sorted_distances = np.sort(distances)
    areas_within_max = sorted_distances < max_distance
    num_close_areas = sum(areas_within_max)
    if num_close_areas < min_returned:
        lowest_distances = sorted_distances[:min_returned]
        area_idx = np.argsort(distances)[:min_returned]
    else:
        lowest_distances = sorted_distances[:num_close_areas]
        area_idx = np.argsort(distances)[:num_close_areas]
    
    names = area_names.iloc[area_idx]
    return list(zip(area_idx, names, lowest_distances))

In [17]:
#distance in km does seem to vary for the same difference in latitude/long depending on your latitude.
print(haversine((-122,32.5),(-121,32.5)))
print(haversine((-122,32.5),(-122,33.5)))
print(haversine((-122,42),(-121,42)))
print(haversine((-122,42),(-122,41)))

93.80994612038263
111.22983322959878
82.65940525349976
111.22983322959878


In [18]:
protected_areas.head()


Unnamed: 0,Name,Geometry
0,Pyramid Point State Marine Conservation Area,"POLYGON ((-124.21225 42, -124.330233 42, -124...."
1,Point St. George Reef Offshore State Marine Co...,"POLYGON ((-124.3864833 41.866667, -124.4300833..."
2,Southwest Seal Rock Special Closure,POINT (-124.35165 41.8136)
3,Castle Rock Special Closure,POINT (-124.24915 41.761767)
4,False Klamath Rock Special Closure,POINT (-124.11165 41.593883)


In [19]:
#find the distance between one geometry and a series of other geometries, presumably from a dataset
grid_df['protected_areas'] = grid_df.geometry.apply(lambda x: [x.distance(i) for i in protected_areas.Geometry])
grid_df.head()

Unnamed: 0,geometry,polygon_id,protected_areas
0,"POLYGON ((-124.511849 32.534343, -124.411849 3...",0,"[9.324852592739687, 9.182324, 9.17945439509615..."
1,"POLYGON ((-124.511849 32.63397764893617, -124....",1,"[9.225227260015009, 9.082689351063834, 9.07982..."
2,"POLYGON ((-124.511849 32.73361229787234, -124....",2,"[9.125602130712315, 8.983054702127658, 8.98018..."
3,"POLYGON ((-124.511849 32.83324694680851, -124....",3,"[9.025977211567488, 8.883420053191491, 8.88055..."
4,"POLYGON ((-124.511849 32.93288159574468, -124....",4,"[8.926352509617086, 8.783785404255317, 8.78092..."


In [34]:
grid_df['pretected_areas'] = grid_df.geometry.apply(lambda x: get_closest_areas(x,protected_areas.Geometry,\
                                                                                protected_areas.Name,\
                                                                                10))
grid_df['county'] = grid_df.geometry.apply(lambda x: get_closest_areas(x,counties.geometry,\
                                                                                counties.NAME,\
                                                                                10))
grid_df.head()

Unnamed: 0,geometry,polygon_id,pretected_areas
0,"POLYGON ((-124.511849 32.534343, -124.411849 3...",0,"[(Naples State Marine Conservation Area, 398.4..."
1,"POLYGON ((-124.511849 32.63397764893617, -124....",1,"[(Naples State Marine Conservation Area, 393.6..."
2,"POLYGON ((-124.511849 32.73361229787234, -124....",2,"[(Naples State Marine Conservation Area, 389.1..."
3,"POLYGON ((-124.511849 32.83324694680851, -124....",3,"[(Naples State Marine Conservation Area, 384.7..."
4,"POLYGON ((-124.511849 32.93288159574468, -124....",4,"[(Naples State Marine Conservation Area, 380.6..."


In [56]:
grid_df['crit_species_habitat'] = grid_df.geometry.apply(lambda x: get_closest_areas(x,counties.geometry,\
                                                                                counties.NAME,\
                                                                                10))
grid_df.head()

Unnamed: 0,geometry,polygon_id,crit_species_habitat
0,"POLYGON ((-124.511849 32.534343, -124.411849 3...",0,"[(35, Santa Barbara, 431.04802332908645), (55,..."
1,"POLYGON ((-124.511849 32.63397764893617, -124....",1,"[(35, Santa Barbara, 427.55831003243003), (55,..."
2,"POLYGON ((-124.511849 32.73361229787234, -124....",2,"[(35, Santa Barbara, 424.2922921562878), (55, ..."
3,"POLYGON ((-124.511849 32.83324694680851, -124....",3,"[(35, Santa Barbara, 421.25449400615514), (55,..."
4,"POLYGON ((-124.511849 32.93288159574468, -124....",4,"[(35, Santa Barbara, 417.5239574532916), (55, ..."
