# Modeling kelp site viability

In [49]:
import pandas as pd
import os, os.path
import urllib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import numpy as np

%matplotlib inline

In [50]:
dir_name = "../data"
pickles = '../data/pickles/'
gcs_bucket = "https://storage.googleapis.com/w210data/"


#if not os.path.isdir(dir_name):
#    os.mkdir(dir_name)
    
#def download_file(file_name):
#    full_path = os.path.join(dir_name, file_name)
#    if not os.path.exists(full_path):
#        urllib.request.urlretrieve (gcs_bucket+file_name, full_path)

#download_file(pickles+'grid_df_tenth.pkl')

# Read data

In [51]:
grid_df = pd.read_pickle(pickles+'grid_df_hundredth.pkl')
#keep polygon_id as identifier
polygon_id = grid_df.polygon_id
#extract only features relevant to predicting kelp viability
grid_df = grid_df[['biomass','depth','mean_sst','max_sst','min_sst','ndvi',\
                   'z_min_light','z_mixedl','floor_temp','viable','shoretype','shoretype2','aerial_kelp']]
#clean up categorical variables
grid_df.shoretype = grid_df.shoretype.apply(lambda x: x[0][1])
grid_df.shoretype2 = grid_df.shoretype2.apply(lambda x: x[0][1])
grid_df.aerial_kelp = grid_df.aerial_kelp.apply(lambda x: x[0][1])
print(grid_df.shape)
grid_df.head()

(19160, 13)


Unnamed: 0,biomass,depth,mean_sst,max_sst,min_sst,ndvi,z_min_light,z_mixedl,floor_temp,viable,shoretype,shoretype2,aerial_kelp
0,0.0,-38.4,11.929242,14.013989,9.125995,0.008434,34.0,17.955626,8.156,False,Rocky Shores,exposed rocky cliffs,False
1,0.0,-34.8,11.929242,14.013989,9.125995,0.008434,31.0,17.955626,8.156,False,Rocky Shores,exposed rocky cliffs,False
2,0.0,-31.0,11.929242,14.013989,9.125995,0.008434,31.0,17.955626,8.156,True,Rocky Shores,exposed rocky cliffs,False
3,0.0,-31.0,11.929242,14.013989,9.125995,0.008434,28.0,17.955626,8.156,False,Rocky Shores,exposed rocky cliffs,False
4,0.0,-30.4,11.929242,14.013989,9.125995,0.008434,28.0,17.955626,8.156,False,Rocky Shores,exposed rocky cliffs,False


# Create dummy variables for categorical variables

In [52]:
#create dummy variables for the categorical variables
shoretype = pd.get_dummies(grid_df['shoretype'])
shoretype2 = pd.get_dummies(grid_df['shoretype2'])

#drop categorical columns; will add dummy columns subsequently
grid_df.drop(['shoretype','shoretype2'], axis=1, inplace=True)

#add dummy variables for shoretype and shoretype2
grid_df = grid_df.join(shoretype).join(shoretype2)
print(grid_df.shape)
grid_df.head()

(19160, 36)


Unnamed: 0,biomass,depth,mean_sst,max_sst,min_sst,ndvi,z_min_light,z_mixedl,floor_temp,viable,...,riprap,salt and brackish water marshes,salt marshes,scarps and steep slopes in sand,sheltered man-made structures,sheltered riprap,sheltered rocky shores,sheltered tidal flats,vegetated low riverine banks,wave cut rocky platforms
0,0.0,-38.4,11.929242,14.013989,9.125995,0.008434,34.0,17.955626,8.156,False,...,0,0,0,0,0,0,0,0,0,0
1,0.0,-34.8,11.929242,14.013989,9.125995,0.008434,31.0,17.955626,8.156,False,...,0,0,0,0,0,0,0,0,0,0
2,0.0,-31.0,11.929242,14.013989,9.125995,0.008434,31.0,17.955626,8.156,True,...,0,0,0,0,0,0,0,0,0,0
3,0.0,-31.0,11.929242,14.013989,9.125995,0.008434,28.0,17.955626,8.156,False,...,0,0,0,0,0,0,0,0,0,0
4,0.0,-30.4,11.929242,14.013989,9.125995,0.008434,28.0,17.955626,8.156,False,...,0,0,0,0,0,0,0,0,0,0


# split data into train/dev/test sets

In [53]:
#separate features from output classes
target = 'aerial_kelp'
predictors = [c for c in grid_df.columns if not c == target]

discrete = ['shoretype', 'shoretype2']
continuous = [i for i in grid_df.columns if not i in discrete and i != target]

y = grid_df[target]
x = grid_df[predictors]

#create train/dev/test sets
x_train, x_dev, y_train, y_dev = train_test_split(x,y, test_size=0.3, shuffle=True)
x_dev, x_test, y_dev, y_test = train_test_split(x_dev,y_dev, test_size=0.33)
print(x_train.shape)
print(x_dev.shape)
print(x_test.shape)

(13412, 35)
(3851, 35)
(1897, 35)


# Scale numeric variables

In [54]:
sum(y_train)/len(y_train)

0.08962123471518044

In [55]:
ss = StandardScaler()
ss.fit(x_train[continuous])
x_train[continuous] = ss.transform(x_train[continuous])
x_dev[continuous] = ss.transform(x_dev[continuous])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [56]:
x_train.head()

Unnamed: 0,biomass,depth,mean_sst,max_sst,min_sst,ndvi,z_min_light,z_mixedl,floor_temp,viable,...,riprap,salt and brackish water marshes,salt marshes,scarps and steep slopes in sand,sheltered man-made structures,sheltered riprap,sheltered rocky shores,sheltered tidal flats,vegetated low riverine banks,wave cut rocky platforms
4572,-0.062927,0.50896,-0.843145,-0.798447,-0.822654,1.435505,-0.966721,0.370529,-0.662064,-0.270746,...,-0.308714,2.043619,-0.025913,-0.052596,-0.039601,-0.032325,-0.012212,-0.008635,-0.008635,-0.285069
2081,-0.062927,0.024351,-1.238055,-1.301573,-1.199421,1.435505,0.426524,0.552854,-0.349186,-0.270746,...,-0.308714,-0.489328,-0.025913,-0.052596,-0.039601,-0.032325,-0.012212,-0.008635,-0.008635,-0.285069
16503,-0.062927,0.490956,0.13052,0.084912,0.168887,-0.903879,-0.966721,-0.806286,0.40734,-0.270746,...,3.239246,-0.489328,-0.025913,-0.052596,-0.039601,-0.032325,-0.012212,-0.008635,-0.008635,-0.285069
12552,-0.062927,0.50896,-0.480707,-0.385685,-0.509696,-0.867663,-0.966721,-1.103616,-2.200783,-0.270746,...,-0.308714,2.043619,-0.025913,-0.052596,-0.039601,-0.032325,-0.012212,-0.008635,-0.008635,-0.285069
11109,-0.062927,0.507459,-0.657368,-0.574356,-0.67276,-0.867663,-0.966721,-1.103616,-2.200783,-0.270746,...,-0.308714,2.043619,-0.025913,-0.052596,-0.039601,-0.032325,-0.012212,-0.008635,-0.008635,-0.285069


# Logistic Regression model

In [57]:
lr = LogisticRegression()
lr.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [58]:
lr.score(x_dev,y_dev)

0.908595170085692

In [59]:
y_pred = lr.predict(x_dev)
#confusion_matrix(y_dev,y_pred)
cfnmatrix = pd.DataFrame(
            confusion_matrix(y_dev, y_pred), 
            columns=['Predicted=0', 'Predicted=1'], 
            index=['Actual=0', 'Actual=1']
)
cfnmatrix

Unnamed: 0,Predicted=0,Predicted=1
Actual=0,3461,13
Actual=1,339,38
