# Modeling kelp site viability

In [1]:
import pandas as pd
import os, os.path
import urllib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
import numpy as np

%matplotlib inline

In [2]:
dir_name = "../data"
pickles = '../data/pickles/'
gcs_bucket = "https://storage.googleapis.com/w210data/"


#if not os.path.isdir(dir_name):
#    os.mkdir(dir_name)
    
#def download_file(file_name):
#    full_path = os.path.join(dir_name, file_name)
#    if not os.path.exists(full_path):
#        urllib.request.urlretrieve (gcs_bucket+file_name, full_path)

#download_file(pickles+'grid_df_tenth.pkl')

# Read data

In [3]:
grid_df = pd.read_pickle(pickles+'grid_df_hundredth.pkl')
#keep polygon_id as identifier
polygon_id = grid_df.polygon_id
#extract only features relevant to predicting kelp viability
grid_df = grid_df[['biomass','depth','mean_sst','max_sst','min_sst','ndvi',\
                   'z_min_light','z_mixedl','floor_temp','viable','shoretype','shoretype2','aerial_kelp']]
#clean up categorical variables
grid_df.shoretype = grid_df.shoretype.apply(lambda x: x[0][1])
grid_df.shoretype2 = grid_df.shoretype2.apply(lambda x: x[0][1])
grid_df.aerial_kelp = grid_df.aerial_kelp.apply(lambda x: x[0][1])
print(grid_df.shape)
grid_df.head()

(19160, 13)


Unnamed: 0,biomass,depth,mean_sst,max_sst,min_sst,ndvi,z_min_light,z_mixedl,floor_temp,viable,shoretype,shoretype2,aerial_kelp
0,0.0,-38.4,11.929242,14.013989,9.125995,-0.003885,34.0,17.955626,8.156,False,Rocky Shores,exposed rocky cliffs,False
1,0.0,-34.8,11.929242,14.013989,9.125995,-0.003885,31.0,17.955626,8.156,False,Rocky Shores,exposed rocky cliffs,False
2,0.0,-31.0,11.929242,14.013989,9.125995,-0.003885,31.0,17.955626,8.156,True,Rocky Shores,exposed rocky cliffs,False
3,0.0,-31.0,11.929242,14.013989,9.125995,-0.003885,28.0,17.955626,8.156,False,Rocky Shores,exposed rocky cliffs,False
4,0.0,-30.4,11.929242,14.013989,9.125995,-0.003885,28.0,17.955626,8.156,False,Rocky Shores,exposed rocky cliffs,False


# Create dummy variables for categorical variables

In [4]:
#create dummy variables for the categorical variables
shoretype = pd.get_dummies(grid_df['shoretype'])
shoretype2 = pd.get_dummies(grid_df['shoretype2'])

#drop categorical columns; will add dummy columns subsequently
grid_df.drop(['shoretype','shoretype2'], axis=1, inplace=True)

#add dummy variables for shoretype and shoretype2
grid_df = grid_df.join(shoretype).join(shoretype2)
print(grid_df.shape)
grid_df.head()

(19160, 36)


Unnamed: 0,biomass,depth,mean_sst,max_sst,min_sst,ndvi,z_min_light,z_mixedl,floor_temp,viable,...,riprap,salt and brackish water marshes,salt marshes,scarps and steep slopes in sand,sheltered man-made structures,sheltered riprap,sheltered rocky shores,sheltered tidal flats,vegetated low riverine banks,wave cut rocky platforms
0,0.0,-38.4,11.929242,14.013989,9.125995,-0.003885,34.0,17.955626,8.156,False,...,0,0,0,0,0,0,0,0,0,0
1,0.0,-34.8,11.929242,14.013989,9.125995,-0.003885,31.0,17.955626,8.156,False,...,0,0,0,0,0,0,0,0,0,0
2,0.0,-31.0,11.929242,14.013989,9.125995,-0.003885,31.0,17.955626,8.156,True,...,0,0,0,0,0,0,0,0,0,0
3,0.0,-31.0,11.929242,14.013989,9.125995,-0.003885,28.0,17.955626,8.156,False,...,0,0,0,0,0,0,0,0,0,0
4,0.0,-30.4,11.929242,14.013989,9.125995,-0.003885,28.0,17.955626,8.156,False,...,0,0,0,0,0,0,0,0,0,0


# split data into train/dev/test sets

In [5]:
#separate features from output classes
target = 'aerial_kelp'
predictors = [c for c in grid_df.columns if not c == target]

discrete = ['shoretype', 'shoretype2']
continuous = [i for i in grid_df.columns if not i in discrete and i != target]

y = grid_df[target]
x = grid_df[predictors]

#create train/dev/test sets
x_train, x_dev, y_train, y_dev = train_test_split(x,y, test_size=0.3, shuffle=True)
x_dev, x_test, y_dev, y_test = train_test_split(x_dev,y_dev, test_size=0.33)
print(x_train.shape)
print(x_dev.shape)
print(x_test.shape)

(13412, 35)
(3851, 35)
(1897, 35)


# Scale numeric variables

In [6]:
sum(y_train)/len(y_train)

0.09081419624217119

In [7]:
ss = StandardScaler()
ss.fit(x_train[continuous])
x_train[continuous] = ss.transform(x_train[continuous])
x_dev[continuous] = ss.transform(x_dev[continuous])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [8]:
x_train.head()

Unnamed: 0,biomass,depth,mean_sst,max_sst,min_sst,ndvi,z_min_light,z_mixedl,floor_temp,viable,...,riprap,salt and brackish water marshes,salt marshes,scarps and steep slopes in sand,sheltered man-made structures,sheltered riprap,sheltered rocky shores,sheltered tidal flats,vegetated low riverine banks,wave cut rocky platforms
6624,-0.063564,-1.353369,-0.169381,-0.314347,-0.057676,-0.73291,0.901598,0.94569,0.336457,-0.271063,...,-0.31304,-0.484641,-0.02443,-0.052596,-0.040534,-0.031148,-0.008635,-0.008635,0.0,-0.28629
15935,-0.063564,-0.449906,1.630215,1.733939,1.854256,-0.286502,3.735721,1.061559,0.619335,-0.271063,...,-0.31304,-0.484641,-0.02443,-0.052596,-0.040534,-0.031148,-0.008635,-0.008635,0.0,3.492963
8666,-0.063564,0.188184,0.247348,0.177244,0.066274,-0.459951,-0.02901,1.006945,0.744181,-0.271063,...,-0.31304,-0.484641,-0.02443,-0.052596,-0.040534,-0.031148,-0.008635,-0.008635,0.0,-0.28629
7400,-0.063564,0.136004,0.029663,0.068483,0.109947,-0.720481,-0.02901,1.763063,0.494974,-0.271063,...,-0.31304,-0.484641,-0.02443,-0.052596,-0.040534,-0.031148,-0.008635,-0.008635,0.0,-0.28629
14821,-0.063564,0.49083,-0.177685,-0.171513,-0.100342,2.354121,-0.959617,-1.121143,0.235167,-0.271063,...,-0.31304,2.063382,-0.02443,-0.052596,-0.040534,-0.031148,-0.008635,-0.008635,0.0,-0.28629


# Logistic Regression model

In [9]:
lr = LogisticRegressionCV(Cs=[0.01,0.1,0.5,1,2,5,10,20],class_weight='balanced', cv=5)
lr.fit(x_train, y_train)

LogisticRegressionCV(Cs=[0.01, 0.1, 0.5, 1, 2, 5, 10, 20],
           class_weight='balanced', cv=5, dual=False, fit_intercept=True,
           intercept_scaling=1.0, max_iter=100, multi_class='ovr',
           n_jobs=1, penalty='l2', random_state=None, refit=True,
           scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

In [10]:
print(lr.score(x_dev,y_dev))
#print(metrics.accuracy_score(x_dev, y_dev))

0.7021552843417295


In [11]:
y_pred = lr.predict(x_dev)
#confusion_matrix(y_dev,y_pred)
cfnmatrix = pd.DataFrame(
            confusion_matrix(y_dev, y_pred), 
            columns=['Predicted=0', 'Predicted=1'], 
            index=['Actual=0', 'Actual=1']
)
print(cfnmatrix)

print("accuracy is {}".format(metrics.accuracy_score(y_dev, y_pred)))
print("f1 score is {}".format(metrics.f1_score(y_dev, y_pred)))
print("precisions is {}".format(metrics.precision_score(y_dev, y_pred)))
print("recall score is {}".format(metrics.recall_score(y_dev,y_pred)))

          Predicted=0  Predicted=1
Actual=0         2434         1065
Actual=1           82          270
accuracy is 0.7021552843417295
f1 score is 0.32009484291641965
precisions is 0.20224719101123595
recall score is 0.7670454545454546


In [47]:
#accuracy
(2366+290)/(2366+290+1130+65)
#recall
(290)/(290+65)
#precision
(290)/(290+1130)
#some other thing
(2366)/(2366+1130)

0.6896909893534147

In [12]:
# try gridsearchcv
#parameters = {'Cs':[0.01,0.1,0.5,1,5], 'cv':[1,3,5.8,10]}
parameters = {'C':[0.01,0.1,0.5,1,5]}
lr = LogisticRegression(penalty='l1')
gs = GridSearchCV(lr,parameters,cv=5)
gs.fit(x_train,y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.01, 0.1, 0.5, 1, 5]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring=None, verbose=0)

In [13]:
print(gs.best_score_)
gs.cv_results_['mean_test_score']

0.9129883686251118


array([0.91052789, 0.91164629, 0.91298837, 0.91291381, 0.91298837])

In [None]:
y_pred = lr.predict(x_dev)
#confusion_matrix(y_dev,y_pred)
cfnmatrix = pd.DataFrame(
            confusion_matrix(y_dev, y_pred), 
            columns=['Predicted=0', 'Predicted=1'], 
            index=['Actual=0', 'Actual=1']
)
print(cfnmatrix)

print("accuracy is {}".format(metrics.accuracy_score(y_dev, y_pred)))
print("f1 score is {}".format(metrics.f1_score(y_dev, y_pred)))
print("precisions is {}".format(metrics.precision_score(y_dev, y_pred)))
print("recall score is {}".format(metrics.recall_score(y_dev,y_pred)))

# try a decision tree

In [14]:
dt = tree.DecisionTreeClassifier(max_depth=None, class_weight=None)
dt.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [15]:
#benchmark score = 0.7878787878
y_pred = dt.predict(x_dev)
print(dt.max_features_)
print(dt.n_outputs_)
feats = np.argsort(-dt.feature_importances_)
print(dt.feature_importances_[feats])
print(x_train.columns[feats][:5])
cfnmatrix = pd.DataFrame(
            confusion_matrix(y_dev, y_pred), 
            columns=['Predicted=0', 'Predicted=1'], 
            index=['Actual=0', 'Actual=1']
)
print(cfnmatrix)
print("accuracy is {}".format(metrics.accuracy_score(y_dev, y_pred)))
print("f1 score is {}".format(metrics.f1_score(y_dev, y_pred)))
print("precisions is {}".format(metrics.precision_score(y_dev, y_pred)))
print("recall score is {}".format(metrics.recall_score(y_dev,y_pred)))

35
1
[2.81247546e-01 1.15844650e-01 1.12098248e-01 9.23263338e-02
 9.06472515e-02 5.78371902e-02 5.65035449e-02 4.66854734e-02
 3.70937366e-02 2.00111277e-02 1.52422203e-02 9.67057054e-03
 9.08662113e-03 9.08192572e-03 9.07849926e-03 7.64163042e-03
 6.88048331e-03 6.19865322e-03 4.71932932e-03 4.00376657e-03
 3.41797860e-03 2.27199333e-03 1.32456646e-03 8.21075505e-04
 2.65583817e-04 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00]
Index(['depth', 'biomass', 'z_min_light', 'ndvi', 'floor_temp'], dtype='object')
          Predicted=0  Predicted=1
Actual=0         3339          160
Actual=1          173          179
accuracy is 0.9135289535185666
f1 score is 0.5180897250361794
precisions is 0.528023598820059
recall score is 0.5085227272727273


# try random forest

In [22]:
dt = RandomForestClassifier(max_depth=None, class_weight=None, n_estimators=1000)
dt.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [23]:
y_pred = dt.predict(x_dev)
print(dt.n_outputs_)
feats = np.argsort(-dt.feature_importances_)
print(dt.feature_importances_[feats])
print(x_train.columns[feats][:5])
cfnmatrix = pd.DataFrame(
            confusion_matrix(y_dev, y_pred), 
            columns=['Predicted=0', 'Predicted=1'], 
            index=['Actual=0', 'Actual=1']
)
print(cfnmatrix)
print("accuracy is {}".format(metrics.accuracy_score(y_dev, y_pred)))
print("f1 score is {}".format(metrics.f1_score(y_dev, y_pred)))
print("precisions is {}".format(metrics.precision_score(y_dev, y_pred)))
print("recall score is {}".format(metrics.recall_score(y_dev,y_pred)))

1
[2.30986935e-01 1.34628668e-01 9.09981122e-02 8.98658050e-02
 7.33504540e-02 7.05494736e-02 6.94471177e-02 6.77814582e-02
 6.70967295e-02 1.16707338e-02 9.83267941e-03 9.32261703e-03
 9.25511270e-03 9.14065180e-03 8.41725591e-03 7.81418696e-03
 6.83757854e-03 5.74916765e-03 5.06352534e-03 4.85597748e-03
 4.03312211e-03 3.80254041e-03 3.25229600e-03 3.02199959e-03
 1.15492319e-03 7.72644978e-04 3.93136790e-04 2.88269941e-04
 2.39037475e-04 2.27532370e-04 1.16004492e-04 2.68575337e-05
 4.17676039e-06 3.21843110e-06 0.00000000e+00]
Index(['depth', 'z_min_light', 'ndvi', 'floor_temp', 'biomass'], dtype='object')
          Predicted=0  Predicted=1
Actual=0         3402           97
Actual=1          187          165
accuracy is 0.9262529213191378
f1 score is 0.5374592833876222
precisions is 0.6297709923664122
recall score is 0.46875
