# Modeling kelp site viability

In [82]:
import pandas as pd
import os, os.path
import urllib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn import tree
import numpy as np

%matplotlib inline

In [4]:
dir_name = "../data"
pickles = '../data/pickles/'
gcs_bucket = "https://storage.googleapis.com/w210data/"


#if not os.path.isdir(dir_name):
#    os.mkdir(dir_name)
    
#def download_file(file_name):
#    full_path = os.path.join(dir_name, file_name)
#    if not os.path.exists(full_path):
#        urllib.request.urlretrieve (gcs_bucket+file_name, full_path)

#download_file(pickles+'grid_df_tenth.pkl')

# Read data

In [5]:
grid_df = pd.read_pickle(pickles+'grid_df_hundredth.pkl')
#keep polygon_id as identifier
polygon_id = grid_df.polygon_id
#extract only features relevant to predicting kelp viability
grid_df = grid_df[['biomass','depth','mean_sst','max_sst','min_sst','ndvi',\
                   'z_min_light','z_mixedl','floor_temp','viable','shoretype','shoretype2','aerial_kelp']]
#clean up categorical variables
grid_df.shoretype = grid_df.shoretype.apply(lambda x: x[0][1])
grid_df.shoretype2 = grid_df.shoretype2.apply(lambda x: x[0][1])
grid_df.aerial_kelp = grid_df.aerial_kelp.apply(lambda x: x[0][1])
print(grid_df.shape)
grid_df.head()

(19160, 13)


Unnamed: 0,biomass,depth,mean_sst,max_sst,min_sst,ndvi,z_min_light,z_mixedl,floor_temp,viable,shoretype,shoretype2,aerial_kelp
0,0.0,-38.4,11.929242,14.013989,9.125995,0.008434,34.0,17.955626,8.156,False,Rocky Shores,exposed rocky cliffs,False
1,0.0,-34.8,11.929242,14.013989,9.125995,0.008434,31.0,17.955626,8.156,False,Rocky Shores,exposed rocky cliffs,False
2,0.0,-31.0,11.929242,14.013989,9.125995,0.008434,31.0,17.955626,8.156,True,Rocky Shores,exposed rocky cliffs,False
3,0.0,-31.0,11.929242,14.013989,9.125995,0.008434,28.0,17.955626,8.156,False,Rocky Shores,exposed rocky cliffs,False
4,0.0,-30.4,11.929242,14.013989,9.125995,0.008434,28.0,17.955626,8.156,False,Rocky Shores,exposed rocky cliffs,False


# Create dummy variables for categorical variables

In [6]:
#create dummy variables for the categorical variables
shoretype = pd.get_dummies(grid_df['shoretype'])
shoretype2 = pd.get_dummies(grid_df['shoretype2'])

#drop categorical columns; will add dummy columns subsequently
grid_df.drop(['shoretype','shoretype2'], axis=1, inplace=True)

#add dummy variables for shoretype and shoretype2
grid_df = grid_df.join(shoretype).join(shoretype2)
print(grid_df.shape)
grid_df.head()

(19160, 36)


Unnamed: 0,biomass,depth,mean_sst,max_sst,min_sst,ndvi,z_min_light,z_mixedl,floor_temp,viable,...,riprap,salt and brackish water marshes,salt marshes,scarps and steep slopes in sand,sheltered man-made structures,sheltered riprap,sheltered rocky shores,sheltered tidal flats,vegetated low riverine banks,wave cut rocky platforms
0,0.0,-38.4,11.929242,14.013989,9.125995,0.008434,34.0,17.955626,8.156,False,...,0,0,0,0,0,0,0,0,0,0
1,0.0,-34.8,11.929242,14.013989,9.125995,0.008434,31.0,17.955626,8.156,False,...,0,0,0,0,0,0,0,0,0,0
2,0.0,-31.0,11.929242,14.013989,9.125995,0.008434,31.0,17.955626,8.156,True,...,0,0,0,0,0,0,0,0,0,0
3,0.0,-31.0,11.929242,14.013989,9.125995,0.008434,28.0,17.955626,8.156,False,...,0,0,0,0,0,0,0,0,0,0
4,0.0,-30.4,11.929242,14.013989,9.125995,0.008434,28.0,17.955626,8.156,False,...,0,0,0,0,0,0,0,0,0,0


# split data into train/dev/test sets

In [7]:
#separate features from output classes
target = 'aerial_kelp'
predictors = [c for c in grid_df.columns if not c == target]

discrete = ['shoretype', 'shoretype2']
continuous = [i for i in grid_df.columns if not i in discrete and i != target]

y = grid_df[target]
x = grid_df[predictors]

#create train/dev/test sets
x_train, x_dev, y_train, y_dev = train_test_split(x,y, test_size=0.3, shuffle=True)
x_dev, x_test, y_dev, y_test = train_test_split(x_dev,y_dev, test_size=0.33)
print(x_train.shape)
print(x_dev.shape)
print(x_test.shape)

(13412, 35)
(3851, 35)
(1897, 35)


# Scale numeric variables

In [8]:
sum(y_train)/len(y_train)

0.09103787652848196

In [9]:
ss = StandardScaler()
ss.fit(x_train[continuous])
x_train[continuous] = ss.transform(x_train[continuous])
x_dev[continuous] = ss.transform(x_dev[continuous])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [10]:
x_train.head()

Unnamed: 0,biomass,depth,mean_sst,max_sst,min_sst,ndvi,z_min_light,z_mixedl,floor_temp,viable,...,riprap,salt and brackish water marshes,salt marshes,scarps and steep slopes in sand,sheltered man-made structures,sheltered riprap,sheltered rocky shores,sheltered tidal flats,vegetated low riverine banks,wave cut rocky platforms
7767,-0.059979,0.296761,-0.034193,-0.171032,-0.120723,-0.555341,-0.24169,1.627017,0.58725,-0.273433,...,-0.310593,-0.483703,-0.025913,-0.051879,-0.036659,-0.03456,-0.012212,-0.008635,-0.008635,-0.283383
11549,-0.059979,0.505809,-1.160038,-1.086785,-1.076197,-0.128612,-0.966755,0.79099,-2.204622,-0.273433,...,-0.310593,2.067386,-0.025913,-0.051879,-0.036659,-0.03456,-0.012212,-0.008635,-0.008635,-0.283383
7653,-0.059979,0.251316,-0.016551,-0.144908,-0.083115,-0.555341,0.056866,1.627017,0.58725,-0.273433,...,3.219643,-0.483703,-0.025913,-0.051879,-0.036659,-0.03456,-0.012212,-0.008635,-0.008635,-0.283383
16237,-0.059979,0.289187,1.500632,1.352801,1.652007,-0.532466,0.696628,0.473502,1.481566,3.657203,...,-0.310593,-0.483703,-0.025913,-0.051879,-0.036659,-0.03456,-0.012212,-0.008635,-0.008635,-0.283383
17489,-0.059979,0.49672,0.551945,0.56984,0.499326,-0.893005,-0.966755,-0.840599,0.448212,-0.273433,...,-0.310593,-0.483703,-0.025913,-0.051879,-0.036659,-0.03456,-0.012212,-0.008635,-0.008635,-0.283383


# Logistic Regression model

In [80]:
lr = LogisticRegressionCV(Cs=[0.01,0.1,0.5,1,2,5,10,20],class_weight='balanced', cv=5)
lr.fit(x_train, y_train)

LogisticRegressionCV(Cs=[0.01, 0.1, 0.5, 1, 2, 5, 10, 20],
           class_weight='balanced', cv=5, dual=False, fit_intercept=True,
           intercept_scaling=1.0, max_iter=100, multi_class='ovr',
           n_jobs=1, penalty='l2', random_state=None, refit=True,
           scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

In [81]:
print(lr.score(x_dev,y_dev))
#print(metrics.accuracy_score(x_dev, y_dev))

0.688911970916645


In [46]:
y_pred = lr.predict(x_dev)
#confusion_matrix(y_dev,y_pred)
cfnmatrix = pd.DataFrame(
            confusion_matrix(y_dev, y_pred), 
            columns=['Predicted=0', 'Predicted=1'], 
            index=['Actual=0', 'Actual=1']
)
print(cfnmatrix)

print("accuracy is {}".format(metrics.accuracy_score(y_dev, y_pred)))
print("f1 score is {}".format(metrics.f1_score(y_dev, y_pred)))
print("precisions is {}".format(metrics.precision_score(y_dev, y_pred)))
print("recall score is {}".format(metrics.recall_score(y_dev,y_pred)))

          Predicted=0  Predicted=1
Actual=0         2366         1130
Actual=1           65          290
accuracy is 0.6896909893534147
f1 score is 0.3267605633802817
precisions is 0.20422535211267606
recall score is 0.8169014084507042


In [47]:
#accuracy
(2366+290)/(2366+290+1130+65)
#recall
(290)/(290+65)
#precision
(290)/(290+1130)
#some other thing
(2366)/(2366+1130)

0.6896909893534147

In [75]:
# try gridsearchcv
#parameters = {'Cs':[0.01,0.1,0.5,1,5], 'cv':[1,3,5.8,10]}
parameters = {'C':[0.01,0.1,0.5,1,5]}
lr = LogisticRegression(penalty='l1')
gs = GridSearchCV(lr,parameters,cv=5)
gs.fit(x_train,y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.01, 0.1, 0.5, 1, 5]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring=None, verbose=0)

In [74]:
print(gs.best_score_)
gs.cv_results_['mean_test_score']

0.6893080823143454


array([0.65195347, 0.68543096, 0.68930808, 0.68856248, 0.68856248])

# try a decision tree