## Notebook to prepare/see results of Cross-validation for open- and closed-cell St

In [1]:
%reload_ext autoreload
%autoreload 2
import os
import sys
import copy
import glob
import random
import numpy as np
import scipy as sc
import pandas as pd
import tensorflow as tf
from pyhdf.SD import SD, SDC 
from scipy import stats

# visualization
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
from matplotlib import patches as mpl_patches

In [45]:
import gc

In [151]:
#version1
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import svm
from sklearn import metrics
from sklearn.neural_network import MLPClassifier 
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

#version 2
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import ShuffleSplit

In [2]:
datadir = "/home/tkurihana/scratch-midway2/data/clouds"

In [11]:
class Patch:        
    def __init__(self, date, isOpen, thirtyFive, zeroTwo, label=None, feature=None, has_coord=False, coords=None):
        self.date = date
        self.isOpen = isOpen
        self.thirtyFive = thirtyFive
        self.zeroTwo = zeroTwo
        self.label = label
        self.feature = feature
        self.has_coord = has_coord
        self.coords = coords
        
    def print_attr(self):
        print("date: " + self.date)
        print("isOpen: " + str(self.isOpen))
        print("label: " + str(self.label))
        if len(self.coords) > 0:
            print("coords: ")
            for i in self.coords:
                print(str(i))

#### Download patches

- Closed cell

In [20]:
cdatadir = "/project2/foster/clouds/src_analysis/labeled_clouds/close_cells_mod02/"

In [83]:
cpatches_list = []
cfilelist= glob.glob(os.path.join(cdatadir, "*.npy"))
for filename in cfilelist:
    tmp = np.load(filename)
    for patch in tmp:
        #print( len(np.where(np.isnan(patch))[0]) )
        if len(np.where(np.isnan(patch))[0]) == 0:
            cpatches_list.append(patch)
cpatches = np.squeeze(np.concatenate(cpatches_list, axis=0), axis=(1) )

In [84]:
cpatches.shape

(120, 128, 128, 6)

In [85]:
del cpatches_list

- Open cell

In [28]:
odatadir = "/project2/foster/clouds/src_analysis/labeled_clouds/open_cells_mod02"

In [86]:
""" one file is only [8] dimension
"""
opatches_list = []
ofilelist = glob.glob(os.path.join(odatadir, "*.npy"))
for filename in ofilelist:
    a = np.load(filename, allow_pickle=True)
    if a.size > 100:
        for patch in a:
            if len(np.where(np.isnan(patch))[0]) == 0:
                opatches_list.append(patch)
opatches = np.squeeze(np.concatenate(opatches_list, axis=0), axis=(1) )

In [87]:
opatches.shape

(117, 128, 128, 6)

In [88]:
del opatches_list

In [152]:
gc.collect()

415

In [90]:
### Patches
patches = np.concatenate([cpatches, opatches], axis=0)

In [91]:
patches.shape

(237, 128, 128, 6)

##### Make label: 0; closed, 1; open cell

In [92]:
ctest = np.zeros((cpatches.shape[0]))
otest = np.ones((opatches.shape[0]))
label = np.concatenate([ctest, otest])

In [93]:
label.shape

(237,)

#### Practice: 1-time Valiadtion

In [94]:
X_train, X_test, y_train, y_test = train_test_split(patches, label, test_size=0.4, random_state=0)

In [101]:
X_train.shape, y_train.shape

((142, 128, 128, 6), (142,))

In [106]:
n,h,w,c = X_train.shape

In [107]:
clf = svm.SVC(kernel='linear', C=1).fit(X_train.reshape(n, h*w*c), y_train)

In [109]:
n,h,w,c = X_test.shape
clf.score(X_test.reshape(n, h*w*c), y_test)

0.9157894736842105

#### Practice: Cross-validation

In [112]:
clf = svm.SVC(kernel='linear', C=1)
n,h,w,c = patches.shape
scores = cross_val_score(clf, patches.reshape(n, h*w*c), label, cv=5)

In [113]:
scores

array([0.83333333, 0.9375    , 0.82978723, 0.85106383, 0.85106383])

with metric

In [115]:
scores = cross_val_score(clf, X, y, cv=5, scoring='f1_macro')

In [116]:
scores

array([0.96658312, 1.        , 0.96658312, 0.96658312, 1.        ])

#### Practice: Cross-validate function

In [118]:
scoring = ['precision_macro', 'recall_macro']
clf = svm.SVC(kernel='linear', C=1, random_state=0)
scores = cross_validate(clf, patches.reshape(n, h*w*c), label, cv=5, scoring=scoring)
sorted(scores.keys())
for ikey in ['fit_time', 'score_time', 'test_precision_macro', 'test_recall_macro']:
    print(ikey, scores[ikey])

fit_time [2.18492413 2.4025774  1.95280457 1.84609723 2.21522784]
score_time [0.89239097 1.02013183 0.76489997 0.78053904 0.86005235]
test_precision_macro [0.83333333 0.93826087 0.83181818 0.85740741 0.88709677]
test_recall_macro [0.83333333 0.9375     0.83061594 0.84963768 0.84782609]


#### Practice: Repeate Cross validation

In [125]:
ss = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
for train_index, test_index in ss.split(np.arange(0, patches.shape[0],1)):
    #print("%s %s" % (train_index, test_index))
    X_train = patches[train_index]
    Y_test  = patches[test_index]
    x_label = label[train_index]
    y_label = label[test_index]

### Use autoencoder and check performance

In [126]:
def load_latest_model(model_dir, mtype):
    #TODO add restart model dir and restart argument?
    latest = 0, None
    # get trained wegiht 
    for m in os.listdir(model_dir):
        if ".h5" in m and mtype in m:
            epoch = int(m.split("-")[1].replace(".h5", ""))
            latest = max(latest, (epoch, m))

    epoch, model_file = latest

    if not os.listdir(model_dir):
        raise NameError("no directory. check model path again")

    print(" Load {} at {} epoch".format(mtype, epoch))
    model_def = model_dir+'/'+mtype+'.json'
    model_weight = model_dir+'/'+mtype+'-'+str(epoch)+'.h5'
    with open(model_def, "r") as f:
        model = tf.keras.models.model_from_json(f.read())
    model.load_weights(model_weight)
    return model

#### RI autoencoder 

In [131]:
# 67011582; Best model
model_datadir = '/home/tkurihana/rotate_invariant/stepbystep/transform/output_model'
expname = 67011582
model_dir = os.path.join(model_datadir,str(expname) )
encoder = load_latest_model(model_dir, mtype='encoder')

 Load encoder at 100 epoch


In [148]:
# change patch size
height = width = 32
rpatches_tf = tf.image.resize_images(patches, (height, width))
rpatches_tf = tf.cast(rpatches_tf, tf.float64)
rpatches = tf.keras.backend.eval(rpatches_tf)

# standardization
nmin = np.amin(rpatches, axis=(0,1,2))
nmax = np.amax(rpatches, axis=(0,1,2))
rpatches = (rpatches - nmin)/(nmax - nmin)
rpatches.shape

(237, 32, 32, 6)

Linear

In [149]:
scoring = ['precision_macro', 'recall_macro']
clf = svm.SVC(kernel='linear', C=1, random_state=0)
encs = encoder.predict(rpatches)
n,h,w,c = encs.shape
scores = cross_validate(clf, encs.reshape(n, h*w*c), label, cv=5, scoring=scoring)
sorted(scores.keys())
for ikey in ['fit_time', 'score_time', 'test_precision_macro', 'test_recall_macro']:
    print(ikey, scores[ikey])

fit_time [0.02354741 0.02398419 0.02123642 0.01896548 0.02294421]
score_time [0.01252246 0.01094294 0.00980878 0.00849438 0.01057386]
test_precision_macro [0.81304348 0.94444444 0.83181818 0.78909091 0.92857143]
test_recall_macro [0.8125     0.9375     0.83061594 0.78804348 0.91304348]


RBF

In [150]:
scoring = ['precision_macro', 'recall_macro']
clf = svm.SVC(gamma=2, random_state=0)
encs = encoder.predict(rpatches)
n,h,w,c = encs.shape
scores = cross_validate(clf, encs.reshape(n, h*w*c), label, cv=5, scoring=scoring)
sorted(scores.keys())
for ikey in ['fit_time', 'score_time', 'test_precision_macro', 'test_recall_macro']:
    print(ikey, scores[ikey])

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


fit_time [0.0869801  0.09412098 0.09649682 0.0819664  0.08466935]
score_time [0.07198858 0.05639362 0.06329393 0.04183602 0.05602431]
test_precision_macro [0.25       0.24468085 0.25531915 0.25531915 0.25531915]
test_recall_macro [0.5        0.47916667 0.5        0.5        0.5       ]


MLP

In [158]:
scoring = ['precision_macro', 'recall_macro']
#clf =  GaussianProcessClassifier(1.0 * RBF(1.0))
clf =  MLPClassifier(alpha=1, max_iter=1000)
encs = encoder.predict(rpatches)
n,h,w,c = encs.shape
scores = cross_validate(clf, encs.reshape(n, h*w*c), label, cv=2, scoring=scoring)
sorted(scores.keys())
for ikey in ['fit_time', 'score_time', 'test_precision_macro', 'test_recall_macro']:
    print(ikey, scores[ikey])

fit_time [247.19460917 304.24902368]
score_time [0.17841578 0.12321925]
test_precision_macro [0.91360505 0.89105339]
test_recall_macro [0.90706215 0.88936782]


In [165]:
scoring = ['precision_macro', 'recall_macro']
#clf =  GaussianProcessClassifier(1.0 * RBF(1.0))
clf =  MLPClassifier(alpha=1, max_iter=1000)
encs = encoder.predict(rpatches)
n,h,w,c = encs.shape
scores = cross_validate(clf, encs.reshape(n, h*w*c), label, cv=5, scoring=scoring)
sorted(scores.keys())
for ikey in ['fit_time', 'score_time', 'test_precision_macro', 'test_recall_macro']:
    print(ikey, scores[ikey])



fit_time [276.62142754 353.07320738 346.02472138 275.18099546 289.90711761]
score_time [0.02053118 0.09346724 0.00628734 0.00721383 0.0796268 ]
test_precision_macro [0.87023593 0.96153846 0.85144928 0.79537037 0.91849817]
test_recall_macro [0.85416667 0.95833333 0.85144928 0.78894928 0.91394928]


RandomForest

In [161]:
scoring = ['precision_macro', 'recall_macro']
clf =  RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)
encs = encoder.predict(rpatches)
n,h,w,c = encs.shape
scores = cross_validate(clf, encs.reshape(n, h*w*c), label, cv=5, scoring=scoring)
sorted(scores.keys())
for ikey in ['fit_time', 'score_time', 'test_precision_macro', 'test_recall_macro']:
    print(ikey, scores[ikey])

fit_time [0.01461339 0.01359177 0.01640391 0.00810218 0.00860667]
score_time [0.00445437 0.00429702 0.0045228  0.0026834  0.00292873]
test_precision_macro [0.89652174 0.93826087 0.77819549 0.89454545 0.92857143]
test_recall_macro [0.89583333 0.9375     0.76811594 0.89311594 0.91304348]


AdaBoost

In [163]:
scoring = ['precision_macro', 'recall_macro']
clf =  AdaBoostClassifier()
encs = encoder.predict(rpatches)
n,h,w,c = encs.shape
scores = cross_validate(clf, encs.reshape(n, h*w*c), label, cv=5, scoring=scoring)
sorted(scores.keys())
for ikey in ['fit_time', 'score_time', 'test_precision_macro', 'test_recall_macro']:
    print(ikey, scores[ikey])

fit_time [1.92635083 2.44037914 2.39723921 2.35921383 2.44672012]
score_time [0.02276087 0.02757215 0.02309465 0.02349663 0.02296019]
test_precision_macro [0.91666667 0.94444444 0.90092593 0.89454545 0.9137931 ]
test_recall_macro [0.91666667 0.9375     0.89221014 0.89311594 0.89130435]


#### NRI autoencoder

In [136]:
# 'm2_02_global_2000_2018_band28_29_31'
model_datadir = '/home/tkurihana/rotate_invariant/stepbystep/transform/output_model'
expname = 'm2_02_global_2000_2018_band28_29_31'
model_dir = os.path.join(model_datadir,str(expname) )
nriencoder = load_latest_model(model_dir, mtype='encoder')

 Load encoder at 100000 epoch




In [137]:
scoring = ['precision_macro', 'recall_macro']
clf = svm.SVC(kernel='linear', C=1, random_state=0)
encs = nriencoder.predict(patches)
n,h,w,c = encs.shape
scores = cross_validate(clf, encs.reshape(n, h*w*c), label, cv=5, scoring=scoring)
sorted(scores.keys())
for ikey in ['fit_time', 'score_time', 'test_precision_macro', 'test_recall_macro']:
    print(ikey, scores[ikey])

fit_time [0.12828207 0.15790749 0.12065506 0.1190412  0.13650012]
score_time [0.05376101 0.07097888 0.05228615 0.05123711 0.0618856 ]
test_precision_macro [0.9021164  0.96153846 0.87545788 0.83971774 0.9137931 ]
test_recall_macro [0.89583333 0.95833333 0.87137681 0.80525362 0.89130435]


In [159]:
scoring = ['precision_macro', 'recall_macro']
#clf =  GaussianProcessClassifier(1.0 * RBF(1.0))
clf =  MLPClassifier(alpha=1, max_iter=1000)
encs = nriencoder.predict(patches)
n,h,w,c = encs.shape
scores = cross_validate(clf, encs.reshape(n, h*w*c), label, cv=2, scoring=scoring)
sorted(scores.keys())
for ikey in ['fit_time', 'score_time', 'test_precision_macro', 'test_recall_macro']:
    print(ikey, scores[ikey])



fit_time [ 10.70810747 766.75788379]
score_time [0.41641521 0.47076082]
test_precision_macro [0.92253521 0.91666667]
test_recall_macro [0.90677966 0.89655172]


In [166]:
scoring = ['precision_macro', 'recall_macro']
#clf =  GaussianProcessClassifier(1.0 * RBF(1.0))
clf =  MLPClassifier(alpha=1, max_iter=1000)
encs = nriencoder.predict(patches)
n,h,w,c = encs.shape
scores = cross_validate(clf, encs.reshape(n, h*w*c), label, cv=5, scoring=scoring)
sorted(scores.keys())
for ikey in ['fit_time', 'score_time', 'test_precision_macro', 'test_recall_macro']:
    print(ikey, scores[ikey])



fit_time [813.8486774  596.98655963  22.70966363 927.69674039  20.7441051 ]
score_time [0.01212454 0.01099968 0.022084   0.01797223 0.01471686]
test_precision_macro [0.92857143 0.98       0.92857143 0.85392157 0.89454545]
test_recall_macro [0.91666667 0.97916667 0.91304348 0.82699275 0.89311594]


In [162]:
scoring = ['precision_macro', 'recall_macro']
clf =  RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)
encs = nriencoder.predict(patches)
n,h,w,c = encs.shape
scores = cross_validate(clf, encs.reshape(n, h*w*c), label, cv=5, scoring=scoring)
sorted(scores.keys())
for ikey in ['fit_time', 'score_time', 'test_precision_macro', 'test_recall_macro']:
    print(ikey, scores[ikey])

fit_time [0.02940154 0.02922893 0.02957559 0.02761221 0.02761197]
score_time [0.0088613  0.00885057 0.00910378 0.00872827 0.0088532 ]
test_precision_macro [0.85555556 0.91958042 0.87545788 0.746337   0.8843985 ]
test_recall_macro [0.83333333 0.91666667 0.87137681 0.74365942 0.87047101]


In [164]:
scoring = ['precision_macro', 'recall_macro']
clf =  AdaBoostClassifier()
encs = nriencoder.predict(patches)
n,h,w,c = encs.shape
scores = cross_validate(clf, encs.reshape(n, h*w*c), label, cv=5, scoring=scoring)
sorted(scores.keys())
for ikey in ['fit_time', 'score_time', 'test_precision_macro', 'test_recall_macro']:
    print(ikey, scores[ikey])

fit_time [9.44690204 8.9894042  9.40780926 9.28659058 9.74264002]
score_time [0.02756834 0.03544092 0.0301137  0.03513455 0.03478169]
test_precision_macro [0.9        0.96153846 0.80887681 0.77037037 0.86877395]
test_recall_macro [0.875      0.95833333 0.80887681 0.76449275 0.84873188]
