In [3]:
import boto3
import numpy as np
import pandas as pd
import pylab
import sagemaker
import keras
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_auc_score
from sagemaker import get_execution_role
from sklearn.model_selection import train_test_split
from os.path import join as opj
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Dense, Dropout, Input, Flatten, Activation
from keras.layers import GlobalMaxPooling2D
from keras.layers.normalization import BatchNormalization
from keras.layers.merge import Concatenate
from keras.models import Model
from keras import initializers
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint,Callback,EarlyStopping,ReduceLROnPlateau

Using TensorFlow backend.





In [4]:
role = get_execution_role()
bucket='sagemaker-eu-north-1-845746890132'
data_key1='train.json'
data_location1='s3://{}/{}'.format(bucket,data_key1)
pd.read_json(data_location1)


Unnamed: 0,id,band_1,band_2,inc_angle,is_iceberg
0,dfd5f913,"[-27.878360999999998, -27.15416, -28.668615, -...","[-27.154118, -29.537888, -31.0306, -32.190483,...",43.9239,0
1,e25388fd,"[-12.242375, -14.920304999999999, -14.920363, ...","[-31.506321, -27.984554, -26.645678, -23.76760...",38.1562,0
2,58b2aaa0,"[-24.603676, -24.603714, -24.871029, -23.15277...","[-24.870956, -24.092632, -20.653963, -19.41104...",45.2859,1
3,4cfc3a18,"[-22.454607, -23.082819, -23.998013, -23.99805...","[-27.889421, -27.519794, -27.165262, -29.10350...",43.8306,0
4,271f93f4,"[-26.006956, -23.164886, -23.164886, -26.89116...","[-27.206915, -30.259186, -30.259186, -23.16495...",35.6256,0
...,...,...,...,...,...
1599,04e11240,"[-30.999878, -29.976866, -28.233906, -29.50732...","[-27.847719, -28.233864, -24.712077999999998, ...",na,0
1600,c7d6f6f8,"[-25.31155, -26.511555, -28.694487, -27.180115...","[-29.563713, -28.290375, -26.839405, -28.29046...",na,0
1601,bba1a0f1,"[-18.141895, -18.141844, -19.01737, -19.701599...","[-25.305355, -29.387701, -28.963863, -26.16023...",na,0
1602,7f66bb44,"[-22.455633, -25.794661, -26.954567, -22.83354...","[-26.070356, -22.093737, -21.577662, -24.53376...",na,0


In [5]:
#Load data
plt.rcParams['figure.figsize'] = 10, 10
train = pd.read_json(data_location1)
#train.inc_angle = train.inc_angle.replace('na',0)
train.inc_angle = train.inc_angle.replace('na',np.nan)
train.inc_angle = train.inc_angle.astype(float).fillna(0.0)
print('done!')

done!


In [6]:
#Generate the training data
X_band_1=np.array([np.array(band).astype(np.float32).reshape(75, 75) for band in train["band_1"]])
X_band_2=np.array([np.array(band).astype(np.float32).reshape(75, 75) for band in train["band_2"]])
X_train = np.concatenate([X_band_1[:, :, :, np.newaxis], X_band_2[:, :, :, np.newaxis],((X_band_1+X_band_2)/2)[:, :, :, np.newaxis]], axis=-1)
X_angle_train=np.array(train.inc_angle)
#y_train=np.array(train["is_iceberg"]
X_train.shape

#define the model
def getModel():
    gmodel=Sequential()
    
    #Conv Layer 1
    gmodel.add(Conv2D(64, kernel_size=(3, 3),activation='relu', input_shape=(75, 75, 3)))
    gmodel.add(MaxPooling2D(pool_size=(3, 3), strides=(2, 2)))
    gmodel.add(Dropout(0.2))
    
    #Conv Layer 2
    gmodel.add(Conv2D(128, kernel_size=(3, 3), activation='relu' ))
    gmodel.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
    gmodel.add(Dropout(0.2))
    
    #Conv Layer 3
    gmodel.add(Conv2D(128, kernel_size=(3, 3), activation='relu'))
    gmodel.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
    gmodel.add(Dropout(0.2))
    
    #Conv Layer 4
    gmodel.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
    gmodel.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
    gmodel.add(Dropout(0.2))
    
    #Flatten the data for upcoming dense layers
    gmodel.add(Flatten())
    
    #Dense Layers
    gmodel.add(Dense(512))
    gmodel.add(Activation('relu'))
    gmodel.add(Dropout(0.2))
    
    #Dense Layer 2
    gmodel.add(Dense(256))
    gmodel.add(Activation('relu'))
    gmodel.add(Dropout(0.2))
    
    #Sigmoid Layer
    gmodel.add(Dense(1))
    gmodel.add(Activation('sigmoid'))

    mypotim=Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8, decay=0.0)
   # mypotim=Adam(lr=0.001,decay=0.0)
    gmodel.compile(loss='binary_crossentropy',
                  optimizer=mypotim,
                  metrics=['accuracy'])
    gmodel.summary()
    return gmodel

In [7]:
#get ready to train the model    
def get_callbacks(filepath, patience=3):
    es = EarlyStopping('val_loss', patience=patience, mode="min")
    msave = ModelCheckpoint(filepath, save_best_only=True)
    reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss',factor= 0.1,patience=7,verbose=1,epsilon=1e-4, mode ='min')
    return [es, msave,reduce_lr_loss]

file_path = ".model_weights.hdf5"
callbacks = get_callbacks(filepath=file_path, patience=10)

   
y_train=np.array(train['is_iceberg'])

##X_train_cv, X_valid, y_train_cv, y_valid = train_test_split(X_train, y_train, random_state=1, train_size=0.9) 
#X_train_cv, X_test, y_train_cv, y_test = train_test_split(X_train, y_train, random_state=1, train_size=0.9)

indices = range (len(y_train))
X_train_cv, X_test, y_train_cv, y_test,indices_train,indices_test = train_test_split(X_train, y_train, indices,random_state=1, train_size=0.9)

print(indices_test)

    
#Without denoising, core features
import os
gmodel=getModel()
gmodel.fit(X_train_cv, y_train_cv,
         batch_size=32,
         epochs=50,
         verbose=1,
          #validation_data=(X_valid, y_valid),
validation_split=0.25,
         callbacks=callbacks)

gmodel.load_weights(file_path)
score = gmodel.evaluate(X_test, y_test, verbose=1)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

[571, 654, 525, 823, 659, 1148, 1398, 389, 111, 1102, 640, 1017, 1323, 1302, 851, 1024, 403, 181, 1427, 536, 986, 688, 1365, 1114, 351, 1038, 353, 761, 553, 301, 236, 275, 1505, 177, 613, 596, 190, 158, 8, 421, 1463, 107, 1086, 801, 1322, 442, 1373, 679, 1420, 1194, 264, 987, 1503, 285, 692, 60, 561, 75, 693, 268, 936, 304, 372, 1506, 1521, 924, 1388, 929, 976, 977, 992, 1338, 647, 480, 108, 65, 1329, 1523, 1582, 1315, 1279, 292, 597, 443, 223, 1245, 962, 53, 697, 1363, 937, 529, 1036, 1053, 1488, 948, 772, 966, 1213, 1566, 1066, 774, 48, 997, 1319, 1432, 1135, 1169, 495, 921, 91, 1316, 1357, 1466, 1436, 194, 598, 1535, 231, 30, 1009, 841, 918, 703, 547, 432, 631, 635, 1307, 614, 37, 258, 629, 1085, 825, 422, 241, 426, 1335, 201, 303, 858, 1119, 202, 724, 80, 1115, 1553, 101, 368, 1165, 959, 608, 1328, 1240, 102, 267, 447, 1371, 1108, 1057]









Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 73, 73, 64)        1792      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 36, 36, 64)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 36, 36, 64)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 34, 34, 128)       73856     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 17, 17, 128)       0         
____________________________________________________________

In [8]:
print(score)

[0.31038373807858305, 0.8757763975155279]


In [9]:
#this is the one with indices

[0 0 1 0 1 0 0 1 1 1 0 0 0 1 0 0 1 0 1 1 0 0 0 1 0 0 1 1 1 0 1 1 0 1 1 0 0
 1 0 0 0 0 0 1 0 0 1 1 1 1 0 1 0 1 1 1 0 1 0 1 1 0 0 1 0 1 0 0 0 1 1 0 1 1
 1 0 0 0 0 1 1 1 1 1 1 0 1 1 0 1 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 1 0 1 1
 0 1 0 0 0 0 0 1 0 1 1 0 1 1 0 0 1 0 1 0 1 0 1 1 1 1 1 1 1 0 0 0 1 0 0 0 0
 0 1 0 0 0 0 0 1 1 1 1 0 0]

SyntaxError: invalid syntax (<ipython-input-9-47f240e6c8d8>, line 3)

In [15]:
ids = ([571, 654, 525, 823, 659, 1148, 1398, 389, 111, 1102, 640, 1017, 1323, 1302, 851, 1024, 403, 181, 1427, 536, 986, 688, 1365, 1114, 351, 1038, 353, 761, 553, 301, 236, 275, 1505, 177, 613, 596, 190, 158, 8, 421, 1463, 107, 1086, 801, 1322, 442, 1373, 679, 1420, 1194, 264, 987, 1503, 285, 692, 60, 561, 75, 693, 268, 936, 304, 372, 1506, 1521, 924, 1388, 929, 976, 977, 992, 1338, 647, 480, 108, 65, 1329, 1523, 1582, 1315, 1279, 292, 597, 443, 223, 1245, 962, 53, 697, 1363, 937, 529, 1036, 1053, 1488, 948, 772, 966, 1213, 1566, 1066, 774, 48, 997, 1319, 1432, 1135, 1169, 495, 921, 91, 1316, 1357, 1466, 1436, 194, 598, 1535, 231, 30, 1009, 841, 918, 703, 547, 432, 631, 635, 1307, 614, 37, 258, 629, 1085, 825, 422, 241, 426, 1335, 201, 303, 858, 1119, 202, 724, 80, 1115, 1553, 101, 368, 1165, 959, 608, 1328, 1240, 102, 267, 447, 1371, 1108, 1057])
len(ids)

161

In [11]:
pred_y_proba = gmodel.predict(X_test)
pred_y = list(map(lambda x : 0 if x < 0.5 else 1, pred_y_proba))

In [12]:
#accuracy_test = accuracy_score(y_test,pred_y)
f1 = f1_score(y_test,pred_y)#,average='samples')
con_mat = confusion_matrix(y_test,pred_y)
roc_auc = roc_auc_score(y_test,pred_y_proba)
accuracy = accuracy_score(y_test, pred_y)

#print("accuracy_test is:", accuracy_test)
print("f1 score:", f1)
print("confusion matrix:\n", con_mat)
print("roc_auc_score:", roc_auc)
print("accuracy:", accuracy)


f1 score: 0.8648648648648648
confusion matrix:
 [[77  9]
 [11 64]]
roc_auc_score: 0.9519379844961241
accuracy: 0.8757763975155279


In [13]:
from sklearn.metrics import roc_curve, auc
fpr,tpr,_ = roc_curve(y_test, pred_y_proba)
print(auc(fpr,tpr))


0.9519379844961241


In [27]:
#print(X_test)
list(filter(lambda a: not(a[2] == a[3]), list(zip(ids, pred_y_proba, pred_y, y_test))))
#pred_y_proba = gmodel.predict(X_test)
#pred_y = list(map(lambda x : 0 if x < 0.5 else 1, pred_y_proba))

[(986, array([0.5981706], dtype=float32), 1, 0),
 (688, array([0.5083423], dtype=float32), 1, 0),
 (353, array([0.47281003], dtype=float32), 0, 1),
 (158, array([5.120039e-05], dtype=float32), 0, 1),
 (679, array([0.11338177], dtype=float32), 0, 1),
 (987, array([0.3136894], dtype=float32), 0, 1),
 (561, array([0.85772836], dtype=float32), 1, 0),
 (304, array([0.63711745], dtype=float32), 1, 0),
 (977, array([0.48064205], dtype=float32), 0, 1),
 (1329, array([0.66631013], dtype=float32), 1, 0),
 (1523, array([0.60454106], dtype=float32), 1, 0),
 (1582, array([0.8245748], dtype=float32), 1, 0),
 (1279, array([0.265836], dtype=float32), 0, 1),
 (292, array([0.17510316], dtype=float32), 0, 1),
 (1245, array([0.6194291], dtype=float32), 1, 0),
 (962, array([0.4807846], dtype=float32), 0, 1),
 (48, array([0.7864267], dtype=float32), 1, 0),
 (547, array([0.44715014], dtype=float32), 0, 1),
 (426, array([0.34863183], dtype=float32), 0, 1),
 (102, array([0.1450397], dtype=float32), 0, 1)]

In [29]:
list(filter(lambda a: a[2] == a[3], list(zip(ids, pred_y_proba, pred_y, y_test))))

[(571, array([0.], dtype=float32), 0, 0),
 (654, array([0.], dtype=float32), 0, 0),
 (525, array([0.99464226], dtype=float32), 1, 1),
 (823, array([0.], dtype=float32), 0, 0),
 (659, array([0.68386734], dtype=float32), 1, 1),
 (1148, array([0.1663521], dtype=float32), 0, 0),
 (1398, array([0.00015089], dtype=float32), 0, 0),
 (389, array([0.9967981], dtype=float32), 1, 1),
 (111, array([0.8010453], dtype=float32), 1, 1),
 (1102, array([0.54860985], dtype=float32), 1, 1),
 (640, array([0.06976607], dtype=float32), 0, 0),
 (1017, array([0.], dtype=float32), 0, 0),
 (1323, array([0.00821564], dtype=float32), 0, 0),
 (1302, array([0.96749073], dtype=float32), 1, 1),
 (851, array([0.], dtype=float32), 0, 0),
 (1024, array([0.], dtype=float32), 0, 0),
 (403, array([0.9850304], dtype=float32), 1, 1),
 (181, array([0.], dtype=float32), 0, 0),
 (1427, array([0.9834562], dtype=float32), 1, 1),
 (536, array([0.8695849], dtype=float32), 1, 1),
 (1365, array([0.], dtype=float32), 0, 0),
 (1114, arr