Error Analysis on the Baseline Model.

# **Setup**

In [None]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell runs, so edits to local .py files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import configparser
import os.path
from os import path

# Defaults used when no credentials file is found. Every name that later cells
# read (including ENV) is defined here so those cells never hit a NameError.
WANDB_enable = False
ENV = ""
creds_path_ar = ["../credentials.ini","credentials.colab.ini"]
root_path = ""
data_path = ""

# Use the first credentials file that exists; its [MAIN] section supplies the
# project/data paths, the W&B switch and the environment name.
for creds_path in creds_path_ar:
  if not path.exists(creds_path):
    continue
  config_parser = configparser.ConfigParser()
  config_parser.read(creds_path)
  main_section = config_parser['MAIN']
  root_path = main_section["PATH_ROOT"]
  data_path = main_section["PATH_DATA"]
  # Only the exact string 'TRUE' enables Weights & Biases logging.
  WANDB_enable = main_section["WANDB_ENABLE"] == 'TRUE'
  ENV = main_section["ENV"]
  break


In [None]:
# On Colab only: install the tensorflow-determinism package into the runtime.
# NOTE(review): the version is not pinned - consider pinning it for reproducibility.
if ENV=="COLAB":
  !pip install tensorflow-determinism


In [None]:
# On Colab only: mount Google Drive at mount_path.
# NOTE(review): presumably PATH_ROOT / PATH_DATA point below this mount - confirm.
if ENV=="COLAB":
  from google.colab import drive
  mount_path = '/content/gdrive/'
  drive.mount(mount_path)

In [None]:
cd {root_path}

In [None]:
import os
import sys
import random
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from importlib import reload  #use 'reload' to reload module manually if it was changed

from tensorflow.keras import backend as K
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPooling2D, Dropout
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import AUC
from sklearn.metrics import roc_auc_score, roc_curve, auc
from matplotlib.colors import LinearSegmentedColormap
from termcolor import colored

from src.utils import experiment_utils as utils


# Set seed for reproducibility of results
seed_value = 0
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here likely only affects child processes - confirm.
os.environ['PYTHONHASHSEED']=str(seed_value)
os.environ['TF_CUDNN_DETERMINISTIC'] = '1'

# Start from an empty default graph before seeding and creating the session.
tf.compat.v1.reset_default_graph()

# Seed every random source used in this notebook: Python, NumPy and TensorFlow
# (both the TF2 and the compat.v1 entry points).
random.seed(seed_value)
np.random.seed(seed_value)
tf.random.set_seed(seed_value)
tf.compat.v1.set_random_seed(seed_value)

# Configure a new global `tensorflow` session
# Both thread pools are limited to a single thread.
session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
tf.compat.v1.keras.backend.set_session(sess)

# Last expression of the cell: displays the installed TensorFlow version.
tf.__version__


In [None]:

# Optional Weights & Biases setup, run only when enabled in the credentials file.
# NOTE(review): the WANDB_LOGIN value is interpolated into a shell command line;
# treat it like a secret and avoid leaving it in saved cell output.
# NOTE(review): `--upgrade` without a pinned version makes runs non-reproducible.
if WANDB_enable:
  !pip install --upgrade wandb
  !wandb login {config_parser['MAIN']["WANDB_LOGIN"]}
  import wandb
  from wandb.keras import WandbCallback


In [None]:
# Set and test path to competition data files

# Fall back to ./data under the current working directory when no data path was
# configured. Compare by value: `is ''` tests object identity, is unreliable for
# strings and raises a SyntaxWarning on Python 3.8+.
if not data_path:
  data_path = os.path.join(os.getcwd(),"data")

file_path = 'MAFAT RADAR Challenge - Training Set V1.csv'
try:
  # Opening the file and reading a single byte is enough to prove that it
  # exists and is readable; there is no need to load the whole CSV.
  with open(f'{data_path}/{file_path}', 'rb') as f:
    f.read(1)
  print(colored('Everything is setup correctly', color='green'))
except OSError as err:
  # Only file-system errors mean "bad path"; any other exception is a real bug
  # and is allowed to propagate instead of being swallowed.
  print(colored('Please mount drive and set data_path correctly',
                color='red'))
  print(f'  ({err})')

In [None]:
# Building the model
def create_model(input_shape, init):
  """
  Build the (uncompiled) baseline CNN classifier.

  Two Conv2D + MaxPooling2D stages are flattened and fed through two ReLU
  dense layers into a single sigmoid output unit.

  Arguments:
    input_shape -- shape of one input sample, given to the first Conv2D layer
    init -- kernel initializer used by every Conv2D and Dense layer

  Returns:
    a Keras Sequential model
  """
  # Keyword sets shared by the convolutional and the hidden dense layers.
  conv_kwargs = dict(kernel_size=(3, 3), activation='relu',
                     kernel_initializer=init, bias_regularizer='l2')
  dense_kwargs = dict(kernel_regularizer='l2', activation='relu',
                      kernel_initializer=init)

  # Layers are created in network order and added to the model in that order.
  stack = [
      Conv2D(16, input_shape=input_shape, **conv_kwargs),
      MaxPooling2D(pool_size=(2, 2)),
      Conv2D(32, **conv_kwargs),
      MaxPooling2D(pool_size=(2, 2)),
      Flatten(),
      Dense(128, **dense_kwargs),
      Dense(32, **dense_kwargs),
      Dense(1, activation='sigmoid', kernel_initializer=init),
  ]
  return Sequential(stack)

In [None]:
# Loading and preparing the data

# Loading Auxiliary Experiment set - can take a few minutes
# The name is passed without a file extension; utils.load_data resolves it
# inside `folder` (NOTE(review): exact file layout is defined in utils - confirm).
experiment_auxiliary = 'MAFAT RADAR Challenge - Auxiliary Experiment Set V2'
experiment_auxiliary_df = utils.load_data(experiment_auxiliary, folder=data_path )
# Displayed as the cell output: shape of the 'date_index' field.
experiment_auxiliary_df['date_index'].shape

In [None]:
# Taking sample from the Auxiliary Experiment set
# NOTE(review): the sampling rule lives in utils.aux_split - confirm how segments are chosen.
train_aux = utils.aux_split(experiment_auxiliary_df)
# Displayed as the cell output: shape of the sampled 'date_index' field.
train_aux['date_index'].shape

In [None]:
# Training set
train_path = 'MAFAT RADAR Challenge - Training Set V1'
training_df = utils.load_data(train_path, folder=data_path)
print(f"real dataset({training_df['date_index'].shape})", end='')

# Merge the sampled auxiliary-experiment segments into the training set
train_df = utils.append_dict(training_df, train_aux)
print(f" + aux dataset({train_aux['date_index'].shape}) = full train({train_df['date_index'].shape})")

# Preprocess, then split into training and validation parts
train_df = utils.data_preprocess(train_df.copy())
train_x, train_y, val_x, val_y, is_validation_ar = utils.split_train_val(train_df)

# Metadata table: a copy of train_df without the two burst fields, plus a
# flag marking which rows went to the validation split
train_df_t = train_df.copy()
for burst_key in ('doppler_burst', 'iq_sweep_burst'):
  del train_df_t[burst_key]
train_dff = pd.DataFrame(train_df_t)
train_dff['is_validation'] = is_validation_ar

# Integer labels, and a trailing size-1 axis appended to the inputs
val_y, train_y = val_y.astype(int), train_y.astype(int)
train_x = train_x.reshape(tuple(train_x.shape) + (1,))
val_x = val_x.reshape(tuple(val_x.shape) + (1,))

print(f"train only:{train_x.shape[0]}).  val only:{val_x.shape[0]}")


In [None]:
# Preview the first 20 rows of the per-segment metadata table.
train_dff.head(20)
# Optional export of the metadata table (disabled):
#train_dff.to_csv("train_dff.csv", sep='\t')

In [None]:
# Public test set - loading and preprocessing
test_path = 'MAFAT RADAR Challenge - Public Test Set V1'
test_df = utils.data_preprocess(utils.load_data(test_path, folder=data_path).copy())

# Model input: the 'iq_sweep_burst' field with a trailing size-1 axis appended
test_x = test_df['iq_sweep_burst']
test_x = test_x.reshape(tuple(test_x.shape) + (1,))

In [None]:
# Model configuration:
batch_size = 16
# Size of one input sample; combined with a single channel in input_shape below.
img_width, img_height = 126, 32
loss_function = BinaryCrossentropy()
no_epochs = 10
optimizer = Adam(learning_rate = 0.001)
input_shape = (img_width, img_height, 1)

# Seeded weight initializer, shared by all layers built in create_model.
init = tf.keras.initializers.GlorotNormal(seed = 0)

# Creating and running the model
model = create_model(input_shape, init)  
# AUC is tracked next to accuracy during training.
model.compile(loss=loss_function, optimizer=optimizer, metrics=[AUC(), 'accuracy'])


**Model Architecture**   
   
![](https://drive.google.com/uc?export=view&id=1wsJBHbghEPGT0s1QQG6BHl7MS3Yo0o4i)

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [None]:
# Model fit


# Keras callbacks; the Weights & Biases callback is added only when W&B is enabled.
callbacks = []
if WANDB_enable:
  wandb.init(project="sota-mafat",name="first")
  callbacks.append(WandbCallback())

# `history` keeps the object returned by fit; validation runs on (val_x, val_y).
history = model.fit(train_x, train_y, batch_size = batch_size, epochs = no_epochs, 
                    validation_data = (val_x, val_y), callbacks=callbacks)

#### **Results**
Submissions are evaluated on the area under the Receiver Operating Characteristic Curve ([ROC AUC](https://en.wikipedia.org/wiki/Receiver_operating_characteristic))   
on the predicted probabilities, as calculated by [roc_auc_score in scikit-learn (v 0.23.1)](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html).


In [None]:
# Plot ROC curve and show ROC-AUC results of the training and validation sets. 
# Predictions and labels are passed as parallel lists: [train, validation].
pred = [model.predict(train_x), model.predict(val_x)]
actual = [train_y, val_y]
utils.stats(pred, actual)

## **Final Submission File**

Create a CSV submission file, zip it, and download it.

In [None]:
# Creating DataFrame with the probability prediction for each segment
submission = pd.DataFrame({'segment_id': test_df['segment_id']})
# Single-column model output flattened to one float value per segment
submission['prediction'] = model.predict(test_x).ravel().astype('float')

# Save submission
submission.to_csv('submission.csv', index=False)

In [None]:
# Zip the submission file and, when running on Colab, download it
from zipfile import ZipFile

with ZipFile('submission.zip', 'w') as myzip:
  myzip.write('submission.csv')

# google.colab only exists on Colab. Everywhere else the archive has already
# been written to the working directory, so skip the browser download instead
# of failing the cell with an ImportError.
try:
  from google.colab import files
except ImportError:
  print("Not running on Colab: submission.zip was written to the current directory")
else:
  files.download('submission.zip')


# Results Investigation

In [None]:
from sklearn.metrics import confusion_matrix

# Decision threshold applied to the sigmoid output.
threshold = 0.5

# Binarise the predictions with the shared threshold (previously 0.5 was
# hard-coded and `threshold` only appeared in the title). The model ends in a
# single-unit Dense layer, so the (N, 1) output is flattened to (N,).
val_pred_labels = (model.predict(val_x).flatten() > threshold) * 1

# labels=[0, 1] pins the row/column order of the matrix so that it always
# matches cm_plot_labels, even if one class is missing from the predictions.
cm = confusion_matrix(y_true=val_y, y_pred=val_pred_labels, labels=[0, 1])

# target_type encoding: 0 = animal, 1 = human - labels listed in that order.
# NOTE(review): assumes utils.plot_confusion_matrix applies `classes` in matrix index order - confirm.
cm_plot_labels = ['animal','human']

utils.plot_confusion_matrix(cm=cm, classes=cm_plot_labels, title=f"Confusion Matrix threshold={threshold}")

In [None]:
# Hard 0/1 validation predictions, using the same `threshold` as the confusion
# matrix cell instead of a second hard-coded 0.5.
pred_val = (model.predict(val_x).flatten()>threshold)*1
# Index labels (in train_dff) of the rows that belong to the validation split.
df_idx = list(train_dff[train_dff.is_validation].index)
# One row per validation sample: true label, predicted label, and its row in train_dff.
results_val = pd.DataFrame({'true':val_y, 'pred':pred_val,'df_idx':df_idx})


`target_type` – the object identified in the segment: ‘human’ (1) or ‘animal’ (0).

In [None]:
# true=0, pred=0 => GOOD: animals identified correctly as animals

print("GOOD: animals identified correctly as animals")

num_results =10

fig, axarr = plt.subplots(1, num_results,figsize=(2*num_results, 6*num_results))

# Validation rows that are animals (0) and were predicted as animals (0).
# Selected once, outside the loop: the filter does not depend on the panel index.
sample = results_val[(results_val.true==0) & (results_val.pred==0)]

# zip() stops at the shorter input, so having fewer than num_results matching
# rows no longer raises an IndexError.
for ax1, ind in zip(axarr, sample['df_idx'].iloc[:num_results]):
  utils.plot_spectrogram(
      train_df['iq_sweep_burst'][ind],
      train_df['doppler_burst'][ind],
      color_map_path='./data/cmap.npy',
      ax=ax1
      )

# Blank any panels that were left without a segment.
for unused_ax in axarr[len(sample):]:
  unused_ax.axis('off')

fig.tight_layout()




In [None]:
# true=1, pred=0 => WRONG: humans mis-identified as animals

print("WRONG: humans mis-identified as animals")

fig, axarr = plt.subplots(1, num_results,figsize=(2*num_results, 6*num_results))
#fig.suptitle("WRONG: humans mis-identified as animals", fontsize=16)

# Validation rows that are humans (1) but were predicted as animals (0).
# Selected once, outside the loop: the filter does not depend on the panel index.
sample = results_val[(results_val.true==1) & (results_val.pred==0)]

# zip() stops at the shorter input, so having fewer than num_results
# misclassified rows no longer raises an IndexError.
# NOTE(review): this cell reads the colour map from '/content/cmap.npy' while
# other cells use './data/cmap.npy' - confirm which location is intended.
for ax1, ind in zip(axarr, sample['df_idx'].iloc[:num_results]):
  utils.plot_spectrogram(
      train_df['iq_sweep_burst'][ind],
      train_df['doppler_burst'][ind],
      color_map_path='/content/cmap.npy',
      ax=ax1
      )

# Blank any panels that were left without a segment.
for unused_ax in axarr[len(sample):]:
  unused_ax.axis('off')

fig.subplots_adjust(top=0.88)
fig.tight_layout()
plt.show()

In [None]:
# true=0, pred=1 => WRONG: animals mis-identified as humans

print("WRONG: animals mis-identified as humans")

fig, axarr = plt.subplots(1, num_results,figsize=(2*num_results, 6*num_results))
#fig.suptitle("WRONG: animals mis-identified as humans", fontsize=16)

# Validation rows that are animals (0) but were predicted as humans (1).
# Selected once, outside the loop: the filter does not depend on the panel index.
sample = results_val[(results_val.true==0) & (results_val.pred==1)]

# zip() stops at the shorter input, so having fewer than num_results
# misclassified rows no longer raises an IndexError.
# NOTE(review): this cell reads the colour map from '/content/cmap.npy' while
# other cells use './data/cmap.npy' - confirm which location is intended.
for ax1, ind in zip(axarr, sample['df_idx'].iloc[:num_results]):
  utils.plot_spectrogram(
      train_df['iq_sweep_burst'][ind],
      train_df['doppler_burst'][ind],
      color_map_path='/content/cmap.npy',
      ax=ax1
      )

# Blank any panels that were left without a segment.
for unused_ax in axarr[len(sample):]:
  unused_ax.axis('off')

fig.subplots_adjust(top=0.88)
fig.tight_layout()
plt.show()

In [None]:
# true=1, pred=1  => GOOD: humans identified correctly as humans.

print("GOOD: humans identified correctly as humans")

# NOTE(review): this rebinds the notebook-global num_results (10 in the earlier
# cells); re-running those cells afterwards would use 40.
rows_results = 4
num_results=40

fig, axarr = plt.subplots(rows_results,
                          int(num_results/rows_results),
                          figsize=( 20, 5*rows_results )
)
#fig.suptitle("GOOD: humans identified correctly as humans", fontsize=16)

# Validation rows that are humans (1) and were predicted as humans (1).
# Selected once, outside the loop: the filter does not depend on the panel index.
sample = results_val[(results_val.true==1) & (results_val.pred==1)]

# Column-major order reproduces the original fill pattern
# axarr[i % rows_results, i // rows_results]: each column is filled top to bottom.
panels = axarr.flatten(order='F')

# zip() stops at the shorter input, so having fewer than num_results matching
# rows no longer raises an IndexError.
# NOTE(review): this cell reads the colour map from '/content/cmap.npy' while
# other cells use './data/cmap.npy' - confirm which location is intended.
for ax1, ind in zip(panels, sample['df_idx'].iloc[:num_results]):
  utils.plot_spectrogram(
      train_df['iq_sweep_burst'][ind],
      train_df['doppler_burst'][ind],
      color_map_path='/content/cmap.npy',
      ax=ax1
      )

# Blank any panels that were left without a segment.
for unused_ax in panels[len(sample):]:
  unused_ax.axis('off')

fig.subplots_adjust(top=0.88)
fig.tight_layout()
plt.show()

## more error analysis

Checking how the validation errors are distributed across SNR type, target type, and track.

In [None]:
# Flag every validation row as classified correctly (True) or not (False);
# rows outside the validation split keep None.
train_dff['success'] = None
# One vectorised assignment instead of a row-by-row iterrows() loop:
# results_val.df_idx holds the train_dff index label of each validation sample.
train_dff.loc[results_val['df_idx'].to_numpy(), 'success'] = (
    results_val['true'] == results_val['pred']
).to_numpy()

In [None]:
# Validation rows whose prediction did not match the true label
is_val_row = train_dff.is_validation.eq(True)
is_wrong = train_dff.success.eq(False)
mistakes = train_dff[is_val_row & is_wrong]
mistakes.head()

In [None]:
# Number of misclassified validation rows per snr_type value.
mistakes.snr_type.value_counts()

In [None]:
# Number of misclassified validation rows per target_type value.
mistakes.target_type.value_counts()

In [None]:
# Number of misclassified validation rows per track_id, most frequent first.
mistakes.track_id.value_counts()

In [None]:
# Track to inspect in detail; the next cell plots this same track.
track_id_t = 11
# All metadata rows that belong to the chosen track.
train_dff[train_dff.track_id==track_id_t]

In [None]:
# Rows of the selected track, filtered once and reused below
track_rows = train_dff[train_dff.track_id==track_id_t]
first_row = track_rows.iloc[0]

segment0_in_track = first_row.segment_id
plt.figure(figsize=( 20,40))

# The class shown in the title comes from the first segment of the track (1 -> Human, 0 -> Animal)
animal0_human1 = first_row.target_type
class_str = "Human" if animal0_human1 else "Animal"

# Plot the whole track; val_overlay carries the per-segment success flags
success_flags = list(track_rows.success)
utils.spectrogram(train_df, segment_id=segment0_in_track, plot_track=True, snr_plot='both',
            color_map_path='./data/cmap.npy',title=f"track #{track_id_t}. All SNRs. Class={class_str}",
            val_overlay=success_flags )

## clustering

In [None]:
# Hard 0/1 labels for the training set: 0 where the model output is below 0.5,
# otherwise 1. np.where thresholds the whole array at once and keeps the shape
# of `holder`, replacing the per-element Python loop that filled an
# uninitialised np.empty buffer.
holder = model.predict(train_x)
train_preds = np.where(holder < 0.5, 0.0, 1.0)

In [None]:
# t-SNE of the training data only: an empty array is passed in place of the test set.
# NOTE(review): what exactly is embedded is defined in utils.make_tsne - confirm.
utils.make_tsne(model,train_x, train_y, train_preds, np.empty((0,0)))


In [None]:
# Same t-SNE call as above, this time with the public test inputs included.
utils.make_tsne(model,train_x, train_y, train_preds, test_x)


---

# Creating Inner test set (+ check results)


## Splitting the data


In [None]:
# Training set
train_path = 'MAFAT RADAR Challenge - Training Set V1'
training_df = utils.load_data(train_path, folder=data_path )
print(f"real dataset({training_df['date_index'].shape})",end='')

# Adding segments from the experiment auxiliary set to the training set
# (append_dict lives in the utils module; the bare name raised a NameError).
train_df = utils.append_dict(training_df, train_aux)
total_number_samples = train_df['date_index'].shape[0]
print(f" + aux dataset({train_aux['date_index'].shape}) = full train({total_number_samples})")

# Preprocessing and split the data to training and validation
# Here the held-out part ("valtest") is split further into validation and inner test.
train_df = utils.data_preprocess(train_df.copy())
train_x, train_y, valtest_x, valtest_y, valtest_ar = utils.split_train_val(train_df)

# Positions (within the full set) of the held-out samples, divided into two groups.
# NOTE(review): [2,1] presumably means a 2:1 validation:test ratio, and `mask`
# marks each held-out sample with 0 (validation) or 1 (test) - confirm in utils.splitArrayBy.
valtest_ar_idx = list(np.where(valtest_ar==True)[0])
val_idx,test_idx,mask = utils.splitArrayBy(valtest_ar_idx,[2,1])

# Boolean membership flags over the full set, one per sample.
is_validation_ar = np.array([False]*total_number_samples)
is_validation_ar[list(val_idx)]=True
is_test_ar = np.array([False]*total_number_samples)
is_test_ar[list(test_idx)]=True

# NOTE(review): from here on test_x / test_y are the inner test split; this
# rebinds the public-test `test_x` created earlier in the notebook.
val_y = valtest_y[mask==0]
val_x = valtest_x[mask==0,:]
test_y = valtest_y[mask==1]
test_x = valtest_x[mask==1,:]

# Metadata table: a copy of train_df without the two burst fields, plus split flags.
train_df_t = train_df.copy()
del train_df_t['doppler_burst']
del train_df_t['iq_sweep_burst']
train_dff = pd.DataFrame(train_df_t)
train_dff['is_validation']=is_validation_ar
train_dff['is_test']=is_test_ar

# Integer labels, and a trailing size-1 axis appended to the inputs.
val_y =  val_y.astype(int)
train_y =train_y.astype(int)
test_y =test_y.astype(int)
train_x = train_x.reshape(list(train_x.shape)+[1])
val_x = val_x.reshape(list(val_x.shape)+[1])
test_x = test_x.reshape(list(test_x.shape)+[1])

print(f"train:{train_x.shape[0]}")
print(f"val:{val_x.shape[0]}")
print(f"test:{test_x.shape[0]}")

In [None]:
# Display the metadata table, now including the is_validation / is_test flags.
train_dff

In [None]:
# Model configuration:
# Same settings as the first training run; the model is rebuilt from scratch
# so it can be trained on the new train/validation/inner-test split.
batch_size = 16
img_width, img_height = 126, 32
loss_function = BinaryCrossentropy()
no_epochs = 10
optimizer = Adam(learning_rate = 0.001)
input_shape = (img_width, img_height, 1)

# Seeded weight initializer, shared by all layers built in create_model.
init = tf.keras.initializers.GlorotNormal(seed = 0)

# Creating and running the model
model = create_model(input_shape, init)  
model.compile(loss=loss_function, optimizer=optimizer, metrics=[AUC(), 'accuracy'])


In [None]:
# Print the layer-by-layer summary of the freshly built model.
model.summary()

In [None]:
# Model fit

# Keras callbacks; the Weights & Biases callback is added only when W&B is enabled.
# NOTE(review): this run logs to project "mafat" while the first run used
# "sota-mafat", and both use the run name "first" - confirm this is intended.
callbacks = []
if WANDB_enable:
  wandb.init(project="mafat",name="first")
  callbacks.append(WandbCallback())

# Validation during training uses the validation split only; the inner test
# split is kept out of fit().
history = model.fit(train_x, train_y, batch_size = batch_size, epochs = no_epochs, 
                    validation_data = (val_x, val_y), callbacks=callbacks)

## Results

In [None]:
# Plot ROC curve and show ROC-AUC results of the training and validation sets. 
# Predictions and labels are passed as parallel lists: [train, validation].
pred = [model.predict(train_x), model.predict(val_x)]
actual = [train_y, val_y]
utils.stats(pred, actual)

In [None]:
# Same ROC / ROC-AUC report, comparing the training set with the inner test split.
# Predictions and labels are passed as parallel lists: [train, test].
pred = [model.predict(train_x), model.predict(test_x)]
actual = [train_y, test_y]
utils.stats(pred, actual, mode="Test")