In [None]:
#Evaluate Machine Learning Models and Function
#Disease Biophysics Group
#Written by John Zimmerman
#Updated 4/10/22

%matplotlib inline
import tensorflow as tf
import os
import numpy as np
import math 
import cv2
import pandas as pd
import h5py
from tqdm import *
from scipy.spatial import ConvexHull
from itertools import product
import scipy.stats

import GeoSwimmer
import SwimNN
from tensorflow.python.framework.ops import disable_eager_execution
from scipy.interpolate import UnivariateSpline

from matplotlib.pyplot import *
%matplotlib inline
from matplotlib.colors import ListedColormap, LinearSegmentedColormap

# Report the TensorFlow version in use so the run is traceable.
print(tf.__version__)
# Clear any Keras state left over from a previous run before models are built.
tf.compat.v2.keras.backend.clear_session()

In [None]:
def normcdf(x, mu, sigma):
    """Evaluate the cumulative distribution function of a normal distribution.

    Parameters
    ----------
    x : float
        Point at which the CDF is evaluated (scalar).
    mu : float
        Mean of the distribution.
    sigma : float
        Standard deviation of the distribution; must be non-zero, otherwise
        the division below raises ZeroDivisionError.

    Returns
    -------
    float
        The CDF value, capped at 1.0.
    """
    # Phi(z) = 0.5 * erfc(-z / sqrt(2)), computed with the standard-library
    # complementary error function so no external erfc helper is needed.
    t = x - mu
    y = 0.5 * math.erfc(-t / (sigma * math.sqrt(2.0)))
    # Guard against floating-point overshoot above 1.
    return min(y, 1.0)

def updateModelLabels(df,model,chunksize=4, dualloss = False):
    """Run ``model`` over ``df`` in chunks and collect one prediction per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to label. Must have a ``DNA`` column (its size sets the number
        of rows processed); each chunk of rows is passed to
        ``SwimNN.NN.OneHotEncodeList`` to build the model input.
    model : callable
        Called as ``model(encoded_chunk, training=False)``. The result (or its
        first element when ``dualloss`` is True) must expose ``.numpy()``.
    chunksize : int, default 4
        The row count divided by this value (rounded, minimum 1) is the second
        argument handed to ``SwimNN.SwimSearch.chunks``.
    dualloss : bool, default False
        When True the model is expected to return several outputs and only
        the first one is stored.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``df.DNA.size`` holding the model outputs in
        row order.
    """
    total = df.DNA.size
    # Rounding can give 0 for frames smaller than half a chunk; always ask the
    # chunker for at least 1 so it is never handed a zero split value.
    splits = max(1, int(np.round(total/chunksize)))
    # NOTE(review): assumes chunks() yields contiguous, ascending runs of the
    # positions it is given -- the first/last slicing below relies on that.
    breakList = list(SwimNN.SwimSearch.chunks(np.arange(total),splits))

    modelLabel = np.zeros(total)
    print('Updating Labels')
    for breaks in breakList:
        if len(breaks) == 0:
            # Nothing to label in an empty chunk (and breaks[-1] would fail).
            continue
        start, stop = breaks[0], breaks[-1] + 1
        # Report progress as the fraction of rows finished after this chunk.
        print(f'{(stop/total)*100:.2f}%...')
        # Positional slice: the frame's index labels need not start at 0.
        dfchunk = df.iloc[start:stop]
        OH_df = SwimNN.NN.OneHotEncodeList(dfchunk)
        output = model(OH_df,training=False)
        if dualloss:
            # Multi-output model: keep only the first output.
            output = output[0]
        modelLabel[start:stop] = output.numpy().reshape(-1)

    return modelLabel

In [None]:

# Load the pickled results table from disk.
# NOTE(review): unpickling can execute arbitrary code -- only load files
# from a trusted source.
filepath = 'G:\\...\\'
filename = 'results_updated.pkl'
df = pd.read_pickle(f'{filepath}{filename}')


In [None]:
# Rows with a non-missing SimNetU value form the labelled pool.
sim = df[df['SimNetU'].notna()]
# Randomly draw 1500 labelled rows for training; the remaining labelled rows
# (those whose index is not in the training draw) are held out for validation.
train = sim.sample(1500)
validate = sim.loc[~sim.index.isin(train.index)]

In [None]:
#Model validation on swimmer data set, and looking for top ranked swimmers for final iteration
#Update NN Model

# --- Run configuration ---
dnaLength = 6          # passed to the model constructor
lr=0.001               # optimizer learning rate
epsilon=1e-07          # optimizer epsilon
epochsperloop = 8000   # epochs for the single model.fit call below
updatechunks = 32      # chunksize used when relabelling the full table
number = 1200          # number of labelled rows drawn for training
loadweights = False    # set True to restore weights from filepath+'model.h5'


tf.keras.backend.clear_session()


model = SwimNN.NN.Hessian_DualLoss_refine(dnaLength)

# To auto-detect previously saved weights instead of the manual flag, use:
#loadweights = os.path.exists(filepath+'model.h5')

if not loadweights:
    print('No Model Weights found...proceeding with fresh weights')
else:
    print('Loading Model weights...')
    model.load_weights(filepath+'model.h5')
    print('Model Loaded')


# Split the labelled rows (non-missing SimNetU) into a random training draw
# and a validation remainder. Both frames get new columns below, so take
# explicit copies: assigning into a filtered slice triggers pandas'
# SettingWithCopyWarning.
sim = df[df['SimNetU'].notna()]
train = sim.sample(number).copy()
validate = sim[~sim.index.isin(train.index)].copy()
print(f'Train min Value: {train["SimNetU"].min()}')
print(f'Train Max Value: {train["SimNetU"].max()}')

#Estimate CDF of Training Samples
train_vels = np.array(train["SimNetU"].to_list()) #Convert to array

print('Fitting CDF to array')
# Both frames are mapped through the same normal CDF, parameterised by the
# training draw's mean and standard deviation.
cdf_mu = np.mean(train.SimNetU)
cdf_sigma = np.std(train.SimNetU)
train['CDF'] = scipy.stats.norm.cdf(train.SimNetU,cdf_mu,cdf_sigma)
validate['CDF'] = scipy.stats.norm.cdf(validate.SimNetU,cdf_mu,cdf_sigma)

#Generate training inputs for model
OH_train =tf.cast(SwimNN.NN.OneHotEncodeList(train),tf.float64)
train_label = tf.convert_to_tensor(train.CDF.to_numpy(),dtype=tf.float64)

#Prep Model
loss = tf.keras.losses.MeanAbsolutePercentageError()
# NOTE(review): 'accuracy' is a classification metric; with continuous CDF
# targets it is probably not informative -- confirm whether it is needed.
model.compile(optimizer=SwimNN.NN.NNOptimizer(lr=lr,epsilon=epsilon),
          loss=(loss,loss),
          metrics=['accuracy'])

#Fit Model to data - Two Labels for Multiloss
model.fit(OH_train, (train_label,train_label), epochs=epochsperloop)


# Store the trained model's predictions for every row of the full table and
# for the validation rows.
df['ModelLabel'] = updateModelLabels(df,model,chunksize=updatechunks, dualloss=True)
validate['ModelLabel'] = updateModelLabels(validate,model,chunksize=1, dualloss=True)

In [None]:
#finding middle of the road swimmers
count = 8   # number of rows to show on each side of the middle position

# Keep rows whose model label lies strictly between 0.49 and 0.51.
in_band = (df["ModelLabel"]>0.49) & (df["ModelLabel"]<0.51)
midsample = df[in_band]

sortedDF = midsample.sort_values("ModelLabel")
length = np.floor(len(sortedDF.DNA)/2)
print(length)


# Clamp the lower bound at 0: with fewer than 2*count rows in the band the
# start would go negative, and a negative slice start counts from the end of
# the frame, which would silently select the wrong window.
minloc = max(0, int(length-count))
maxloc = int(length+count)
sortedDF.iloc[minloc:maxloc]

In [None]:
#Graph energy landscape of the results
# Plot a random subset of at most 30000 rows; the cap at len(df) avoids the
# ValueError DataFrame.sample raises when asked for more rows than exist.
sample = df.sample(min(30000, len(df)))
fig = SwimNN.RadarPlot.PlotRadarMesh(sample.RadX,sample.RadY,sample.ModelLabel,figheight=10,figwidth=10)
# Mark the rows that have a SimNetU value as black points.
# NOTE(review): presumably drawn on the axes PlotRadarMesh just created -- confirm.
sim = df[~df['SimNetU'].isna()]
scatter(sim.RadX,sim.RadY,c="black",s=10,alpha=0.55)