##Viewing evaluation finetuning curves

This section graphs finetuning evaluation metrics against training steps using matplotlib.

###General config/authenticate GCS and mount drive if needed

In [1]:
import os
import numpy as np
import json
import matplotlib.pyplot as plt
import math
import re
from tqdm import tqdm
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
import shutil
import tensorflow.compat.v1 as tf

#@markdown For GCS paths, the name of the bucket:
BUCKET_NAME = "theodore_jiang" #@param {type:"string"}
BUCKET_PATH = "gs://{}".format(BUCKET_NAME)
#@markdown Folder source where evaluation result files have been stored (should point to the EVALUATIONS_DIR variable in the evaluation/prediction script) (can be either a GCS path or a drive path, depending on where the evaluation results were written):
SOURCE_PATH = "gs://theodore_jiang/MutFormer_updated_finetuning_eval_results" #@param {type:"string"}
## Working folder: SOURCE_PATH with every "gs://<bucket>/" occurrence removed.
## A SOURCE_PATH that does not contain that prefix is left unchanged.
DEST_PATH = SOURCE_PATH.replace("{}/".format(BUCKET_PATH), "")

if "gs://" not in SOURCE_PATH:
  ## not a GCS source: mount drive when the path points into it
  if "/content/drive" in SOURCE_PATH:
    from google.colab import drive
    print("Mount drive:")
    drive.mount('/content/drive', force_remount=True)
    DRIVE_PATH = "/content/drive/My Drive"
else:
  ## GCS source: authenticate this session
  from google.colab import auth
  print("Authenticate for GCS:")
  auth.authenticate_user()


TensorFlow 1.x selected.
Authenticate for GCS:


###Download and combine tfevent files into dictionaries

Downloads the tfevent files and converts their contents into dictionaries, which the plotting cell further below uses for graphing. To avoid tfevent file clutter (loading tfevent files is also both expensive and slow), this cell deletes the original tfevent files and writes a JSON dictionary (`compiled_data.json`) per run to take their place.

In [None]:
def tabulate_event(fpath):
  """Collect every scalar series stored in one tfevent file.

  Args:
    fpath: path passed to EventAccumulator.

  Returns:
    dict mapping each scalar tag to a list of (step, value) tuples, in the
    order EventAccumulator returns them. Tags without events get no entry.
  """
  ea = EventAccumulator(fpath).Reload()

  stuff = {}
  for tag in ea.Tags()['scalars']:
    for event in ea.Scalars(tag):
      ## setdefault creates the list on first use; the former bare
      ## try/except did the same but would also have hidden unrelated errors
      stuff.setdefault(tag, []).append((event.step,event.value))
  return stuff

if os.path.exists(DEST_PATH): ##before downloading, clear the destination
  shutil.rmtree(DEST_PATH)
os.makedirs(DEST_PATH)
if "gs://" in SOURCE_PATH:                ##download tfevent files into local system for processing
  cmd = "gsutil -m rsync -r "+SOURCE_PATH+" "+DEST_PATH
  !{cmd}
  cmd = "gsutil -m rm -r "+SOURCE_PATH
  !{cmd}
else:
  shutil.copytree(SOURCE_PATH,DEST_PATH)
  shutil.rmtree(SOURCE_PATH)
  os.makedirs(SOURCE_PATH)

graph_datas = {}
for run in os.listdir(DEST_PATH):          ##assumes each folder comtains multiple subfolders, with each folder denoting 
  runp = DEST_PATH+"/"+run                 ##a single run. Generates a different set of data to be graphed for each run.
  try:
    run_data = json.load(open(DEST_PATH+"/"+run+"/compiled_data.json"))
  except:
    run_data = {}
  for path,dirs,files in os.walk(runp):
    for file in files:
      subrun = path.replace(runp+"/","")
      if not subrun in run_data.keys():
        run_data[subrun] = {}
      filep = path+"/"+file
      if "tfevents" not in filep:
        continue
      metrics = tabulate_event(filep)
      for k,v in metrics.items():
        for event_data in v:
          try:
            if not event_data in run_data[subrun][k]:
              run_data[subrun][k].append(event_data)
          except:
            run_data[subrun][k] = [event_data]
  graph_datas[run] = run_data
  json.dump(run_data,tf.gfile.Open(SOURCE_PATH+"/"+run+"/compiled_data.json","w+")) ##upload a json to take the place of many tfevent files

###Plotting smoothed average curves using matplotlib

In [None]:
#@markdown Range of the local average for viewing training graphs (amount of steps to average into one datapoint) (to disable local averaging, set it to 0):
avg_range = 100 #@param {type:"integer"}
#@markdown Whether or not to save graphs into files:
save_figs = True #@param {type:"boolean"}
#@markdown * If saving figs, destination path for saving them (can be either a drive path or local path):
outfolder = "evaluation_graphs" #@param {type:"string"}

if "/content/drive" in outfolder: 
  from google.colab import drive
  print("Mount drive:")
  drive.mount('/content/drive', force_remount=True)
  DRIVE_PATH="/content/drive/My Drive"

import bisect  ## standard library; used only by the smoothing helper below

def _local_average(points, half_width):
  """Average each value with all values at most half_width steps away.

  Args:
    points: (step, value) pairs, already sorted by step.
    half_width: number of steps included on either side of each point.

  Returns:
    list holding, for every point, the mean of all values whose step lies in
    [step-half_width, step+half_width] (both ends included).
  """
  steps = [step for step,_ in points]
  values = [value for _,value in points]
  averaged = []
  for step in steps:
    ## the steps are sorted, so the window is one contiguous slice; bisect
    ## locates it without rescanning every point for every point
    first = bisect.bisect_left(steps, step-half_width)
    last = bisect.bisect_right(steps, step+half_width)
    window = values[first:last]
    averaged.append(sum(window)/len(window))
  return averaged

for run,run_data in graph_datas.items():
  print("\n\nGraphs for run:",run,"\n\n")
  ## regroup {subrun: {metric: data}} into {metric: {subrun: data}} so that
  ## every metric gets one figure with one curve per subrun
  graphs = {}
  for subrun,subrun_data in run_data.items():
    for metric,data in subrun_data.items():
      graphs.setdefault(metric, {})[subrun] = data

  for metric,metric_datas in graphs.items():
    plt.figure(figsize=(10,5))
    plt.title(metric+" graph")
    plt.xlabel("steps")
    plt.ylabel(metric)
    for subrun,data in metric_datas.items():
      points = []
      nan = 0
      for datapt in data:
        step = int(float(datapt[0]))
        value = float(datapt[1])
        if math.isnan(value):              ##nan values cannot be averaged or drawn
          nan+=1
        else:
          points.append((step,value))
      print("found and deleted",nan,"nan values in subrun:",subrun)
      points.sort(key=lambda pair: pair[0])

      steps = [step for step,_ in points]
      plt.plot(steps,_local_average(points, avg_range),label=subrun)

    plt.legend()
    if save_figs:
      figout_folder = outfolder+"/"+run
      os.makedirs(figout_folder, exist_ok=True)
      plt.savefig(figout_folder+"/"+metric.replace("/","_")+".png")
    plt.show()


###Tensorboard viewing (If you wish to use tensorboard instead)

In [None]:
## Folder whose logs TensorBoard should display.
LOGS_DIR = "/content/drive/My Drive" #@param {type:"string"}
## Surround the path with double quotes before it is passed to the magic below;
## presumably this keeps the space in "My Drive" from splitting the argument.
LOGS_DIR = "\""+LOGS_DIR+"\""
## Load the TensorBoard notebook extension, then start it on LOGS_DIR.
%load_ext tensorboard
%tensorboard --logdir $LOGS_DIR

##Predictions Processing

###General config/authenticate GCS and mount drive if needed

In [None]:
import os
import numpy as np
import json
import matplotlib.pyplot as plt
import math
import re
from tqdm import tqdm
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
import shutil
import tensorflow.compat.v1 as tf
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

#@markdown For GCS paths, the name of the bucket:
BUCKET_NAME = "theodore_jiang" #@param {type:"string"}
BUCKET_PATH = "gs://{}".format(BUCKET_NAME)
#@markdown Folder source where predictions have been stored (should point to the PREDICTIONS_DIR variable in the evaluation/prediction script) (can be either a GCS path or a drive path, depending on where the predictions were written):
SOURCE_PATH = "gs://theodore_jiang/MutFormer_updated_finetuning_predictions" #@param {type:"string"}
## Working folder: SOURCE_PATH with every "gs://<bucket>/" occurrence removed.
## A SOURCE_PATH that does not contain that prefix is left unchanged.
DEST_PATH = SOURCE_PATH.replace("{}/".format(BUCKET_PATH), "")

if "gs://" not in SOURCE_PATH:
  ## not a GCS source: mount drive when the path points into it
  if "/content/drive" in SOURCE_PATH:
    from google.colab import drive
    print("Mount drive:")
    drive.mount('/content/drive', force_remount=True)
    DRIVE_PATH = "/content/drive/My Drive"
else:
  ## GCS source: authenticate this session
  from google.colab import auth
  print("Authenticate for GCS:")
  auth.authenticate_user()


Authenticate for GCS:


###Transfer predictions(optional)

If desired, prediction results can be copied to a new path, and processed from there (useful for downloading files into drive from GCS).

In [None]:
#@markdown Where to write the predictions into (can be a drive path, in which case drive will be mounted if not mounted already)
DESTINATION_PATH = "/content/drive/My Drive/MutFormer_updated_finetuning_predictions" #@param{type:"string"}

if "/content/drive" in DEST_PATH: 
  from google.colab import drive
  print("Mount drive:")
  drive.mount('/content/drive', force_remount=True)
  DRIVE_PATH="/content/drive/My Drive"

def tabulate_event(fpath):
  stuff = {}
  
  ea = EventAccumulator(fpath).Reload()
  tags = ea.Tags()['scalars']

  for tag in tags:
    for event in ea.Scalars(tag):
      try:
          stuff[tag].append((event.step,event.value))
      except:
          stuff[tag] = [(event.step,event.value)]
  return stuff

if "gs://" in SOURCE_PATH:                ##download tfevent files into local system for processing
  cmd = "gsutil -m rsync -r "+SOURCE_PATH+" "+DESTINATION_PATH
  !{cmd}
elif "/content/drive" in SOURCE_PATH: 
  shutil.copytree(SOURCE_PATH,DESTINATION_PATH)

SOURCE_PATH = DESTINATION_PATH

###Download files into local system

In [None]:
if os.path.exists(DEST_PATH): ##before downloading, clear the local destination
  shutil.rmtree(DEST_PATH)
os.makedirs(DEST_PATH)
if "gs://" in SOURCE_PATH:                ##download tfevent files into local system for processing
  cmd = "gsutil -m rsync -r "+SOURCE_PATH+" "+DEST_PATH
  !{cmd}
else: 
  shutil.copytree(SOURCE_PATH,DEST_PATH)

###Convert tfevents into txts (if EVALUATE_WHILE_PREDICT was used)

If the EVALUATE_WHILE_PREDICT option was used during prediction, the predictions will have been written in the form of tfevent files. This cell converts those tfevent files into txt files (there is no need to run it if EVALUATE_WHILE_PREDICT was not used during prediction).

In [None]:
def tabulate_event(fpath):
  """Collect every scalar series stored in one tfevent file, keeping event order.

  Args:
    fpath: path passed to EventAccumulator.

  Returns:
    dict mapping each scalar tag to a list of (step, n, value) tuples, where n
    is the position of the event within its tag as returned by
    EventAccumulator. Tags without events get no entry.
  """
  ea = EventAccumulator(fpath).Reload()

  stuff = {}
  for tag in ea.Tags()['scalars']:
    for n,event in enumerate(ea.Scalars(tag)):
      ## setdefault creates the list on first use; the former bare
      ## try/except did the same but would also have hidden unrelated errors
      stuff.setdefault(tag, []).append((event.step,n,event.value))
  return stuff


for run in os.listdir(DEST_PATH):          ##assumes each folder contains multiple subfolders, with each folder denoting
  runp = DEST_PATH+"/"+run                 ##a single run. Writes one predictions.txt per subrun.
  run_data = {}                            ##{subrun: {tag: [(step, n, value), ...]}}
  seen = {}                                ##(subrun, tag) -> set of the tuples already stored
  for path,dirs,files in os.walk(runp):
    subrun = path.replace(runp+"/","")
    for file in files:
      filep = path+"/"+file
      if "tfevents" not in filep:
        continue
      subrun_data = run_data.setdefault(subrun, {})
      for tag,v in tabulate_event(filep).items():
        tag = re.sub(r"_\d+$","",tag)      ##tags differing only by a trailing _<number> are merged
        stored = subrun_data.setdefault(tag, [])
        known = seen.setdefault((subrun,tag), set())
        for event_data in v:
          if event_data not in known:      ##identical (step, n, value) entries are stored once
            known.add(event_data)
            stored.append(event_data)

  for subrun,subrun_data in run_data.items():
    predp = DEST_PATH+"/"+run+"/"+subrun+"/predictions.txt"
    lines = []
    for tag,data in subrun_data.items():
      ## order by position within the tag first, then by step
      sorted_data = sorted(data,key=lambda x:(x[1],x[0]))
      for d,dp in enumerate(sorted_data):
        field = tag+":"+str(dp[2])
        if d < len(lines):                 ##line d already exists: add this tag as another column
          lines[d] += "\t"+field
        else:
          lines.append(field)
    if len(lines)>0:
      with open(predp,"w+") as pred_file:
        ## end the file with a newline: the ROC cell discards whatever follows
        ## the last newline, which used to be the final prediction
        pred_file.write("\n".join(lines)+"\n")

###Plot ROC Curves using txts

In [None]:
#@markdown Whether or not to save ROC curves into files:
save_figs = True #@param {type:"boolean"}
#@markdown * If saving figs, destination path for saving them (can be either a drive path or local path):
outfolder = "ROC_graphs" #@param {type:"string"}

def str2list(string):
    """Split the text form of a list into its items.

    Leading/trailing square brackets are removed; the rest is split on commas
    and on whitespace. The items are returned as strings.
    """
    tokens = []
    for chunk in string.strip("[]").split(","):
        tokens.extend(chunk.split())
    return tokens

if "/content/drive" in outfolder: 
  from google.colab import drive
  print("Mount drive:")
  drive.mount('/content/drive', force_remount=True)
  DRIVE_PATH="/content/drive/My Drive"

AUCs = {}                                  ##"run/subrun" -> AUC rounded to 3 decimals

for run in os.listdir(DEST_PATH):          ##assumes each folder contains multiple subfolders, with each folder denoting
  runp = DEST_PATH+"/"+run                 ##a single run. One figure per run, one ROC curve per predictions file.
  plt.figure(figsize=(20,10))
  for path,dirs,files in os.walk(runp):
    subrun = path.replace(runp+"/","")
    for file in files:
      if "predictions" not in file:
        continue
      filep = path+"/"+file
      labels = []
      pred_probs = []
      tp=0
      tn=0
      fp=0
      fn=0
      print("Stats for (run/subrun):",run+"/"+subrun,"\n")

      with open(filep) as pred_file:
        pred_lines = pred_file.read().split("\n")
      ## drop only the empty piece after a trailing newline; a file that does
      ## not end with a newline keeps its last prediction
      if pred_lines and pred_lines[-1] == "":
        pred_lines.pop()

      for n,line in enumerate(pred_lines):
        line_dict = {}
        try:
          for item in line.split("\t"):    ##each column is "<name>:<value>"
            parts = item.split(":")
            line_dict[parts[0]] = parts[1]
          label = float(line_dict["labels"])
          pred = float(str2list(line_dict["probabilities"])[1])
          if label == 1 and pred>0.5:
            tp+=1
          elif label == 0 and pred<0.5:
            tn+=1
          elif label == 0 and pred>0.5:
            fp+=1
          elif label == 1 and pred<0.5:
            fn+=1
          else:
            ## label is neither 0 nor 1, or pred is exactly 0.5 (or nan):
            ## treated as invalid input and skipped
            continue
          pred_probs.append(pred)
          labels.append(label)
        except Exception as e:
          print("failed at line",n, "error:",e)
          print("full failed line:",line,"\n")

      print("tp:",tp,
            "tn:",tn,
            "fp:",fp,
            "fn:",fn)

      try:
        acc = (tp+tn)/(tp+tn+fp+fn)
        recall = tp/(tp+fn)
        precision = tp/(tp+fp)
        f1 = 2*precision*recall/(precision+recall)
        print("acc:",acc)
        print("recall_total:",recall)
        print("precision_total:",precision)
        print("f1_total:",f1,"\n")
      except ZeroDivisionError:
        ## at least one denominator is zero for these counts
        print("acc/recall/precision/f1 not available for these counts (division by zero)\n")

      ## an ROC curve needs both classes; skip this file instead of letting
      ## the AUC computation abort the whole cell
      if len(set(labels)) < 2:
        print("Skipping ROC curve for",run+"/"+subrun,": found",len(labels),"valid predictions covering",len(set(labels)),"class(es)\n")
        continue

      ##calculate roc auc
      pred_auc = roc_auc_score(labels, pred_probs)

      print("Graphing ROC curve for",len(labels),"predictions")
      pred_fpr, pred_tpr, _ = roc_curve(labels, pred_probs)
      plt.plot(pred_fpr, pred_tpr, linestyle="-", label=subrun+': Area under curve: '+str(round(pred_auc,3)))
      AUCs[run+"/"+subrun] = round(pred_auc,3)

  plt.xlabel('False Positive Rate')
  plt.ylabel('True Positive Rate')
  plt.legend()
  plt.title("ROC for run: "+run)
  if save_figs:
    os.makedirs(outfolder, exist_ok=True)
    plt.savefig(outfolder+"/"+run+"_ROC.png")
  plt.show()


AUCs = dict(sorted(AUCs.items(),key=lambda x:x[1]))   ##ascending by AUC
print("Printing all AUCs...")
for k,v in AUCs.items():
  print("run/subrun:",k,"\tAUC:",v)