# This notebook is helpful for tracking the training process when train_dupnet.ipynb is running.

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sdcdup.features.image_features import SDCImageContainer
from sdcdup.utils import create_dataset_from_tiles
from sdcdup.utils import reversed_recombined_holt_winters

%matplotlib inline
%reload_ext autoreload
%autoreload 2

# Font-size presets (in points) shared by every figure in this notebook.
SMALL_SIZE, MEDIUM_SIZE, BIGGER_SIZE, BIGGEST_SIZE = 10, 12, 16, 20

# Default text size.
plt.rc('font', size=BIGGEST_SIZE)
# Axes title and x/y axis labels.
plt.rc('axes', titlesize=BIGGEST_SIZE, labelsize=BIGGEST_SIZE)
# Tick labels on both axes.
for tick_group in ('xtick', 'ytick'):
    plt.rc(tick_group, labelsize=BIGGER_SIZE)
# Legend entries.
plt.rc('legend', fontsize=MEDIUM_SIZE)
# Figure-level title.
plt.rc('figure', titlesize=BIGGEST_SIZE)

# SENDTOENV
# Directory holding the raw training images.
train_image_dir = 'data/raw/train_768/'

# Every per-image grid file lives under data/interim/ and differs only in the
# property tag embedded in its name.
_grid_file_template = 'data/interim/image_{}_grids.pkl'
image_md5hash_grids_file = _grid_file_template.format('md5hash')
image_bm0hash_grids_file = _grid_file_template.format('bm0hash')
image_cm0hash_grids_file = _grid_file_template.format('cm0hash')
image_greycop_grids_file = _grid_file_template.format('greycop')
image_entropy_grids_file = _grid_file_template.format('entropy')
image_issolid_grids_file = _grid_file_template.format('issolid')
image_shipcnt_grids_file = _grid_file_template.format('shipcnt')

In [None]:
# Obtain the full tile dataset: read it from the processed CSV when that file
# exists, otherwise build it from the image container.
full_dataset_filename = 'data/processed/full_SDC_dataset_from_tiles.csv'
if not os.path.exists(full_dataset_filename):
    # No CSV on disk: preprocess the image properties and build the dataset.
    grid_files = (
        image_md5hash_grids_file,
        image_bm0hash_grids_file,
        image_cm0hash_grids_file,
        image_greycop_grids_file,
        image_entropy_grids_file,
        image_issolid_grids_file,
    )
    sdcic = SDCImageContainer()
    sdcic.preprocess_image_properties(*grid_files)
    full_dataset = create_dataset_from_tiles(sdcic)
else:
    # Convert the CSV into a list of row tuples (one tuple per CSV row).
    df = pd.read_csv(full_dataset_filename)
    column_values = [df[column].values.tolist() for column in df]
    full_dataset = list(zip(*column_values))

print(len(full_dataset))

In [None]:
# Plot the per-epoch training and validation curves of a single run, one panel
# per metric listed in `stats_fields`. Raw values are drawn faintly; the "(hw)"
# series returned by reversed_recombined_holt_winters (presumably a
# Holt-Winters smoothing of the raw values) are drawn fully opaque.

# Metrics file of the run to inspect. Other runs looked at previously:
#   'dup_model.2019_0727_2246.metrics.csv'
#   'dup_model.2019_0730_1923.metrics.csv'
fname = 'dup_model.2019_0802_2209.metrics.csv'
df_stats = pd.read_csv('models/' + fname)

# span = 111//30 = 3
# span = epochs // 100
span = 2

# Fixed y-axis limits per metric; None leaves that bound to autoscaling.
field_ylim = {
    'loss': (0, 0.005),
    'brier_loss': (0, None),
    'focal_loss': (0, None),
    'focal_loss_1': (0, 5),
    'focal_loss_2': (0, 5),
    'dice_coef_loss': (0, None),
    'soft_dice_loss': (0, None),
    'binary_crossentropy': (0, None),
    'acc': (None, 1),
    'iou': (None, 1),
    'mean_iou': (None, 1),
    'f2': (None, 1),
    'fbeta': (None, 1),
    'soft_dice_coef': (None, 1),
    'hard_dice_coef': (None, 1),
    'hard_dice_coef2': (None, 1),
    'hard_dice_coef_ch1': (None, 1),
}
stats_fields = ('loss', 'acc')

# One panel per metric. Deriving ncols from stats_fields keeps the two in
# sync, and squeeze=False keeps `axes` two-dimensional even for one metric.
nrows = 1
ncols = len(stats_fields)
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(14, 6), squeeze=False)

for ax, field in zip(axes[0], stats_fields):
    columns = ('train_' + field, 'val_' + field)

    # Raw curves first, then the smoothed ones, so the colour cycle assigns
    # colours in the order: train, val, train (hw), val (hw).
    for column in columns:
        ax.plot(df_stats[column], alpha=0.3, label=column)
    for column in columns:
        smoothed = reversed_recombined_holt_winters(np.array(df_stats[column]), span=span)
        ax.plot(smoothed, alpha=1, label=column + ' (hw)')

    # Special case for f2 callback.
    if field == 'fbeta' and 'val_f2' in df_stats:
        ax.plot(df_stats['val_f2'], label='val_f2')

    ax.set_title(field)
    ax.set_xlabel(r'epoch')
    ax.set_xlim((0, len(df_stats['epoch'])))
    # Metrics without an entry in field_ylim fall back to autoscaled limits.
    ax.set_ylim(field_ylim.get(field, (None, None)))
    ax.grid(True)
    # Labels are attached to each line above, so the legend cannot drift out
    # of step with the plotted series.
    ax.legend()

plt.tight_layout()
# To keep a copy of the figure, call plt.savefig(<path>) here, before plt.show().
plt.show()


In [None]:
# Overlay the validation loss of every metrics file found in models/ (except
# the runs listed in `ignored_runs`), once against the 'epoch' column and once
# against the 'time' column.
nrows = 1
ncols = 2

fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(14, 6))

# Runs excluded from the comparison; commented-out entries are included again.
ignored_runs = [
    "dup_model.2019_0709_0149.metrics.csv",
    "dup_model.2019_0714_0200.metrics.csv",
    "dup_model.2019_0720_0155.metrics.csv",
    "dup_model.2019_0724_0938.metrics.csv",
    "dup_model.2019_0727_2246.metrics.csv",
#     "dup_model.2019_0730_1923.metrics.csv",
    "dup_model.2019_0731_2310.metrics.csv",
    "dup_model.2019_0801_0954.metrics.csv",
    "dup_model.2019_0801_2010.metrics.csv",
#     "dup_model.2019_0802_0708.metrics.csv",
    "dup_model.2019_0805_0015.metrics.csv",
]
csv_files = [
    name
    for name in os.listdir('models')
    if name.endswith('metrics.csv') and name not in ignored_runs
]

# Load the runs in sorted filename order; the legend shows the third-from-last
# dot-separated component of each filename (the run's date_time stamp).
run_files = sorted(csv_files)
dataframes = [pd.read_csv('models/' + name) for name in run_files]
legend_labels = [name.split('.')[-3] for name in run_files]

xlabels = ('epoch', 'time')
xmax = (100, 100000)  # fixed upper x-limit for each panel
for ax, xlabel, x_upper in zip(axes, xlabels, xmax):
    # Alternatives tried here: plotting train_loss, or smoothing val_loss with
    # reversed_recombined_holt_winters(np.array(...), span=2).
    for run_df in dataframes:
        ax.plot(run_df[xlabel], run_df.val_loss)

    ax.set_title('val loss')
    ax.set_xlabel(xlabel)
    ax.set_xlim((0, x_upper))
    ax.set_ylim((0, 0.005))
    ax.legend(legend_labels)

plt.tight_layout()
plt.show()


In [None]:
# Load one 'avl' CSV from models/; the cells below read its 'ages' and
# 'visits' columns.
fname = "dup_model.2019_0802_0708.56.avl.csv"
df_avl = pd.read_csv(f'models/{fname}')

In [None]:
# Show pandas' summary statistics for the columns of df_avl.
df_avl.describe()

In [None]:
# Histogram of the 'ages' column using one bin per 100 units of the maximum.
# The bin count is cast to int and clamped to at least 1: a maximum below 100
# would otherwise give bins=0, and a float-typed column a float bin count,
# both of which make the histogram call fail.
df_avl['ages'].hist(bins=max(1, int(df_avl['ages'].max()) // 100));

In [None]:
# Histogram of the 'visits' column with as many bins as the maximum value.
# The bin count is cast to int and clamped to at least 1: a maximum of 0 would
# otherwise give bins=0, and a float-typed column a float bin count, both of
# which make the histogram call fail.
df_avl['visits'].hist(bins=max(1, int(df_avl['visits'].max())));

In [None]:
# Print one summary line per 'avl' CSV belonging to `run_name`, in sorted
# filename order: epoch | max age, (min visits, max visits), max loss,
# summed loss, and the validation loss encoded in the filename.
run_name = "2019_0802_2209"
csv_files = [
    name
    for name in os.listdir('models')
    if name.endswith('avl.csv') and run_name in name
]

# (The loaded frames are only summarised here, not collected.)
for fname in sorted(csv_files):
    # Filenames must have exactly seven dot-separated parts; the validation
    # loss is spread over two of them because it contains a decimal point.
    _model, _stamp, epoch, loss_whole, loss_frac, _dataset, _ext = fname.split('.')
    val_loss = f"{loss_whole}.{loss_frac}"
    avl = pd.read_csv('models/' + fname)
    visits = avl.visits
    losses = avl.losses
    print(f"{int(epoch):>2} | {avl.ages.max():>5}, ({visits.min()}, {visits.max():>2}), {losses.max():>7.4f}, {losses.sum():>12.2f}, {float(val_loss):<8.6f}")