In [None]:
from google import colab
colab.drive.mount('/content/gdrive')
import gc
import os
import shutil
import tarfile
from collections import defaultdict
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.models as models
from torch.autograd import Variable
from torch.utils.data import DataLoader,TensorDataset
from torchsummary import summary
from torchvision import datasets, transforms, models
from torchvision import io
from tqdm import tqdm


In [1]:
# Project folder on the mounted Google Drive
data_path = '/content/gdrive/MyDrive/IDAO'

# Use the GPU when torch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')

# Keyword arguments unpacked into the DataLoaders built in cross_val_kev
params = dict(batch_size=32, shuffle=True, num_workers=2)

# Learning rate
lr = 1e-4

NameError: ignored

# Load data

In [None]:
%%time

# Transfer data to the machine: copy the archive from Drive to the local disk,
# unpack it, then sort the training images into one folder per
# (energy, recoil type) pair, e.g. /content/train/3ER.
shutil.copyfile(f'{data_path}/raw_data/track_1.tar', 'track_1.tar')

# The context manager closes the archive even when extraction raises.
# NOTE(review): the archive is extracted to the relative folder 'extract' while
# the globs below read from /content/extract, so this assumes the working
# directory is /content - confirm.
with tarfile.open('track_1.tar') as my_tar:
  my_tar.extractall('extract') # specify which folder to extract to

os.makedirs('/content/train', exist_ok=True)

# The energy is taken from the '_'-separated fields of the full file path; its
# position differs between the two classes (field 7 for ER, field 8 for NR).
for recoil, nrj_field in (("ER", 7), ("NR", 8)):
  for img in glob(f"/content/extract/idao_dataset/train/{recoil}/*.png"):
    nrj = img.split("_")[nrj_field]
    path = f'/content/train/{nrj}{recoil}'
    os.makedirs(path, exist_ok=True)
    shutil.move(img, f"{path}/{img.split('/')[-1]}")

os.remove('track_1.tar')

## Splits creation

In [None]:
def _collect_paths(groups):
  """Return, in group order, every PNG path found under /content/train/<group>."""
  return [img for group in groups for img in glob(f"/content/train/{group}/*.png")]


def _build_splits(train_splits, val_splits):
  """Pair up train/val group lists and resolve them to image paths.

  Args:
    train_splits: list of folds, each a list of folder names used for training.
    val_splits: list of folds, each a list of folder names used for validation.

  Returns:
    defaultdict mapping the fold index to {"train": [paths], "val": [paths]}.
  """
  dict_splits = defaultdict(dict)
  for i, (train, val) in enumerate(zip(train_splits, val_splits)):
    dict_splits[i]["train"] = _collect_paths(train)
    dict_splits[i]["val"] = _collect_paths(val)
  return dict_splits


train_splits1 = [["6NR", "10ER", "20NR", "30ER"], ["1NR", "3ER", "20NR", "30ER"], ["1NR", "3ER", "6NR", "10ER"]]
val_splits1 = [["1NR", "3ER"], ["6NR", "10ER"], ["20NR", "30ER"]]
dict_splits1 = _build_splits(train_splits1, val_splits1)

train_splits2 = [["6NR", "3ER", "20NR", "30ER"], ["1NR", "10ER", "20NR", "30ER"], ["1NR", "3ER", "6NR", "10ER"]]
val_splits2 = [["1NR", "10ER"], ["6NR", "3ER"], ["20NR", "30ER"]]
dict_splits2 = _build_splits(train_splits2, val_splits2)

train_splits3 = [["6NR", "3ER", "20NR", "10ER"], ["1NR", "30ER", "20NR", "10ER"], ["1NR", "30ER", "6NR", "3ER"]]
val_splits3 = [["1NR", "30ER"], ["6NR", "3ER"], ["20NR", "10ER"]]
dict_splits3 = _build_splits(train_splits3, val_splits3)

train_splits4 = [["6NR", "3ER", "20NR", "10ER"], ["1NR", "3ER", "20NR", "10ER"], ["1NR", "10ER", "6NR", "3ER"]]
val_splits4 = [["1NR", "30ER"], ["6NR", "30ER"], ["20NR", "30ER"]]
dict_splits4 = _build_splits(train_splits4, val_splits4)

# Torch Dataset

In [None]:
class Dataset(torch.utils.data.Dataset):
  """Image dataset whose labels are read from the name of each file's parent folder.

  The parent folder name is expected to be '<energy><ER|NR>', e.g. '3ER' or '20NR'.
  """
  def __init__(self, paths, train, rot=False, flip=False, gblur=-1):
    """Store the file list and the augmentation switches.

    Args:
      paths: sequence of image file paths.
      train: when true, the augmentations selected below are applied.
      rot: apply transforms.RandomRotation(180) before cropping.
      flip: apply random horizontal and vertical flips (p=0.5 each).
      gblur: when > 0, apply a kernel-size-3 Gaussian blur with
        sigma range (0.01, gblur).
    """
    self.paths = paths
    self.train = train
    self.rot = rot
    self.flip = flip
    self.gblur = gblur

  def __len__(self):
    """Denotes the total number of samples."""
    return len(self.paths)

  @staticmethod
  def _labels_from_path(path):
    """Return (classif_label, regression_label) decoded from the parent folder name.

    The folder is located relative to the end of the path rather than at a
    fixed depth, so the dataset also works outside /content/train.
    classif_label is 1 when the folder name ends with 'ER' and 0 otherwise;
    regression_label is the folder name without its last two characters, as a float.
    """
    folder = path.split("/")[-2]
    classif_label = 1 if folder[-2:] == "ER" else 0
    regression_label = float(folder[:-2])
    return classif_label, regression_label

  def __getitem__(self, index):
    """Load one image, apply the configured transforms and return (X, y).

    y is the tuple (classif_label, regression_label).
    """
    # Select sample
    ID = self.paths[index]
    # Load data and convert it to float32
    X = io.read_image(ID)
    X = transforms.ConvertImageDtype(torch.float32)(X)

    # Rotation (training only) happens before the crop; the crop itself is
    # applied to every sample.
    if self.train and self.rot:
      X = transforms.RandomRotation(180)(X)
    X = transforms.CenterCrop(256)(X)

    if self.train:
      if self.flip:
        X = transforms.RandomHorizontalFlip(p=.5)(X)
        X = transforms.RandomVerticalFlip(p=.5)(X)
      if self.gblur>0:
        X = transforms.GaussianBlur(3, sigma=(0.01, self.gblur))(X)

    y = self._labels_from_path(ID)

    return X, y

# Train utils

### Fit / Validation

In [None]:
def fit_kev(model, criterion_kev, data_loader, device, optimizer):
  """Train `model` for one pass over `data_loader` on the energy (keV) target.

  Args:
    model: network called as model(X) to produce the energy prediction.
    criterion_kev: loss called as criterion_kev(pred, target).
    data_loader: yields (X, (class_label, kev_label)) batches; the class
      label is ignored here.
    device: device the batches are moved to.
    optimizer: optimizer stepped once per batch.

  Returns:
    float: sum of the per-batch loss values divided by the number of samples
    in data_loader.dataset.
    NOTE(review): if criterion_kev already averages over the batch, this is
    smaller than the per-sample mean by roughly the batch size; the
    normalisation is kept because validate_kev uses the same one.
  """
  running_kev_loss = 0.
  model.train()

  # len(data_loader) is the real number of batches, including a partial last one
  for X, (_, y_kev) in tqdm(data_loader, total=len(data_loader), position=0):
    X = X.to(device)
    y_kev = y_kev.reshape(-1, 1).type(torch.float).to(device)

    optimizer.zero_grad()
    pred_kev = model(X)

    kev_loss = criterion_kev(pred_kev, y_kev)
    kev_loss.backward()
    optimizer.step()

    # .item() detaches the value so the autograd graph of each batch is freed
    running_kev_loss += kev_loss.item()

  train_kev_loss = running_kev_loss / len(data_loader.dataset)
  return train_kev_loss



def fit_kev_l2(model, criterion_kev, data_loader, device, optimizer, l2_weight=0.01):
  """Train `model` for one pass over `data_loader` with an added norm penalty.

  The optimised loss is criterion_kev(pred, target) + l2_weight * penalty,
  where penalty is the sum of the 2-norms of every parameter tensor.

  Args:
    model: network called as model(X) to produce the energy prediction.
    criterion_kev: loss called as criterion_kev(pred, target).
    data_loader: yields (X, (class_label, kev_label)) batches; the class
      label is ignored here.
    device: device the batches and the penalty accumulator live on.
    optimizer: optimizer stepped once per batch.
    l2_weight: multiplier applied to the penalty (default 0.01).

  Returns:
    float: sum of the per-batch criterion values (penalty excluded) divided
    by the number of samples in data_loader.dataset.
  """
  running_kev_loss = 0.
  model.train()

  # len(data_loader) is the real number of batches, including a partial last one
  for X, (_, y_kev) in tqdm(data_loader, total=len(data_loader), position=0):
    X = X.to(device)
    y_kev = y_kev.reshape(-1, 1).type(torch.float).to(device)

    optimizer.zero_grad()
    pred_kev = model(X)

    # Start from an explicit zero: torch.FloatTensor(1) would hold
    # uninitialised memory and leak garbage into the loss value.
    l2_reg = torch.zeros(1, device=device)
    for W in model.parameters():
      l2_reg = l2_reg + W.norm(2)

    kev_loss = criterion_kev(pred_kev, y_kev)

    loss = kev_loss + l2_weight*l2_reg
    loss.backward()
    optimizer.step()

    # .item() detaches the value so the autograd graph of each batch is freed
    running_kev_loss += kev_loss.item()

  train_kev_loss = running_kev_loss / len(data_loader.dataset)
  return train_kev_loss

def validate_kev(model, criterion_kev, data_loader, device):
  """Evaluate `model` on `data_loader` without updating it.

  Args:
    model: network called as model(X) to produce the energy prediction.
    criterion_kev: loss called as criterion_kev(pred, target).
    data_loader: yields (X, (class_label, kev_label)) batches; the class
      label is ignored here.
    device: device the batches are moved to.

  Returns:
    (test_loss, res_df): test_loss is the sum of the per-batch loss values
    divided by the number of samples in data_loader.dataset; res_df is a
    DataFrame with one row per sample and the columns 'kev_true' and 'kev_pred'.
  """
  running_loss = 0.
  model.eval()

  list_kev_true = []
  list_kev_pred = []

  with torch.no_grad():
    for X, (_, y_kev) in data_loader:
      X = X.to(device)
      y_kev = y_kev.reshape(-1, 1).type(torch.float).to(device)

      pred_kev = model(X)
      running_loss += criterion_kev(pred_kev, y_kev).item()

      list_kev_true.extend(y_kev.cpu().detach().numpy().ravel().tolist())
      list_kev_pred.extend(pred_kev.cpu().detach().numpy().ravel().tolist())

  test_loss = running_loss / len(data_loader.dataset)

  res_df = pd.DataFrame(
      {
      'kev_true': list_kev_true,
      'kev_pred': list_kev_pred
       })

  return test_loss, res_df

### Models

In [None]:
def make_resnet18():
  """Build a non-pretrained ResNet-18 with a 1-channel stem and a scalar head.

  conv1 is replaced by a convolution taking a single input channel, and fc
  by a 512 -> 128 -> 1 stack (Linear, Dropout(0.2), ReLU, Linear).
  The model is moved to `device` when CUDA is available.
  """
  net = models.resnet18(pretrained=False)
  net.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  head_layers = [
      nn.Linear(in_features=512, out_features=128, bias=True),
      nn.Dropout(p=0.2),
      nn.ReLU(),
      nn.Linear(in_features=128, out_features=1),
  ]
  net.fc = nn.Sequential(*head_layers)
  # Module.to moves the module in place and returns it
  return net.to(device) if torch.cuda.is_available() else net


def make_mobilenet_v2():
  """Build a non-pretrained MobileNetV2 with a 1-channel stem and a scalar head.

  features[0] is replaced by a single 1-input-channel convolution, and the
  classifier by a 1280 -> 128 -> 1 stack (Linear, Dropout(0.5), ReLU, Linear).
  The model is moved to `device` when CUDA is available.

  NOTE(review): the whole features[0] entry is swapped for a bare Conv2d;
  if that entry normally also carries a normalisation/activation they are
  dropped here - confirm this is intended.
  """
  net = models.mobilenet_v2(pretrained=False)
  net.features[0] = nn.Conv2d(1, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
  head_layers = [
      nn.Linear(in_features=1280, out_features=128, bias=True),
      nn.Dropout(p=0.5),
      nn.ReLU(),
      nn.Linear(in_features=128, out_features=1),
  ]
  net.classifier = nn.Sequential(*head_layers)
  # Module.to moves the module in place and returns it
  return net.to(device) if torch.cuda.is_available() else net


def make_squeezenet1_0():
  """Build a non-pretrained SqueezeNet 1.0 with a 1-channel stem and a scalar head.

  features[0] is replaced by a 1-input-channel 7x7 convolution. The classifier
  becomes Dropout -> 1x1 Conv (512 -> 64) -> ReLU -> Flatten -> Dropout ->
  Linear(64*15*15 -> 512) -> Dropout -> Linear(512 -> 1).
  The model is moved to `device` when CUDA is available.

  NOTE(review): in_features=64*15*15 pins the spatial size of the feature map
  the head accepts; presumably it matches the 256-pixel crops - confirm.
  """
  net = models.squeezenet1_0(pretrained=False)
  net.features[0] = nn.Conv2d(1, 96, kernel_size=(7, 7), stride=(2, 2), bias=False)
  head_layers = [
      nn.Dropout(p=0.2),
      nn.Conv2d(512, 64, kernel_size=(1, 1), stride=(1, 1)),
      nn.ReLU(),
      nn.Flatten(),
      nn.Dropout(p=0.2),
      nn.Linear(in_features=64*15*15,out_features=512),
      nn.Dropout(p=0.5),
      nn.Linear(in_features=512,out_features=1),
  ]
  net.classifier = nn.Sequential(*head_layers)
  # Module.to moves the module in place and returns it
  return net.to(device) if torch.cuda.is_available() else net


def make_mobilenet_v3_small():
  """Build a non-pretrained MobileNetV3-small with a 1-channel stem and a scalar head.

  features[0] is replaced by a single 1-input-channel convolution, and the
  classifier by a 576 -> 128 -> 1 stack (Linear, Hardswish, Dropout(0.2), Linear).
  The model is moved to `device` when CUDA is available.

  NOTE(review): the whole features[0] entry is swapped for a bare Conv2d;
  if that entry normally also carries a normalisation/activation they are
  dropped here - confirm this is intended.
  """
  net = models.mobilenet_v3_small(pretrained=False)
  net.features[0] = nn.Conv2d(1, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
  head_layers = [
      nn.Linear(in_features=576, out_features=128, bias=True),
      nn.Hardswish(),
      nn.Dropout(p=0.2),
      nn.Linear(in_features=128, out_features=1),
  ]
  net.classifier = nn.Sequential(*head_layers)
  # Module.to moves the module in place and returns it
  return net.to(device) if torch.cuda.is_available() else net


def make_resnext50_32x4d():
  """Build a non-pretrained ResNeXt-50 (32x4d) with a 1-channel stem and a scalar head.

  conv1 is replaced by a convolution taking a single input channel, and fc
  by a 2048 -> 128 -> 1 stack (Linear, Dropout(0.2), ReLU, Linear).
  The model is moved to `device` when CUDA is available.
  """
  net = models.resnext50_32x4d(pretrained=False)
  net.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  head_layers = [
      nn.Linear(in_features=2048, out_features=128, bias=True),
      nn.Dropout(p=0.2),
      nn.ReLU(),
      nn.Linear(in_features=128, out_features=1),
  ]
  net.fc = nn.Sequential(*head_layers)
  # Module.to moves the module in place and returns it
  return net.to(device) if torch.cuda.is_available() else net

### Prediction, training loop and cross-validation helpers

In [None]:
def pred_kev(model,data_loader):
  """Run `model` over `data_loader` and tabulate labels and predictions.

  Batches are moved to the notebook-level `device` before the forward pass.

  Returns:
    DataFrame with one row per sample and the columns 'class_true',
    'kev_true' and 'kev_pred'.
  """
  model.eval()
  columns = {'class_true': [], 'kev_true': [], 'kev_pred': []}

  def _flat(tensor):
    # Bring a tensor back to the host as a flat Python list
    return tensor.cpu().detach().numpy().ravel().tolist()

  with torch.no_grad():
    for X, (y_class, y_kev) in tqdm(data_loader,position=0):
      outputs = model(X.to(device))
      columns['kev_true'].extend(_flat(y_kev))
      columns['kev_pred'].extend(_flat(outputs))
      columns['class_true'].extend(_flat(y_class))

  return pd.DataFrame(columns)


def train_model(model,train_loader,test_loader,fit,optimizer,criterion,path,nb_epochs=25,patience=10):
  """Train with early stopping, then save the best model's predictions and the loss curves.

  Each epoch calls `fit` on train_loader and validate_kev on test_loader.
  Whenever the validation loss improves, the whole model is saved to
  '{path}/best_model.pth' and its weights are also kept in memory. After
  training, those best weights are loaded back into `model` (the model
  passed in therefore ends up holding the best weights), its predictions on
  test_loader are written to '{path}/df_res.csv' and the loss curves to
  '{path}/loss.png'.

  Args:
    model: network to train; training runs on the device its parameters are on.
    train_loader, test_loader: loaders handed to `fit` / validate_kev / pred_kev.
    fit: callable with the keyword arguments model, criterion_kev,
      data_loader, device and optimizer that returns the epoch training loss.
    optimizer: optimizer handed to `fit`.
    criterion: loss handed to `fit` and validate_kev.
    path: existing folder that receives the outputs.
    nb_epochs: maximum number of epochs (default 25).
    patience: training stops once the stagnation counter, which starts at 1
      and is reset to 1 on every improvement, reaches this value (default 10).
  """
  # Follow the model instead of assuming CUDA, so CPU-only runs work too
  run_device = next(model.parameters()).device

  train_loss = []
  test_loss = []

  best_loss = 10**4
  best_state = None
  nb_stag = 1
  for i in range(nb_epochs):
    tmp_train_loss = fit(
        model=model,
        criterion_kev=criterion,
        data_loader=train_loader,
        device=run_device,
        optimizer=optimizer
        )

    tmp_test_loss, _ = validate_kev(
        model=model,
        criterion_kev=criterion,
        data_loader=test_loader,
        device=run_device
        )

    # float() also accepts one-element tensors, so a `fit` that returns a
    # tensor can still be printed and plotted
    tmp_train_loss = float(tmp_train_loss)
    tmp_test_loss = float(tmp_test_loss)

    if tmp_test_loss < best_loss :
      nb_stag = 1
      best_loss = tmp_test_loss
      torch.save(model,f'{path}/best_model.pth')
      # Keep an in-memory copy as well: reloading a fully pickled model with
      # torch.load fails when torch defaults to weights_only=True
      best_state = {name: value.detach().clone() for name, value in model.state_dict().items()}
      print(f'\nEpoch {i}/{nb_epochs}')
      print(f'Train : classif : {tmp_train_loss:.6f}')
      print(f'Test  : classif : {tmp_test_loss:.6f}')
    else :
      nb_stag += 1
    train_loss.append(tmp_train_loss)
    test_loss.append(tmp_test_loss)

    if nb_stag>=patience :
      break

  # Nothing to restore if no epoch ever improved on the initial best_loss
  if best_state is not None:
    model.load_state_dict(best_state)
  df_res = pred_kev(model, test_loader)
  df_res.to_csv(f'{path}/df_res.csv', index=False)

  plt.figure(figsize=(15,10))
  plt.title('loss')
  plt.plot(train_loss,label='train')
  plt.plot(test_loss,label='test')
  plt.legend()
  plt.savefig(f'{path}/loss.png')


def cross_val_kev(model_maker,dict_splits,fit,criterion,path,rot,flip,gblur):
  """Train one fresh model per cross-validation split, then summarise all splits.

  For every split a new model and Adam optimizer are created, train_model is
  run with the outputs going to '{path}/cv_{n_split}', and finally
  recap_kev_pred(path, True) aggregates the per-split results.

  Args:
    model_maker: zero-argument callable returning a new model.
    dict_splits: mapping split index -> {"train": [paths], "val": [paths]}.
    fit: per-epoch training function forwarded to train_model.
    criterion: loss forwarded to train_model.
    path: output folder; created (with any missing parents) if needed.
    rot, flip, gblur: augmentation settings for the training Dataset only.
  """
  # makedirs also creates missing parent folders, which os.mkdir does not
  os.makedirs(path, exist_ok=True)

  # Validation needs no shuffling; a fixed order keeps df_res.csv reproducible
  eval_params = {**params, 'shuffle': False}

  for n_split in range(len(dict_splits)):
    print(f'\nCV split {n_split+1}/{len(dict_splits)}\n')

    training_set = Dataset(dict_splits[n_split]["train"], train=True, rot=rot, flip=flip, gblur=gblur)
    training_generator = torch.utils.data.DataLoader(training_set, **params)
    test_set = Dataset(dict_splits[n_split]["val"], train=False)
    test_generator = torch.utils.data.DataLoader(test_set, **eval_params)

    model = model_maker()
    # Learning rate comes from the notebook configuration cell
    optimizer = optim.Adam(model.parameters(), lr=lr)
    cv_path = f'{path}/cv_{n_split}'
    os.makedirs(cv_path, exist_ok=True)
    train_model(
        model=model,
        train_loader=training_generator,
        test_loader=test_generator,
        fit=fit,
        optimizer=optimizer,
        criterion=criterion,
        path=cv_path)
  recap_kev_pred(path,True)


def recap_kev_pred(path, savefig=True, n_splits=3):
  """Aggregate the per-split predictions and snap them to the known energies.

  Reads '{path}/cv_{i}/df_res.csv' for i in range(n_splits), plots the raw
  predictions per true energy, then clusters the predictions of each class
  into three groups with KMeans and maps every cluster to one of the class
  energies through cluster_to_kev. The mean absolute error of both variants
  is embedded in the figure file names, and the combined table (with the new
  'cluster' column) is written to '{path}/df_res.csv'.

  Args:
    path: folder holding the cv_<i> sub-folders; also receives the outputs.
    savefig: save the figures under `path` when true, otherwise show them.
    n_splits: number of cv_<i> folders to read (default 3).
  """
  # Read every split once and concatenate in a single call
  df_res = pd.concat(
      [pd.read_csv(f'{path}/cv_{i}/df_res.csv') for i in range(n_splits)],
      ignore_index=True)

  plt.figure(figsize=(10,7))
  for kev in sorted(df_res.kev_true.unique()):
    tmp_df = df_res.loc[df_res.kev_true==kev]
    plt.plot(tmp_df.kev_pred.values,'.',label=kev)
  plt.legend()
  # Mean absolute error computed with numpy, which the notebook imports
  mae = np.mean(np.abs(df_res.kev_true.values - df_res.kev_pred.values))
  if savefig:
    plt.savefig(f'{path}/recap_kev_{mae:.4f}.png')
  else :
    plt.show()

  ### kmeans part
  # NOTE(review): KMeans is not imported in the visible notebook cells;
  # presumably sklearn.cluster.KMeans - confirm and import it in the first cell.
  df_res = df_res.sort_values(by=['class_true'])
  tmp_0 = df_res.loc[df_res.class_true==0].copy()
  tmp_1 = df_res.loc[df_res.class_true==1].copy()
  kmeans_0 = KMeans(n_clusters=3, random_state=0).fit(tmp_0.kev_pred.values.reshape(-1,1))
  kmeans_1 = KMeans(n_clusters=3, random_state=0).fit(tmp_1.kev_pred.values.reshape(-1,1))
  replace_0 = cluster_to_kev(kmeans_0.cluster_centers_[:,0],0)
  replace_1 = cluster_to_kev(kmeans_1.cluster_centers_[:,0],1)
  # Translate cluster ids to energies in one explicit assignment; a chained
  # Series.replace(..., inplace=True) is not written back to the frame when
  # pandas copy-on-write is active
  tmp_0['cluster'] = [replace_0[int(label)] for label in kmeans_0.labels_]
  tmp_1['cluster'] = [replace_1[int(label)] for label in kmeans_1.labels_]
  df_res = pd.concat([tmp_0,tmp_1])
  mae = np.mean(np.abs(df_res.kev_true.values - df_res.cluster.values))

  plt.figure(figsize=(15,10))
  for i,cluster in enumerate(sorted(df_res.cluster.unique())):
    plt.subplot(2,3,i+1)
    plt.title(f'target : {cluster}')
    tmp_df = df_res.loc[df_res.cluster==cluster]
    plt.hist(tmp_df.kev_true)
  if savefig:
    plt.savefig(f'{path}/recap_kev_kmeans_{mae:.4f}.png')
  else :
    plt.show()
  df_res.to_csv(f'{path}/df_res.csv', index=False)

def cluster_to_kev(clusters,pred_class):
  """Map the cluster ids 0, 1 and 2 to energies according to the rank of their centres.

  Args:
    clusters: indexable sequence of cluster centres, position = cluster id.
    pred_class: 0 selects the energies [1, 6, 20], anything else [3, 10, 30].

  Returns:
    dict {cluster id: energy}; the cluster with the smallest centre gets the
    smallest energy, and so on.
  """
  energies = [1,6,20] if pred_class == 0 else [3,10,30]

  # Position of a centre in the sorted list is its rank
  ordered = sorted(clusters)
  ranks = [ordered.index(centre) for centre in clusters]

  mapping = {}
  for label in range(3):
    mapping[label] = energies[ranks[label]]
  return mapping

# Train

In [None]:
# L1 loss used as the training / validation criterion for the energy target
criterion_class = nn.L1Loss()

# Cross-validate a ResNet-18 on the 4th family of splits, trained with the
# norm-penalised fit function and every augmentation switched on.
cross_val_kev(
    model_maker=make_resnet18,
    fit=fit_kev_l2,
    criterion=criterion_class,
    dict_splits=dict_splits4,
    path='/content/gdrive/MyDrive/IDAO/models/kev/resnet_l2reg_gblur2_split4',
    rot=True,
    flip=True,
    gblur=2,
    )
