In [1]:
# Mount Google Drive at /content/gdrive so files stored there can be read
# from (and written to) this Colab runtime.
from google import colab
colab.drive.mount('/content/gdrive')
import pandas as pd
import numpy as np
import torch
from glob import glob
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
import gc
import shutil
import tarfile
import os

from torchvision import io
from torchvision import datasets, transforms, models

from scipy.stats import hmean

Mounted at /content/gdrive


In [3]:
# Locations on the mounted Drive: the project root and the saved models.
data_path = '/content/gdrive/MyDrive/IDAO'
models_path = '/content/gdrive/MyDrive/IDAO/clean_code/models'

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')

# Keyword arguments passed to torch DataLoader constructors.
# shuffle=False keeps batches in the order of the underlying dataset.
params = dict(batch_size=64, shuffle=False, num_workers=2)

# Load data

In [4]:
%%time
# Transfer data to the machine

# Stage the tar archive in the working directory and unpack the local copy.
local_tar = 'track_1.tar'
shutil.copyfile(f'{data_path}/raw_data/track_1.tar', local_tar)
try:
  # NOTE(review): extractall() trusts the member paths stored in the archive.
  # If the archive source is not fully trusted, pass filter='data' on Python
  # versions whose tarfile module supports extraction filters.
  with tarfile.open(local_tar) as my_tar:
    my_tar.extractall('extract') # specify which folder to extract to
finally:
  # The local copy is only a staging file: remove it even when extraction fails.
  os.remove(local_tar)

# The private test set ships as a zip and is extracted straight from data_path.
with zipfile.ZipFile(f'{data_path}/raw_data/track_1_private_test_final_27_03.zip', 'r') as zip_ref:
  zip_ref.extractall('extract_private2')

CPU times: user 32.7 s, sys: 21.4 s, total: 54 s
Wall time: 4min 19s


# Torch dataset

In [6]:
class DatasetTest(torch.utils.data.Dataset):
  """Unlabelled image dataset yielding ``(image, sample_id)`` pairs.

  Each sample is read from disk with ``torchvision.io.read_image``, converted
  to ``torch.float32`` and center-cropped with size 256. The sample id is the
  file name up to its first dot.

  Args:
    paths: indexable collection of image file paths, one per sample.
  """

  def __init__(self, paths):
    """Store the paths and build the preprocessing transforms once."""
    self.paths = paths
    # Created here instead of inside __getitem__ so the two transform modules
    # are not re-instantiated for every sample that is loaded.
    self._to_float = transforms.ConvertImageDtype(torch.float32)
    self._crop = transforms.CenterCrop(256)

  def __len__(self):
    """Return the total number of samples."""
    return len(self.paths)

  @staticmethod
  def _sample_id(path):
    """Return the last '/'-separated component of ``path`` up to its first dot."""
    return path.split('/')[-1].split('.')[0]

  def __getitem__(self, index):
    """Load and preprocess the sample at ``index``.

    Returns:
      Tuple ``(image, sample_id)`` where ``image`` is the preprocessed tensor
      and ``sample_id`` is the string derived from the file name.
    """
    path = self.paths[index]
    image = io.read_image(path)
    # Call the transform modules themselves rather than .forward(), which is
    # the documented way to invoke an nn.Module (hooks are honoured).
    image = self._to_float(image)
    image = self._crop(image)
    return image, self._sample_id(path)

# Generators
# glob() returns matches in arbitrary order, so the paths are sorted to give
# every run the same sample order (the loaders themselves do not shuffle
# when params['shuffle'] is False).
public_set = DatasetTest(sorted(glob('/content/extract/idao_dataset/public_test/*')))
public_generator = torch.utils.data.DataLoader(public_set, **params)

private_set = DatasetTest(sorted(glob('/content/extract_private2/private_test/*')))
private_generator = torch.utils.data.DataLoader(private_set, **params)

## utils

In [7]:
def pred_test(model,data_loader):
  """Run ``model`` over ``data_loader`` and tabulate its outputs.

  The model is switched to eval mode and called without gradient tracking.
  Every batch is moved to ``device`` first; the model output is moved back to
  the CPU and flattened before being collected.

  Args:
    model: module called as ``model(batch)``.
    data_loader: iterable yielding ``(batch, batch_ids)`` pairs.

  Returns:
    pandas DataFrame with the columns ``id`` and ``pred``.
  """
  model.eval()
  sample_ids, outputs = [], []
  with torch.no_grad():
    for batch, batch_ids in tqdm(data_loader, position=0):
      batch_out = model(batch.to(device))
      # Flatten the whole batch output into a plain list of Python floats.
      outputs.extend(batch_out.cpu().detach().numpy().ravel().tolist())
      sample_ids.extend(batch_ids)
  return pd.DataFrame({'id': sample_ids, 'pred': outputs})

# Class models: ensemble predictions on the private test set

In [8]:
# Run every model stored under models/class on the private test set and write
# the per-model predictions plus their row-wise mean / harmonic mean / variance.

# sorted(): glob() gives no ordering guarantee, and the order of the models
# decides the column order of the CSV.
class_models = sorted(glob(f'{models_path}/class/*'))
if not class_models:
  raise FileNotFoundError(f'no model files found in {models_path}/class')

pred_columns = {}
sample_ids = None
for model_path in tqdm(class_models,position=0):
  # NOTE(review): torch.load unpickles the file, so only load trusted models.
  # Recent torch releases default to weights_only=True, in which case a
  # pickled full model needs weights_only=False here - confirm for your version.
  # map_location remaps stored tensors onto `device`, so a model saved on a
  # different device still loads on this runtime.
  model = torch.load(model_path, map_location=device)
  # File name without its extension, whatever the extension length is.
  name = os.path.splitext(os.path.basename(model_path))[0]
  # Sort by id and renumber the rows so every model's predictions line up by
  # position instead of relying on implicit index alignment.
  tmp_df = pred_test(model,private_generator).sort_values(by=['id']).reset_index(drop=True)
  if sample_ids is None:
    sample_ids = tmp_df.id
  elif not sample_ids.equals(tmp_df.id):
    raise ValueError(f'{name}: sample ids differ from those of the first model')
  pred_columns[f'{name}_pred'] = tmp_df.pred

df_res = pd.DataFrame(pred_columns)

# Row-wise ensemble statistics, computed over the per-model columns only.
pred_mean = df_res.values.mean(axis=1)
pred_var = df_res.values.var(axis=1)
pred_hmean = hmean(df_res.values,axis=1)

df_res['pred_mean'] = pred_mean
df_res['pred_hmean'] = pred_hmean
df_res['pred_var'] = pred_var
df_res['id'] = sample_ids

df_res.to_csv('/content/gdrive/MyDrive/IDAO/clean_code/class_pred.csv', index=False)

100%|██████████| 236/236 [00:50<00:00,  4.71it/s]
100%|██████████| 236/236 [00:42<00:00,  5.52it/s]
100%|██████████| 236/236 [00:42<00:00,  5.56it/s]
100%|██████████| 236/236 [00:42<00:00,  5.59it/s]
100%|██████████| 236/236 [00:42<00:00,  5.60it/s]
100%|██████████| 236/236 [00:42<00:00,  5.56it/s]
100%|██████████| 236/236 [00:42<00:00,  5.58it/s]
100%|██████████| 236/236 [00:42<00:00,  5.61it/s]
100%|██████████| 236/236 [00:42<00:00,  5.59it/s]
100%|██████████| 236/236 [00:42<00:00,  5.58it/s]
100%|██████████| 236/236 [00:42<00:00,  5.62it/s]
100%|██████████| 236/236 [00:42<00:00,  5.59it/s]
100%|██████████| 236/236 [00:42<00:00,  5.60it/s]
100%|██████████| 236/236 [00:56<00:00,  4.16it/s]
100%|██████████| 236/236 [00:56<00:00,  4.15it/s]
100%|██████████| 236/236 [00:42<00:00,  5.58it/s]
100%|██████████| 236/236 [00:42<00:00,  5.58it/s]
100%|██████████| 236/236 [00:42<00:00,  5.58it/s]
100%|██████████| 236/236 [00:42<00:00,  5.57it/s]
100%|██████████| 236/236 [00:42<00:00,  5.58it/s]


# keV models: ensemble predictions on the private test set

In [9]:
# Run every model stored under models/kev on the private test set and write
# the per-model predictions plus their row-wise mean / harmonic mean / variance.

# NOTE: despite its name, `class_models` holds the keV model paths here; the
# variable keeps its existing name so anything that reads it later still works.
# sorted(): glob() gives no ordering guarantee, and the order of the models
# decides the column order of the CSV.
class_models = sorted(glob(f'{models_path}/kev/*'))
if not class_models:
  raise FileNotFoundError(f'no model files found in {models_path}/kev')

pred_columns = {}
sample_ids = None
for model_path in tqdm(class_models,position=0):
  # NOTE(review): torch.load unpickles the file, so only load trusted models.
  # Recent torch releases default to weights_only=True, in which case a
  # pickled full model needs weights_only=False here - confirm for your version.
  # map_location remaps stored tensors onto `device`, so a model saved on a
  # different device still loads on this runtime.
  model = torch.load(model_path, map_location=device)
  # File name without its extension, whatever the extension length is.
  name = os.path.splitext(os.path.basename(model_path))[0]
  # Sort by id and renumber the rows so every model's predictions line up by
  # position instead of relying on implicit index alignment.
  tmp_df = pred_test(model,private_generator).sort_values(by=['id']).reset_index(drop=True)
  if sample_ids is None:
    sample_ids = tmp_df.id
  elif not sample_ids.equals(tmp_df.id):
    raise ValueError(f'{name}: sample ids differ from those of the first model')
  pred_columns[f'{name}_pred'] = tmp_df.pred

df_res = pd.DataFrame(pred_columns)

# Row-wise ensemble statistics, computed over the per-model columns only.
pred_mean = df_res.values.mean(axis=1)
pred_var = df_res.values.var(axis=1)
pred_hmean = hmean(df_res.values,axis=1)

df_res['pred_mean'] = pred_mean
df_res['pred_hmean'] = pred_hmean
df_res['pred_var'] = pred_var
df_res['id'] = sample_ids

df_res.to_csv('/content/gdrive/MyDrive/IDAO/clean_code/kev_pred.csv', index=False)

100%|██████████| 236/236 [00:42<00:00,  5.53it/s]
100%|██████████| 236/236 [00:42<00:00,  5.56it/s]
100%|██████████| 236/236 [00:42<00:00,  5.60it/s]
100%|██████████| 236/236 [00:42<00:00,  5.54it/s]
100%|██████████| 236/236 [00:57<00:00,  4.13it/s]
100%|██████████| 236/236 [00:56<00:00,  4.15it/s]
100%|██████████| 236/236 [00:56<00:00,  4.15it/s]
100%|██████████| 236/236 [00:42<00:00,  5.56it/s]
100%|██████████| 236/236 [00:42<00:00,  5.57it/s]
100%|██████████| 236/236 [00:57<00:00,  4.13it/s]
100%|██████████| 236/236 [00:56<00:00,  4.15it/s]
100%|██████████| 236/236 [00:57<00:00,  4.12it/s]
100%|██████████| 236/236 [00:42<00:00,  5.59it/s]
100%|██████████| 236/236 [00:42<00:00,  5.55it/s]
100%|██████████| 236/236 [00:42<00:00,  5.54it/s]
100%|██████████| 15/15 [12:24<00:00, 49.62s/it]
