In [2]:
import sys

import collections
import gc
import json
import os
import random
import time
import warnings
warnings.simplefilter("ignore")

from albumentations import *
from albumentations.pytorch import ToTensor
import cv2
from imblearn.under_sampling import RandomUnderSampler
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image, ImageFilter
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import tifffile as tiff
import timm
import torch
import torch.backends.cudnn as cudnn
import torch.nn as nn
from torch.nn import functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader, Dataset, sampler
from tqdm import tqdm_notebook as tqdm

%matplotlib inline


In [3]:
DATASET = "."
CROPED_DATA = "./"

TRAIN_CROPED_DATA = "croped_images_train/"
TEST_CROPED_DATA = "croped_images_test/"

In [4]:
BATCH_SIZE = 8
DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')
EPOCHS = 500
NUM_WORKERS = 4
SEED = 2905

In [5]:
def set_seed(seed=2**3):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
set_seed(SEED)

In [6]:
df_croped_img_ids_train = pd.read_csv(CROPED_DATA + "croped_train.csv")
df_croped_img_ids_test = pd.read_csv(CROPED_DATA + "croped_test.csv")


FileNotFoundError: [Errno 2] No such file or directory: './croped_train.csv'

In [6]:
df_croped_img_ids_train.head()

Unnamed: 0,id,idx
0,905a3c8c-21bc-11ea-a13a-137349068a90,1
1,905a4416-21bc-11ea-a13a-137349068a90,1
2,905a4416-21bc-11ea-a13a-137349068a90,2
3,905a4416-21bc-11ea-a13a-137349068a90,3
4,905a579e-21bc-11ea-a13a-137349068a90,1


In [7]:
df_croped_img_ids_test.head()

Unnamed: 0,id,idx
0,915879a0-21bc-11ea-a13a-137349068a90,1
1,91588116-21bc-11ea-a13a-137349068a90,1
2,9158a2f4-21bc-11ea-a13a-137349068a90,1
3,9158aaa6-21bc-11ea-a13a-137349068a90,1
4,9158f1a0-21bc-11ea-a13a-137349068a90,1


In [8]:
with open('metadata/iwildcam2021_train_annotations.json', encoding='utf-8') as json_file:
    train_annotations =json.load(json_file)
df_train_annotation = pd.DataFrame(train_annotations["annotations"])

In [9]:
train = df_croped_img_ids_train[["id", "idx"]].merge(df_train_annotation[["image_id", "category_id"]], 
                                      left_on='id', right_on='image_id')[["id", "idx", "category_id"]]

In [10]:
df_categories = pd.DataFrame(train_annotations["categories"])

In [11]:
cat_idxs = df_categories["id"]

def convert_cat_to_index(x):
    return np.where(cat_idxs==x)[0][0]

In [12]:
train["category_id"] = train["category_id"].map(lambda x: convert_cat_to_index(x))

In [13]:
train.head()

Unnamed: 0,id,idx,category_id
0,905a3c8c-21bc-11ea-a13a-137349068a90,1,164
1,905a4416-21bc-11ea-a13a-137349068a90,1,39
2,905a4416-21bc-11ea-a13a-137349068a90,2,39
3,905a4416-21bc-11ea-a13a-137349068a90,3,39
4,905a579e-21bc-11ea-a13a-137349068a90,1,34


In [21]:
# ====================================================
# Dataset for train
# ====================================================

mean = np.array([0.37087523, 0.370876, 0.3708759] )
std = np.array([0.21022698, 0.21022713, 0.21022706])

def img2tensor(img,dtype:np.dtype=np.float32):
    if img.ndim==2 : img = np.expand_dims(img,2)
    img = np.transpose(img,(2,0,1))
    return torch.from_numpy(img.astype(dtype, copy=False))

class IWildcamTrainDataset(Dataset):
    def __init__(self, df, tfms=None):
        self.ids = df["id"]
        self.idxs = df["idx"]
        self.categories = df["category_id"]
        self.tfms = tfms
        
    def __len__(self):
        return len(self.ids)
    
    def __getitem__(self, idx):
        size = (256, 256)
        image_id = self.ids[idx]
        image_idx = self.idxs[idx]
        iamge_categorie = self.categories[idx]
        
        image_path = TRAIN_CROPED_DATA + f"{image_id}_{image_idx}.jpg"
        img = cv2.resize(cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB),size)

        if self.tfms is not None:
            augmented = self.tfms(image=img)
            img = augmented['image']
            
        # we should normalize here
        return img2tensor((img/255.0  - mean)/std), torch.tensor(iamge_categorie)

In [22]:
def get_aug(p=1.0):
    return Compose([
        HorizontalFlip(),
        ShiftScaleRotate(shift_limit=0.0625, scale_limit=0.2, rotate_limit=15, p=0.9, 
                         border_mode=cv2.BORDER_REFLECT),
        RandomBrightnessContrast(p=0.9),
    ], p=p)

In [14]:
# ====================================================
# EfficientNet Model
# ====================================================

class enet_v2(nn.Module):
    def __init__(self, backbone, out_dim, pretrained=False):
        super(enet_v2, self).__init__()
        self.enet = timm.create_model(backbone, pretrained=pretrained)
        in_ch = self.enet.classifier.in_features
        self.myfc = nn.Linear(in_ch, out_dim)
        self.enet.classifier = nn.Identity()

    def forward(self, x):
        x = self.enet(x)
        x = self.myfc(x)
        return x

In [17]:
model = enet_v2(backbone="tf_efficientnet_b4", out_dim=205)
model.to(DEVICE)

enet_v2(
  (enet): EfficientNet(
    (conv_stem): Conv2dSame(3, 48, kernel_size=(3, 3), stride=(2, 2), bias=False)
    (bn1): BatchNorm2d(48, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    (act1): SwishMe()
    (blocks): Sequential(
      (0): Sequential(
        (0): DepthwiseSeparableConv(
          (conv_dw): Conv2d(48, 48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=48, bias=False)
          (bn1): BatchNorm2d(48, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
          (act1): SwishMe()
          (se): SqueezeExcite(
            (conv_reduce): Conv2d(48, 12, kernel_size=(1, 1), stride=(1, 1))
            (act1): SwishMe()
            (conv_expand): Conv2d(12, 48, kernel_size=(1, 1), stride=(1, 1))
          )
          (conv_pw): Conv2d(48, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn2): BatchNorm2d(24, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
          (act2): Identity()
        )
     

In [18]:
class FocalLoss(nn.CrossEntropyLoss):
    ''' Focal loss for classification tasks on imbalanced datasets '''

    def __init__(self, gamma=2, alpha=None, ignore_index=-100, reduction='none'):
        super().__init__(weight=alpha, ignore_index=ignore_index, reduction='none')
        self.reduction = reduction
        self.gamma = gamma

    def forward(self, input_, target):
        cross_entropy = super().forward(input_, target)
        # Temporarily mask out ignore index to '0' for valid gather-indices input.
        # This won't contribute final loss as the cross_entropy contribution
        # for these would be zero.
        target = target * (target != self.ignore_index).long()
        input_prob = torch.gather(F.softmax(input_, 1), 1, target.unsqueeze(1))
        loss = torch.pow(1 - input_prob, self.gamma) * cross_entropy
        if self.reduction == 'mean':
            return torch.mean(loss) 
        if self.reduction == 'sum':
            return torch.sum(loss)
        return loss

In [19]:
# ====================================================
# Optimizer and Loss
# ====================================================

optimizer = torch.optim.Adam([{'params': model.parameters(), 'lr': 1e-4}])
criterion = nn.CrossEntropyLoss()
#criterion = FocalLoss()

In [25]:
rus = RandomUnderSampler(random_state=SEED, replacement=True)

def generate_dataloders(train):
    
    train_resampled, _ = rus.fit_resample(train, train["category_id"])
    test_resampled, _ = rus.fit_resample(train, train["category_id"])

    train_resampled = train_resampled.reset_index(drop=True)
    test_resampled = test_resampled.reset_index(drop=True)
    
    ds_train = IWildcamTrainDataset(train_resampled, tfms=get_aug())
    dl_train = DataLoader(ds_train,batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)
    ds_test = IWildcamTrainDataset(test_resampled)
    dl_test = DataLoader(ds_test,batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)
    
    return dl_train, dl_test

In [27]:
dl_train, dl_test = generate_dataloders(train)
print(len(dl_test))
    

25


In [21]:
# ====================================================
# Train
# ====================================================

for epoch in tqdm(range(EPOCHS)):
    
    dl_train, dl_test = generate_dataloders(train)
    
    ###Train
    model.train()
    train_loss = 0
    
    for data in dl_train:
        optimizer.zero_grad()
        imgs, categories = data
        imgs = imgs.to(DEVICE)
        categories = categories.to(DEVICE)
        
        outputs = model(imgs)
    
        loss = criterion(outputs, categories)
        loss.backward()
        optimizer.step()
            
        train_loss += loss.item()
    train_loss /= len(dl_train)
        
    print(f"EPOCH: {epoch + 1}, train_loss: {train_loss}")
        
    ###Validation
    model.eval()
    valid_loss = 0
        
    for data in dl_test:
        imgs, categories = data
        imgs = imgs.to(DEVICE)
        categories = categories.to(DEVICE)
        
        outputs = model(imgs)
    
        loss = criterion(outputs, categories)
        
        valid_loss += loss.item()
    valid_loss /= len(dl_test)
        
    print(f"EPOCH: {epoch + 1}, valid_loss: {valid_loss}")
        
    
    if (epoch+1)%10 == 0 or (epoch+1)%EPOCHS == 0:
        ###Save model
        torch.save(model.state_dict(), f"{epoch+1}_.pth")

  0%|          | 0/500 [00:00<?, ?it/s]

EPOCH: 1, train_loss: 5.465785369873047
EPOCH: 1, valid_loss: 5.32827127456665
EPOCH: 2, train_loss: 5.2529291152954105
EPOCH: 2, valid_loss: 6.934924697875976
EPOCH: 3, train_loss: 5.14696834564209
EPOCH: 3, valid_loss: 7.99636739730835
EPOCH: 4, train_loss: 5.1289448547363286
EPOCH: 4, valid_loss: 6.684405002593994
EPOCH: 5, train_loss: 5.072831745147705
EPOCH: 5, valid_loss: 8.831852741241455
EPOCH: 6, train_loss: 5.038208541870117
EPOCH: 6, valid_loss: 8.935380783081055
EPOCH: 7, train_loss: 4.999028472900391
EPOCH: 7, valid_loss: 8.922515449523926
EPOCH: 8, train_loss: 4.983216133117676
EPOCH: 8, valid_loss: 8.16773473739624
EPOCH: 9, train_loss: 4.889131145477295
EPOCH: 9, valid_loss: 9.660642929077149
EPOCH: 10, train_loss: 4.905545558929443
EPOCH: 10, valid_loss: 7.870201053619385
EPOCH: 11, train_loss: 4.912626991271972
EPOCH: 11, valid_loss: 7.123446779251099
EPOCH: 12, train_loss: 4.815604496002197
EPOCH: 12, valid_loss: 5.679875659942627
EPOCH: 13, train_loss: 4.77083776473

EPOCH: 101, train_loss: 1.5837412881851196
EPOCH: 101, valid_loss: 4.027040221691132
EPOCH: 102, train_loss: 1.5800615525245667
EPOCH: 102, valid_loss: 2.932650001049042
EPOCH: 103, train_loss: 1.5700786638259887
EPOCH: 103, valid_loss: 4.0996842646598814
EPOCH: 104, train_loss: 1.5042303895950317
EPOCH: 104, valid_loss: 2.771338171958923
EPOCH: 105, train_loss: 1.5865273976325989
EPOCH: 105, valid_loss: 1.6315734016895294
EPOCH: 106, train_loss: 1.721271150112152
EPOCH: 106, valid_loss: 1.8660434699058532
EPOCH: 107, train_loss: 1.375531129837036
EPOCH: 107, valid_loss: 4.980177097320556
EPOCH: 108, train_loss: 1.472763524055481
EPOCH: 108, valid_loss: 3.71516804933548
EPOCH: 109, train_loss: 1.448338668346405
EPOCH: 109, valid_loss: 0.941471803188324
EPOCH: 110, train_loss: 1.6988332748413086
EPOCH: 110, valid_loss: 1.1391001415252686
EPOCH: 111, train_loss: 1.3632848286628723
EPOCH: 111, valid_loss: 1.4873651194572448
EPOCH: 112, train_loss: 1.4370844841003418
EPOCH: 112, valid_loss

EPOCH: 196, valid_loss: 0.14460145711898803
EPOCH: 197, train_loss: 0.3545689606666565
EPOCH: 197, valid_loss: 0.3607261025905609
EPOCH: 198, train_loss: 0.4192909049987793
EPOCH: 198, valid_loss: 0.1904431614279747
EPOCH: 199, train_loss: 0.3697498023509979
EPOCH: 199, valid_loss: 0.2639857602119446
EPOCH: 200, train_loss: 0.3839750075340271
EPOCH: 200, valid_loss: 0.08588809072971344
EPOCH: 201, train_loss: 0.4294729280471802
EPOCH: 201, valid_loss: 0.24426518201828004
EPOCH: 202, train_loss: 0.3481598436832428
EPOCH: 202, valid_loss: 0.13333036601543427
EPOCH: 203, train_loss: 0.3635503804683685
EPOCH: 203, valid_loss: 0.14280646204948425
EPOCH: 204, train_loss: 0.3959704113006592
EPOCH: 204, valid_loss: 0.19389647305011748
EPOCH: 205, train_loss: 0.3331042557954788
EPOCH: 205, valid_loss: 0.0854733433574438
EPOCH: 206, train_loss: 0.33742415070533754
EPOCH: 206, valid_loss: 0.12842042922973632
EPOCH: 207, train_loss: 0.4420524835586548
EPOCH: 207, valid_loss: 0.3987130880355835
EPO

EPOCH: 290, valid_loss: 0.024377102851867675
EPOCH: 291, train_loss: 0.12956627666950227
EPOCH: 291, valid_loss: 0.010016180038219318
EPOCH: 292, train_loss: 0.14995874643325804
EPOCH: 292, valid_loss: 0.023521801470778883
EPOCH: 293, train_loss: 0.19142476558685304
EPOCH: 293, valid_loss: 0.022906687259674072
EPOCH: 294, train_loss: 0.16724820494651793
EPOCH: 294, valid_loss: 0.007362932681571692
EPOCH: 295, train_loss: 0.17611046224832536
EPOCH: 295, valid_loss: 0.02072048282716423
EPOCH: 296, train_loss: 0.1717783808708191
EPOCH: 296, valid_loss: 0.16883935123682023
EPOCH: 297, train_loss: 0.16785973191261291
EPOCH: 297, valid_loss: 0.046790443896315995
EPOCH: 298, train_loss: 0.1940903413295746
EPOCH: 298, valid_loss: 0.023709723234642297
EPOCH: 299, train_loss: 0.1806311798095703
EPOCH: 299, valid_loss: 0.05300142765045166
EPOCH: 300, train_loss: 0.22062334299087524
EPOCH: 300, valid_loss: 0.02636730670928955
EPOCH: 301, train_loss: 0.15825425267219542
EPOCH: 301, valid_loss: 0.01

EPOCH: 383, train_loss: 0.110782231092453
EPOCH: 383, valid_loss: 0.012082920968532562
EPOCH: 384, train_loss: 0.07294270187616349
EPOCH: 384, valid_loss: 0.0023175268154591324
EPOCH: 385, train_loss: 0.0694822347164154
EPOCH: 385, valid_loss: 0.0023333020228892567
EPOCH: 386, train_loss: 0.050184841603040695
EPOCH: 386, valid_loss: 0.0025948772579431536
EPOCH: 387, train_loss: 0.0991666655242443
EPOCH: 387, valid_loss: 0.009968434330075979
EPOCH: 388, train_loss: 0.11091322302818299
EPOCH: 388, valid_loss: 0.009662158936262131
EPOCH: 389, train_loss: 0.1178873485326767
EPOCH: 389, valid_loss: 0.003432631492614746
EPOCH: 390, train_loss: 0.12272691875696182
EPOCH: 390, valid_loss: 0.026000451087020336
EPOCH: 391, train_loss: 0.1284897130727768
EPOCH: 391, valid_loss: 0.0023385782237164677
EPOCH: 392, train_loss: 0.1000701642036438
EPOCH: 392, valid_loss: 0.012380802631378173
EPOCH: 393, train_loss: 0.13242362886667253
EPOCH: 393, valid_loss: 0.021968456264585257
EPOCH: 394, train_loss:

EPOCH: 476, train_loss: 0.04383339360356331
EPOCH: 476, valid_loss: 0.0068394351005554195
EPOCH: 477, train_loss: 0.036338023692369464
EPOCH: 477, valid_loss: 0.010123303413274698
EPOCH: 478, train_loss: 0.06416629403829574
EPOCH: 478, valid_loss: 0.008571488857269287
EPOCH: 479, train_loss: 0.07514016449451447
EPOCH: 479, valid_loss: 0.03720442581223324
EPOCH: 480, train_loss: 0.05217307329177857
EPOCH: 480, valid_loss: 0.005560631277039647
EPOCH: 481, train_loss: 0.038653072416782376
EPOCH: 481, valid_loss: 0.002072575569618493
EPOCH: 482, train_loss: 0.07621869921684266
EPOCH: 482, valid_loss: 0.016928979873191564
EPOCH: 483, train_loss: 0.08631548196077347
EPOCH: 483, valid_loss: 0.010933747291564941
EPOCH: 484, train_loss: 0.07436389803886413
EPOCH: 484, valid_loss: 0.007444846868747845
EPOCH: 485, train_loss: 0.1010375414043665
EPOCH: 485, valid_loss: 0.01126612234103959
EPOCH: 486, train_loss: 0.061192856803536415
EPOCH: 486, valid_loss: 0.02029255914676469
EPOCH: 487, train_los

In [15]:
# ====================================================
# Dataset for test
# ====================================================

mean = np.array([0.37087523, 0.370876, 0.3708759] )
std = np.array([0.21022698, 0.21022713, 0.21022706])

class IWildcamTestDataset(Dataset):
    def __init__(self, df, tfms=None):
        self.ids = df["id"]
        self.idx = df["idx"]
        self.tfms = tfms
        
    def __len__(self):
        return len(self.ids)
    
    def __getitem__(self, idx):
        size = (256, 256)
        image_id = self.ids[idx]
        image_idx = self.idx[idx]
        
        image_path = TEST_CROPED_DATA + f"{image_id}_{image_idx}.jpg"
        
        img = cv2.resize(cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB),size)

        if self.tfms is not None:
            augmented = self.tfms(image=img)
            img = augmented['image']
            
        # we should normalize here
        return img2tensor((img/255.0 - mean)/std), image_id

In [16]:
ds_test = IWildcamTestDataset(df_croped_img_ids_test)
dl_test = DataLoader(ds_test,batch_size=32,shuffle=False,num_workers=NUM_WORKERS)

In [17]:
model = enet_v2(backbone="tf_efficientnet_b4", out_dim=205)
model.to(DEVICE)
model.load_state_dict(torch.load("averaged.pth"))
model.eval()

RuntimeError: CUDA error: out of memory

In [97]:
pred_categories = []
pred_img_ids = []

In [98]:
with torch.no_grad():
    for imgs, img_ids in tqdm(dl_test):
        imgs = imgs.to(DEVICE)
        
        outputs = model(imgs)
        output_labels = torch.argmax(outputs, dim=1).tolist()
        pred_categories += output_labels
        pred_img_ids += img_ids

  0%|          | 0/1008 [00:00<?, ?it/s]

In [1]:
pred = collections.defaultdict(list)
for category, img_id in zip(pred_categories, pred_img_ids):
    pred[img_id].append(category)

NameError: name 'collections' is not defined

In [100]:
sub = pd.read_csv("sample_submission.csv")
col_Predicted = [col for col in sub.columns if "Predicted" in col]

In [101]:
with open('metadata/iwildcam2021_train_annotations.json', encoding='utf-8') as json_file:
    train_annotations =json.load(json_file)
df_categories = pd.DataFrame.from_records(train_annotations["categories"])

In [102]:
results = []

for key in pred.keys():
    c = collections.Counter(pred[key])
    
    res = []
    cnts = [ 0 for i in range(205)]
    for category, cnt in c.items():
        cnts[category] = cnt
    res += [key] + cnts[1:]
    results.append(res)

In [103]:
sub_tmp = pd.DataFrame(results, columns=sub.columns)

In [104]:
sub_tmp.head()

Unnamed: 0,Id,Predicted2,Predicted3,Predicted4,Predicted6,Predicted7,Predicted8,Predicted9,Predicted10,Predicted12,...,Predicted559,Predicted562,Predicted563,Predicted564,Predicted565,Predicted566,Predicted567,Predicted568,Predicted570,Predicted571
0,915879a0-21bc-11ea-a13a-137349068a90,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,91588116-21bc-11ea-a13a-137349068a90,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,9158a2f4-21bc-11ea-a13a-137349068a90,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9158aaa6-21bc-11ea-a13a-137349068a90,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9158f1a0-21bc-11ea-a13a-137349068a90,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [105]:
sub_tmp.to_csv("./sub_tmp.csv", index=False)

In [106]:
with open('metadata/iwildcam2021_test_information.json', encoding='utf-8') as json_file:
    test_information =json.load(json_file)
    
df_test_info = pd.DataFrame(test_information["images"])[["id", "seq_id"]]
df_test_info.head()

Unnamed: 0,id,seq_id
0,8b31d3be-21bc-11ea-a13a-137349068a90,a91ebc18-0cd3-11eb-bed1-0242ac1c0002
1,8cf202be-21bc-11ea-a13a-137349068a90,a91ebc18-0cd3-11eb-bed1-0242ac1c0002
2,8a87e62e-21bc-11ea-a13a-137349068a90,a91ebc18-0cd3-11eb-bed1-0242ac1c0002
3,8e6994f4-21bc-11ea-a13a-137349068a90,a91ebc18-0cd3-11eb-bed1-0242ac1c0002
4,948b29e2-21bc-11ea-a13a-137349068a90,a91ebc18-0cd3-11eb-bed1-0242ac1c0002


In [107]:
sub_tmp = sub_tmp.merge(df_test_info, left_on="Id", right_on="id", how="right")

In [108]:
sub_tmp.head()

Unnamed: 0,Id,Predicted2,Predicted3,Predicted4,Predicted6,Predicted7,Predicted8,Predicted9,Predicted10,Predicted12,...,Predicted563,Predicted564,Predicted565,Predicted566,Predicted567,Predicted568,Predicted570,Predicted571,id,seq_id
0,8b31d3be-21bc-11ea-a13a-137349068a90,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8b31d3be-21bc-11ea-a13a-137349068a90,a91ebc18-0cd3-11eb-bed1-0242ac1c0002
1,8cf202be-21bc-11ea-a13a-137349068a90,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8cf202be-21bc-11ea-a13a-137349068a90,a91ebc18-0cd3-11eb-bed1-0242ac1c0002
2,8a87e62e-21bc-11ea-a13a-137349068a90,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8a87e62e-21bc-11ea-a13a-137349068a90,a91ebc18-0cd3-11eb-bed1-0242ac1c0002
3,8e6994f4-21bc-11ea-a13a-137349068a90,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8e6994f4-21bc-11ea-a13a-137349068a90,a91ebc18-0cd3-11eb-bed1-0242ac1c0002
4,948b29e2-21bc-11ea-a13a-137349068a90,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,948b29e2-21bc-11ea-a13a-137349068a90,a91ebc18-0cd3-11eb-bed1-0242ac1c0002


In [109]:
sum_counts = []
for i in range(len(sub_tmp)):
    sum_counts.append(sum(sub_tmp.iloc[i][col_Predicted]))

In [110]:
sub_tmp["total"] =  sum_counts
sub_tmp = sub_tmp.sort_values('total', ascending=True)

In [111]:
#sub_tmp2 = sub_tmp.groupby('seq_id').median().reset_index().fillna("0")
sub_tmp2 = sub_tmp[~sub_tmp.duplicated(keep='first', subset='seq_id')].fillna("0")

In [112]:
sub_tmp2.head()

Unnamed: 0,Id,Predicted2,Predicted3,Predicted4,Predicted6,Predicted7,Predicted8,Predicted9,Predicted10,Predicted12,...,Predicted564,Predicted565,Predicted566,Predicted567,Predicted568,Predicted570,Predicted571,id,seq_id,total
49780,8e7b6e04-21bc-11ea-a13a-137349068a90,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,8e7b6e04-21bc-11ea-a13a-137349068a90,9066e2ca-21bc-11ea-a13a-137349068a90,0
24343,8ebd364a-21bc-11ea-a13a-137349068a90,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,8ebd364a-21bc-11ea-a13a-137349068a90,a923daa4-0cd3-11eb-bed1-0242ac1c0002,0
35943,874521ca-21bc-11ea-a13a-137349068a90,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,874521ca-21bc-11ea-a13a-137349068a90,a91b460a-0cd3-11eb-bed1-0242ac1c0002,0
25955,92fabef8-21bc-11ea-a13a-137349068a90,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,92fabef8-21bc-11ea-a13a-137349068a90,97d3369e-21bc-11ea-a13a-137349068a90,0
15998,90d09602-21bc-11ea-a13a-137349068a90,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,90d09602-21bc-11ea-a13a-137349068a90,a9258eee-0cd3-11eb-bed1-0242ac1c0002,0


In [113]:
sub = sub.reset_index()
sub = sub[["index", "Id"]].merge(sub_tmp2, left_on="Id", right_on="seq_id")

In [114]:
sub = sub.drop(labels=['index','total'], axis=1)

In [115]:
sub = sub.drop(labels=['Id'], axis=1)

KeyError: "['Id'] not found in axis"

In [116]:
sub = sub[["Id_x"] + col_Predicted].rename(columns={"Id_x": "Id"})
sub = sub.rename(columns={ "seq_id" : "Id"})
sub.to_csv("sub.csv", index=False)

In [117]:
sub.head()

Unnamed: 0,Id,Predicted2,Predicted3,Predicted4,Predicted6,Predicted7,Predicted8,Predicted9,Predicted10,Predicted12,...,Predicted559,Predicted562,Predicted563,Predicted564,Predicted565,Predicted566,Predicted567,Predicted568,Predicted570,Predicted571
0,32ce8026-7ec9-11eb-b675-4f3cc0c82eb3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,945c6602-21bc-11ea-a13a-137349068a90,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,a91c7e26-0cd3-11eb-bed1-0242ac1c0002,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9926239e-21bc-11ea-a13a-137349068a90,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,9672184c-21bc-11ea-a13a-137349068a90,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
