In [1]:
import pandas as pd 
import numpy as np 

from sklearn.model_selection import train_test_split, StratifiedKFold

from tqdm.notebook import tqdm

import torch

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import RobustScaler

import math

In [2]:
def top10_accuracy_scorer(gt_idx, top10_idx):

    aciertos = 0

    for arr, gt in zip(top10_idx,gt_idx):
        if gt in arr:
            aciertos+=1
            
    top_10_accuracy =  aciertos / len(gt_idx)
    return top_10_accuracy

## DATA

In [3]:
train = pd.read_csv('../data/processed/train.csv')
test = pd.read_csv('../data/processed/test.csv')
print('Train: ',train.shape)
print('Test: ',test.shape)

Train:  (67447, 43)
Test:  (18816, 42)


## TOKENIZER

In [5]:
from Bio.Seq import Seq

def dna_to_protein(sequences):
    seqs = []
    for seq in tqdm(sequences,total=len(sequences)):
        try:
            seqs.append(str(Seq(seq).translate()))
        except e:
            print(e)
            print(seq)
            print('error')
    return seqs

train_protein = dna_to_protein(train.sequence)
test_protein = dna_to_protein(test.sequence)

HBox(children=(FloatProgress(value=0.0, max=67447.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=18816.0), HTML(value='')))




In [6]:
filename = 'corpus.txt'

with open(filename,'w+') as f:
    for i in tqdm(range(len(train_protein)),total=len(train_protein)):
            f.write(train_protein[i])
            f.write('\n')
    for i in tqdm(range(len(test_protein)),total=len(test_protein)):
            f.write(test_protein[i])
            f.write('\n')

HBox(children=(FloatProgress(value=0.0, max=67447.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=18816.0), HTML(value='')))




In [7]:
%%time
from tokenizers import ByteLevelBPETokenizer, SentencePieceBPETokenizer
# Initialize a tokenizer
tokenizer = SentencePieceBPETokenizer()

# Customize training
tokenizer.train(files='corpus.txt', vocab_size=5000, min_frequency=1)

CPU times: user 27min 33s, sys: 12.9 s, total: 27min 46s
Wall time: 5min 16s


In [8]:
tokenizer.save_model('../data/features/bert/tok_especial/')

['../data/features/bert/tok_especial/vocab.json',
 '../data/features/bert/tok_especial/merges.txt']

In [14]:
from transformers import PreTrainedTokenizerFast

#tokenizer = PreTrainedTokenizerFast.from_pretrained("../data/features/bert/tok_especial/")
vocab_size = 5000

def get_seq_emb(sequences):

    df = []

    for seq in tqdm(sequences,total=len(sequences)):
        input_ids = tokenizer.encode(seq).ids
        emb = np.zeros(vocab_size,dtype=np.int16)
        for id_ in input_ids:
            emb[id_] += 1
        df.append(emb)
    
    df = pd.DataFrame(df,dtype=np.int16)
    print(df.shape)

    return df

In [4]:
train_emb = get_seq_emb(train_protein)
train_emb['sequence_id'] = train.sequence_id.values
test_emb = get_seq_emb(test_protein)
test_emb['sequence_id'] = test.sequence_id.values

NameError: name 'get_seq_emb' is not defined

In [16]:
%%time
scaler = RobustScaler()
df = pd.concat([train_emb,test_emb],axis=0)
scaler.fit(df.iloc[:,:-1])


df = pd.DataFrame(scaler.transform(train_emb.iloc[:,:-1]))
df['sequence_id'] = train_emb['sequence_id']
train_emb = df

df = pd.DataFrame(scaler.transform(test_emb.iloc[:,:-1]))
df['sequence_id'] = test_emb['sequence_id']
test_emb = df

CPU times: user 8.94 s, sys: 2.52 s, total: 11.5 s
Wall time: 11.5 s


In [17]:
train_emb.to_csv('../data/features/bert/tok_especial/train_emb.csv',index=False)
test_emb.to_csv('../data/features/bert/tok_especial/test_emb.csv',index=False)

In [4]:
train_emb = pd.read_csv('../data/features/bert/tok_especial/train_emb.csv')
test_emb = pd.read_csv('../data/features/bert/tok_especial/test_emb.csv')

## N-GRAMS

In [5]:
train_ngram_features = pd.read_csv('../data/features/ngram/5_ngram_train.csv')
test_ngram_features = pd.read_csv('../data/features/ngram/5_ngram_test.csv')

print('Train: ',train_ngram_features.shape)
print('Test: ',test_ngram_features.shape)

Train:  (67447, 3906)
Test:  (18816, 3906)


In [6]:
%%time
scaler = RobustScaler()
df = pd.concat([train_ngram_features,test_ngram_features],axis=0)
scaler.fit(df.iloc[:,1:])


train_ngram = pd.DataFrame(scaler.transform(train_ngram_features.iloc[:,1:]))
train_ngram['sequence_id'] = train_ngram_features['sequence_id']


test_ngram = pd.DataFrame(scaler.transform(test_ngram_features.iloc[:,1:]))
test_ngram['sequence_id'] = test_ngram_features['sequence_id']


Wall time: 22.2 s


## BLAST

In [7]:
path = '../data/features/blast/processed/train.csv'
train_blast = pd.read_csv(path)
print(train_blast.shape)

path = '../data/features/blast/processed/test.csv'
test_blast = pd.read_csv(path)
print(test_blast.shape)

(66739, 6571)
(18606, 6571)


In [8]:
%%time
scaler = RobustScaler()
df = pd.concat([train_blast,test_blast],axis=0)
scaler.fit(df.iloc[:,:-1])

df = pd.DataFrame(scaler.transform(train_blast.iloc[:,:-1]))
df['sequence_id'] = train_blast['sequence_id']
train_blast = df


df = pd.DataFrame(scaler.transform(test_blast.iloc[:,:-1]))
df['sequence_id'] = test_blast['sequence_id']
test_blast = df

Wall time: 1min 10s


In [9]:
print(train.shape)
print(test.shape)

print(train_blast.shape)
print(test_blast.shape)

print(train_emb.shape)
print(test_emb.shape)

print(train_ngram.shape)
print(test_ngram.shape)

(67447, 43)
(18816, 42)
(66739, 6571)
(18606, 6571)
(67447, 2501)
(18816, 2501)
(67447, 3906)
(18816, 3906)


In [10]:
train = pd.merge(train,train_blast,how='outer',on='sequence_id')
test = pd.merge(test,test_blast,how='outer',on='sequence_id')

train = pd.merge(train,train_emb,how='inner',on='sequence_id')
test = pd.merge(test,test_emb,how='inner',on='sequence_id')

train = pd.merge(train,train_ngram,how='inner',on='sequence_id')
test = pd.merge(test,test_ngram,how='inner',on='sequence_id')

train.fillna(0,inplace=True)
test.fillna(0,inplace=True)

print(train.shape)
print(test.shape)

del df, train_emb,test_emb, train_blast,test_blast, scaler, train_ngram_features, test_ngram_features

(67447, 13018)
(18816, 13017)


In [11]:
train.drop(['sequence','sequence_id'],inplace=True,axis=1)
test.drop(['sequence_id','sequence'],inplace=True,axis=1)

In [12]:
X = train.drop('target',inplace=False,axis=1)
y = train['target']

In [13]:
"""
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42,sampling_strategy='not majority')
X_res, y_res = ros.fit_resample(X, y)
print(X_res.shape)
"""
#K = 5
#skf = StratifiedKFold(n_splits=K,shuffle=True,random_state=420)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=420,stratify=y)

In [14]:
labs = pd.read_csv('../data/raw/train_labels.csv').columns[1:]

lab_pos = dict()
i = 0
for lab in labs:
    lab_pos[lab]=i
    i+=1

def get_targets(y):
    targets = []
    for lab in y:
        tmp = np.zeros(len(labs))
        idx = lab_pos[lab]
        tmp[idx] = 1
        targets.append(tmp)
    targets = torch.tensor(targets)
    print(targets.shape)
    return targets

In [15]:
%%time
tmp = [torch.from_numpy(arr).float() for arr in X_train.values]
train_dataset = torch.utils.data.TensorDataset(torch.stack(tmp),get_targets(y_train))
tmp = [torch.from_numpy(arr).float() for arr in X_valid.values]
valid_dataset = torch.utils.data.TensorDataset(torch.stack(tmp),get_targets(y_valid))
tmp = [torch.from_numpy(arr).float() for arr in test.values]
test_dataset = torch.utils.data.TensorDataset(torch.stack(tmp))

torch.Size([53957, 1314])
torch.Size([13490, 1314])
Wall time: 1min 53s


In [16]:
train_dataloader = torch.utils.data.DataLoader(
    dataset = train_dataset, 
    batch_size = 128, 
    sampler = torch.utils.data.RandomSampler(train_dataset)
)

In [17]:
valid_dataloader = torch.utils.data.DataLoader(
    dataset = valid_dataset, 
    batch_size = 128, 
    sampler = torch.utils.data.SequentialSampler(valid_dataset)
)

In [18]:
test_dataloader = torch.utils.data.DataLoader(
    dataset = test_dataset, 
    batch_size = 128, 
    sampler = torch.utils.data.SequentialSampler(test_dataset)
)

In [19]:
print(X_train.shape)
print(X_valid.shape)
print(test.shape)

print(y_train.shape)
print(y_valid.shape)

(53957, 13015)
(13490, 13015)
(18816, 13015)
(53957,)
(13490,)


In [20]:
%load_ext autoreload
%autoreload 2

from models.BlastEncoder import Blast
from models.TokEncoder import Tok
from models.NGramEncoder import NGram
from models.Encoder import Encoder


In [76]:
device = torch.device('cuda')
print(torch.cuda.get_device_name(0))
print(torch.cuda.get_device_properties(0))
model = NGram().to(device)
scaler = torch.cuda.amp.GradScaler()

Tesla V100-SXM2-16GB
_CudaDeviceProperties(name='Tesla V100-SXM2-16GB', major=7, minor=0, total_memory=16160MB, multi_processor_count=80)


In [70]:
epochs = 50

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

#scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,patience=25,mode='max')

folder = model.folder

In [71]:
print('TRAINING...')

training_stats = []

optimizer.zero_grad() 

with tqdm(total=epochs,leave=False) as pbar:
  for epoch_i in range(0, epochs):
    total_train_loss = 0
    model.train()

    for step, batch in tqdm(enumerate(train_dataloader),total=len(train_dataloader),leave=False):

        optimizer.zero_grad()

        with torch.cuda.amp.autocast(enabled=True):
          _, loss = model(batch,device)

        scaler.scale(loss).backward()
        total_train_loss += loss.item()
  
        scaler.step(optimizer)
        scaler.update()

    model.eval()

    total_dev_loss = 0
            
    for step, batch in enumerate(valid_dataloader):
    
        b_labels = batch[1]
        with torch.cuda.amp.autocast(enabled=True):
          with torch.no_grad():
            _, loss = model(batch,device)
      
        scaler.scale(loss)
        total_dev_loss += loss.item()

    avg_train_loss = total_train_loss/len(train_dataloader)
    avg_dev_loss = total_dev_loss/len(test_dataloader)
 
    training_stats.append(
        {
            'train_loss': avg_train_loss,
            'dev_loss': avg_dev_loss,
        }
      )
    
    torch.save(model.encoder,folder+str(epoch_i)+'.ckpt')
    
    pbar.update(1)

    print('\nEpoch: ',epoch_i,' train_loss ',avg_train_loss,
            ' dev_loss ',avg_dev_loss,
          )


TRAINING...


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  0  train_loss  0.3521835948671634  dev_loss  0.11494939693180072


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  1  train_loss  0.2533338158501762  dev_loss  0.039663489350453524


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  2  train_loss  0.226839512308521  dev_loss  0.06723202759919523


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  3  train_loss  0.20771620504257945  dev_loss  0.035492148250341415


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  4  train_loss  0.19063303688920646  dev_loss  0.03299721718138578


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  5  train_loss  0.17398829883079253  dev_loss  0.07279015142394572


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  6  train_loss  0.1621250379742322  dev_loss  0.022993338059912733


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  7  train_loss  0.14359596077688214  dev_loss  0.02349440446820389


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  8  train_loss  0.13569366517454667  dev_loss  0.024096862964180052


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  9  train_loss  0.12405313636613259  dev_loss  0.028657289703382927


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  10  train_loss  0.1390996795783252  dev_loss  0.05467450505971503


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  11  train_loss  0.11030829216296215  dev_loss  0.026930074809359855


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  12  train_loss  0.12092926663509901  dev_loss  0.02776731280800031


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  13  train_loss  0.09502594541069738  dev_loss  0.024230599352697127


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  14  train_loss  0.09058093835303993  dev_loss  0.02359189688652551


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  15  train_loss  0.08249820692473538  dev_loss  0.029875440991857426


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  16  train_loss  0.07648273465395751  dev_loss  0.02595010014618335


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  17  train_loss  0.07121913831336733  dev_loss  0.02420545713405828


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  18  train_loss  0.09153554268110703  dev_loss  0.1006230172149989


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  19  train_loss  0.0715065225164331  dev_loss  0.05509508474647593


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  20  train_loss  0.060434152747358756  dev_loss  0.022676106694401527


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  21  train_loss  0.05514720930845887  dev_loss  0.018570695642609984


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  22  train_loss  0.04788026902427394  dev_loss  0.025251537016561243


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  23  train_loss  0.0714615186173204  dev_loss  0.07320416374068682


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  24  train_loss  0.06316014662190778  dev_loss  0.03188860686305834


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  25  train_loss  0.04477638190047224  dev_loss  0.02346726688144564


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  26  train_loss  0.03941032764285591  dev_loss  0.017435650791035217


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  27  train_loss  0.03524385749084336  dev_loss  0.01868280302733183


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  28  train_loss  0.03729476773961319  dev_loss  0.015371198702578236


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  29  train_loss  0.03742476724644337  dev_loss  0.04147467542389015


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  30  train_loss  0.054671065745018954  dev_loss  0.07624710764305122


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  31  train_loss  0.04710037170322303  dev_loss  0.030193431017806336


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  32  train_loss  0.027785307061262605  dev_loss  0.046478490845686726


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  33  train_loss  0.026448110974795446  dev_loss  0.031567538125427806


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  34  train_loss  0.02528208554214776  dev_loss  0.03502037559262141


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  35  train_loss  0.027160949994019846  dev_loss  0.091986793137732


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  36  train_loss  0.0650002126343619  dev_loss  0.017544642249185616


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  37  train_loss  0.024894682511740245  dev_loss  0.04354796921010731


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  38  train_loss  0.029909387581774267  dev_loss  0.23469707805055137


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  39  train_loss  0.06132761137648287  dev_loss  0.02471254791329507


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  40  train_loss  0.025909632616521905  dev_loss  0.042263444905885225


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  41  train_loss  0.0214641346459352  dev_loss  0.018626098654099872


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  42  train_loss  0.021032290193318474  dev_loss  0.01616494735816912


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  43  train_loss  0.0216069162373047  dev_loss  0.01684973679077463


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  44  train_loss  0.02180993068583698  dev_loss  0.034130958067316586


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  45  train_loss  0.029561024907340794  dev_loss  0.021832491524618903


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  46  train_loss  0.025833746270492886  dev_loss  0.049485176396207746


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  47  train_loss  0.023551014494768815  dev_loss  0.022079499660148508


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  48  train_loss  0.019090089472496255  dev_loss  0.020154705727282837


HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))


Epoch:  49  train_loss  0.0187736402014138  dev_loss  0.016473532318562068


In [72]:
# Best epochs
# Blast 74
# Tok 124
# NGram 49
model

NGram(
  (encoder): NGramEncoder(
    (l1): Linear(in_features=3905, out_features=3000, bias=True)
    (b1): BatchNorm1d(3000, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (l2): Linear(in_features=3000, out_features=1500, bias=True)
    (b2): BatchNorm1d(1500, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (l3): Linear(in_features=1500, out_features=900, bias=True)
    (b3): BatchNorm1d(900, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (a): LeakyReLU(negative_slope=0.01)
    (l4): Linear(in_features=900, out_features=500, bias=True)
  )
  (decoder): NGramDecoder(
    (l1): Linear(in_features=500, out_features=900, bias=True)
    (b1): BatchNorm1d(900, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (l2): Linear(in_features=900, out_features=1500, bias=True)
    (b2): BatchNorm1d(1500, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (l3): Linear(in_features=1500, out_features=3000,

In [21]:
device = torch.device('cuda')
model = Encoder(74,124,49).to(device)
model

Encoder(
  (blast): BlastEncoder(
    (c1): Conv1d(1, 2, kernel_size=(5,), stride=(5,))
    (bc1): BatchNorm1d(2628, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (l1): Linear(in_features=2628, out_features=1000, bias=True)
    (b1): BatchNorm1d(1000, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (a1): LeakyReLU(negative_slope=0.01)
    (l2): Linear(in_features=1000, out_features=500, bias=True)
    (b2): BatchNorm1d(500, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (a2): LeakyReLU(negative_slope=0.01)
    (l3): Linear(in_features=500, out_features=200, bias=True)
  )
  (tok): TokEncoder(
    (l1): Linear(in_features=5000, out_features=3000, bias=True)
    (b1): BatchNorm1d(3000, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (l2): Linear(in_features=3000, out_features=1500, bias=True)
    (b2): BatchNorm1d(1500, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (l3): Linear(in_fea

In [22]:
def getFeatures(dataloader):
    X = []
    y = []
    model.eval()
    for step, batch in tqdm(enumerate(dataloader),total=len(dataloader),leave=False):
        with torch.no_grad():
            with torch.cuda.amp.autocast(enabled=True):
                label = batch[1].detach().numpy()

                x = model(batch,device)

                X.extend(x)
                y.extend(label)
    
    return np.array(X), np.array(y)

def t_labels(y):
    tmp = []
    for l in y:
        tmp.append(labs[np.argmax(l)])
    return np.array(tmp)

In [23]:
X_train, y_train = getFeatures(train_dataloader)
X_valid, y_valid = getFeatures(valid_dataloader)

HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=106.0), HTML(value='')))

In [24]:
y_train = t_labels(y_train)
y_valid = t_labels(y_valid)

In [25]:
X_train = X_train.astype(np.float32)
X_valid = X_valid.astype(np.float32)

## CLS

In [26]:
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.linear_model import LogisticRegression

import xgboost as xgb

In [27]:
"""
X_train = X_train[0:1000]
y_train = y_train[0:1000]
"""
def top10_accuracy_scorer(estimator, X, y):

    probas = estimator.predict_proba(X)
    
    top10_idx = np.argpartition(probas, -10, axis=1)[:, -10:]
    
    top10_preds = estimator.classes_[top10_idx]

    mask = top10_preds == np.reshape(y,(y.shape[0],1))
    
    top_10_accuracy = mask.any(axis=1).mean()
 
    return top_10_accuracy

In [31]:
#model_cls = SVC(kernel = 'linear', C = 1)
#model_cls = LogisticRegression()
#model_cls = GaussianNB()
#model_cls = KNeighborsClassifier(n_neighbors = 40)
model_cls = RandomForestClassifier(n_jobs=10)
#model_cls = xgb.XGBClassifier(n_estimators=100,objective='multi:softprob',eval_metric="mlogloss",max_depth=10,tree_method='hist',gpu_id=0,verbosity=1,n_jobs=10,random_state=420)

model_cls.fit(X_train, y_train)

MemoryError: could not allocate 344457216 bytes

In [None]:
preds = model_cls.predict(X_train)

acc = accuracy_score(y_train,preds)
f1 = f1_score(y_train,preds,average='macro')

print('ACC: ',acc)
print('F1: ', f1)
print('top10: ',top10_accuracy_scorer(model_cls,X_train,y_train))

In [None]:
preds = model_cls.predict(X_valid)

acc = accuracy_score(y_valid,preds)
f1 = f1_score(y_valid,preds,average='macro')

print('ACC: ',acc)
print('F1: ', f1)
print('Top-10: ',top10_accuracy_scorer(model_cls,X_valid,y_valid))

## SUBMISSION

In [47]:
model = torch.load('../models/Sauron/25.ckpt').to(device)
model

Sauron(
  (blast_conv): Blast_conv(
    (c1): Conv1d(1, 2, kernel_size=(5,), stride=(5,))
    (br_c1): BatchNorm1d(2628, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (dp_c1): Dropout(p=0.2, inplace=False)
    (l1): Linear(in_features=2628, out_features=5256, bias=True)
    (br_l1): BatchNorm1d(5256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (ac_l1): LeakyReLU(negative_slope=0.01)
    (dp_l1): Dropout(p=0.2, inplace=False)
    (cls): Identity()
  )
  (tokenet): TokeNet(
    (l1): Linear(in_features=2500, out_features=5000, bias=True)
    (br_l1): BatchNorm1d(5000, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (ac_l1): LeakyReLU(negative_slope=0.01)
    (dp_l1): Dropout(p=0.2, inplace=False)
    (cls): Identity()
  )
  (l1): Linear(in_features=10296, out_features=10296, bias=True)
  (br_l1): BatchNorm1d(10296, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (ac_l1): LeakyReLU(negative_slope=0.01)
  (

In [28]:
model.eval()

logits = []
           
for step, batch in enumerate(test_dataloader):
    
    with torch.cuda.amp.autocast():
        with torch.no_grad():
            b_logits = model(batch,device)
            logits.extend(b_logits.float())

In [29]:
probas = nn.Softmax(dim=1)(torch.stack(logits)).detach().numpy()
probas.shape

(18816, 1314)

In [30]:
submission_format = pd.read_csv('../data/raw/submission_format.csv', index_col='sequence_id')

In [31]:
assert submission_format.shape == probas.shape
assert (labs == submission_format.columns).all()

In [32]:
my_submission = pd.DataFrame(data=probas, 
                             columns=labs, 
                             index=submission_format.index)

In [33]:
my_submission.head()

Unnamed: 0_level_0,00Q4V31T,012VT4JK,028IO5W2,03GRNN7N,03Y3W51H,09MQV1TY,0A4AHRCT,0A9M05NC,0B9GCUVV,0CL7QVG8,...,ZQNGGY33,ZSHS4VJZ,ZT1IP3T6,ZU6860XU,ZU6TVFFU,ZU75P59K,ZUI6TDWV,ZWFD8OHC,ZX06ZDZN,ZZJVE4HO
sequence_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
E0VFT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TTRK5,1.2191300000000001e-43,1.064852e-37,7.595038e-43,0.0,1.7795459999999998e-38,2.787869e-40,2.5931029999999997e-41,7.977755e-36,1.070592e-42,0.0,...,0.0,1.51346e-37,7.006492e-45,7.048812e-41,8.100313e-32,2.6867140000000002e-39,8.718819e-33,0.0,1.73761e-42,0.0
2Z7FZ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
VJI6E,1.00626e-16,1.217161e-17,3.480941e-17,4.6644629999999995e-19,3.7054420000000005e-17,3.763782e-16,7.163382000000001e-17,1.487721e-20,1.230674e-15,9.422474e-19,...,7.750715999999999e-19,1.940933e-18,7.632831000000001e-17,2.2917920000000003e-17,6.51694e-16,1.321216e-17,1.605646e-16,2.995351e-16,3.7711520000000004e-17,7.339326e-18
721FI,0.0,1.401298e-45,0.0,0.0,1.927066e-41,5.731311e-43,7.899047e-37,0.0,0.0,0.0,...,0.0,0.0,3.840455e-40,0.0,4.0637659999999996e-44,0.0,9.710158e-41,1.261169e-44,1.937996e-42,0.0


In [34]:
my_submission.to_csv('../submissions/submission.csv')