In [1]:
import pandas as pd 
import numpy as np 

from sklearn.model_selection import train_test_split, StratifiedKFold

from tqdm.notebook import tqdm

import torch

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import RobustScaler

import math

In [2]:
#col_names = ['train_loss','train_acc','train_top10','dev_loss', 'dev_acc','dev_top10']
def make_plot(training_stats):

    sns.set(style='darkgrid')
    sns.set(font_scale=1.5)

    plt.close()

    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.suptitle('Training stats')

    fig.set_size_inches(25, 10)

    ax1.plot(training_stats['train_loss'], 'b-o', label='training')
    ax1.plot(training_stats['dev_loss'], 'b-o', label='validation')

    ax1.set_title("Loss")
    ax1.set_xlabel('Epochs')
    ax1.set_ylabel('Loss') 
    ax1.legend()

    ax2.plot(training_stats['train_top10'], 'b-o', label='training')
    ax2.plot(training_stats['dev_top10'], 'b-o', label='validation')

    ax2.set_title("Top 10 Acc")
    ax2.set_xlabel('Epochs')
    ax2.set_ylabel('Top 10 Acc')
    ax2.legend()

    plt.show() 

In [3]:
def top10_accuracy_scorer(gt_idx, top10_idx):

    aciertos = 0

    for arr, gt in zip(top10_idx,gt_idx):
        if gt in arr:
            aciertos+=1
            
    top_10_accuracy =  aciertos / len(gt_idx)
    return top_10_accuracy

## DATA

In [4]:
train = pd.read_csv('../data/processed/train.csv')
test = pd.read_csv('../data/processed/test.csv')
print('Train: ',train.shape)
print('Test: ',test.shape)

Train:  (67447, 43)
Test:  (18816, 42)


## TOKENIZER

In [7]:
from Bio.Seq import Seq

def dna_to_protein(sequences):
    seqs = []
    for seq in tqdm(sequences,total=len(sequences)):
        try:
            seqs.append(str(Seq(seq).translate()))
        except e:
            print(e)
            print(seq)
            print('error')
    return seqs

train_protein = dna_to_protein(train.sequence)
test_protein = dna_to_protein(test.sequence)

HBox(children=(FloatProgress(value=0.0, max=67447.0), HTML(value='')))




NameError: name 'e' is not defined

In [None]:
filename = 'corpus.txt'

with open(filename,'w+') as f:
    for i in tqdm(range(len(train_protein)),total=len(train_protein)):
            f.write(train_protein[i])
            f.write('\n')
    for i in tqdm(range(len(test_protein)),total=len(test_protein)):
            f.write(test_protein[i])
            f.write('\n')

In [6]:
%%time
from tokenizers import ByteLevelBPETokenizer, SentencePieceBPETokenizer
# Initialize a tokenizer
tokenizer = SentencePieceBPETokenizer()

# Customize training
tokenizer.train(files='corpus.txt', vocab_size=5000, min_frequency=1)

In [9]:
tokenizer.save_model('../data/features/bert/tok_especial/')

['../data/features/bert/tok_especial/vocab.json',
 '../data/features/bert/tok_especial/merges.txt']

In [18]:
from transformers import PreTrainedTokenizerFast

tokenizer = SentencePieceBPETokenizer.from_pretrained("../data/features/bert/tok/")
vocab_size = 2500

def get_seq_emb(sequences):

    df = []

    for seq in tqdm(sequences,total=len(sequences)):
        input_ids = tokenizer.encode(seq).ids
        emb = np.zeros(vocab_size,dtype=np.int16)
        for id_ in input_ids:
            emb[id_] += 1
        df.append(emb)
    
    df = pd.DataFrame(df,dtype=np.int16)
    print(df.shape)

    return df

In [19]:
train_emb = get_seq_emb(train_protein)
train_emb['sequence_id'] = train.sequence_id.values
test_emb = get_seq_emb(test_protein)
test_emb['sequence_id'] = test.sequence_id.values

HBox(children=(FloatProgress(value=0.0, max=67447.0), HTML(value='')))


(67447, 2500)


HBox(children=(FloatProgress(value=0.0, max=18816.0), HTML(value='')))


(18816, 2500)


In [20]:
%%time
scaler = RobustScaler()
df = pd.concat([train_emb,test_emb],axis=0)
scaler.fit(df.iloc[:,:-1])


df = pd.DataFrame(scaler.transform(train_emb.iloc[:,:-1]))
df['sequence_id'] = train_emb['sequence_id']
train_emb = df

df = pd.DataFrame(scaler.transform(test_emb.iloc[:,:-1]))
df['sequence_id'] = test_emb['sequence_id']
test_emb = df

Wall time: 10 s


In [21]:
train_emb.to_csv('../data/features/bert/tok_especial/train_emb.csv',index=False)
test_emb.to_csv('../data/features/bert/tok_especial/test_emb.csv',index=False)

In [5]:
train_emb = pd.read_csv('../data/features/bert/tok_especial/train_emb.csv')
test_emb = pd.read_csv('../data/features/bert/tok_especial/test_emb.csv')

## N-GRAMS

train_ngram_features = pd.read_csv('../data/features/ngram/5_ngram_train.csv')
test_ngram_features = pd.read_csv('../data/features/ngram/5_ngram_test.csv')

print('Train: ',train_ngram_features.shape)
print('Test: ',test_ngram_features.shape)

%%time
scaler = RobustScaler()
df = pd.concat([train_ngram_features,test_ngram_features],axis=0)
scaler.fit(df.iloc[:,1:])


df = pd.DataFrame(scaler.transform(train_ngram_features.iloc[:,1:]))
df['sequence_id'] = train_ngram_features['sequence_id']
print(df.shape)
train = pd.merge(train,df,how='left',on='sequence_id')


df = pd.DataFrame(scaler.transform(test_ngram_features.iloc[:,1:]))
df['sequence_id'] = test_ngram_features['sequence_id']
print(df.shape)
test = pd.merge(test,df,how='left',on='sequence_id')

## BLAST

In [6]:
path = '../data/features/blast/processed/train.csv'
train_blast = pd.read_csv(path)
print(train_blast.shape)

path = '../data/features/blast/processed/test.csv'
test_blast = pd.read_csv(path)
print(test_blast.shape)

(66739, 6571)
(18606, 6571)


In [7]:
%%time
scaler = RobustScaler()
df = pd.concat([train_blast,test_blast],axis=0)
scaler.fit(df.iloc[:,:-1])

df = pd.DataFrame(scaler.transform(train_blast.iloc[:,:-1]))
df['sequence_id'] = train_blast['sequence_id']
train_blast = df


df = pd.DataFrame(scaler.transform(test_blast.iloc[:,:-1]))
df['sequence_id'] = test_blast['sequence_id']
test_blast = df

Wall time: 1min 1s


In [8]:
print(train.shape)
print(test.shape)

print(train_blast.shape)
print(test_blast.shape)

print(train_emb.shape)
print(test_emb.shape)

(67447, 43)
(18816, 42)
(66739, 6571)
(18606, 6571)
(67447, 2501)
(18816, 2501)


In [9]:
train = pd.merge(train,train_blast,how='outer',on='sequence_id')
test = pd.merge(test,test_blast,how='outer',on='sequence_id')

train = pd.merge(train,train_emb,how='inner',on='sequence_id')
test = pd.merge(test,test_emb,how='inner',on='sequence_id')

train.fillna(0,inplace=True)
test.fillna(0,inplace=True)

print(train.shape)
print(test.shape)

#del df, train_emb,test_emb, train_blast,test_blast, tokenizer, scaler

(67447, 9113)
(18816, 9112)


In [10]:
train.drop(['sequence','sequence_id'],inplace=True,axis=1)
test.drop(['sequence_id','sequence'],inplace=True,axis=1)

In [11]:
X = train.drop('target',inplace=False,axis=1)
y = train['target']

In [12]:
"""
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42,sampling_strategy='not majority')
X_res, y_res = ros.fit_resample(X, y)
print(X_res.shape)
"""
#K = 5
#skf = StratifiedKFold(n_splits=K,shuffle=True,random_state=420)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.1, random_state=420,stratify=y)

## INCISO

In [16]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [17]:
def top10_accuracy_scorer(estimator, X, y):

    probas = estimator.predict_proba(X)
    
    top10_idx = np.argpartition(probas, -10, axis=1)[:, -10:]
    
    top10_preds = estimator.classes_[top10_idx]

    mask = top10_preds == np.reshape(np.array(y.values.ravel()),(y.shape[0],1))
    
    top_10_accuracy = mask.any(axis=1).mean()
 
    return top_10_accuracy

In [12]:
K = 5
skf = StratifiedKFold(n_splits=K,shuffle=True,random_state=420)

In [13]:
test_preds = []

for i, (train_index, dev_index) in tqdm(enumerate(skf.split(X, y)),total=K):
    print('\n--------FOLD ',i+1)
    X_t, X_d = X.iloc[train_index], X.iloc[dev_index]
    y_t, y_d = y[train_index], y[dev_index]

    #model = xgb.XGBClassifier(n_estimators=2,objective='multi:softprob',eval_metric="mlogloss",max_depth=3,tree_method='hist',gpu_id=0,verbosity=1,n_jobs=10,random_state=420)
    model = SVC(class_weight='balanced', probability=True)
    #model = RandomForestClassifier(n_estimators=300,max_depth=20,verbose=0,n_jobs=11,random_state=420,max_features=None)
    """
    model = lightgbm.LGBMClassifier(
    objective='multiclass',
    boosting='dart',
    #learning_rate = 0.1,
    #max_depth = 20,
    n_jobs=-2,
    silent=True,
    random_state=420,
    #num_leaves = 400,
    #n_estimators = 400,
    #bagging_fraction = 0.8,
    #feature_fraction = 0.9
    )
    """

    model.fit(X_t, y_t)

    preds = model.predict(X_d)

    acc = accuracy_score(y_d,preds)
    f1 = f1_score(y_d,preds,average='macro')
    top = top10_accuracy_scorer(model, X_d, y_d)

    print('ACC: ',acc)
    print('F1: ', f1)
    print('TOP-10: ',top)

    test_preds.append(model.predict_proba(test))

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


--------FOLD  1

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

Traceback (most recent call last):
  File "c:\repos\GeneticEngineeringAttributionChallenge\env\lib\site-packages\IPython\core\interactiveshell.py", line 3417, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-13-3c9db420fb90>", line 26, in <module>
    model.fit(X_t, y_t)
  File "c:\repos\GeneticEngineeringAttributionChallenge\env\lib\site-packages\sklearn\ensemble\_forest.py", line 386, in fit
    trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
  File "c:\repos\GeneticEngineeringAttributionChallenge\env\lib\site-packages\joblib\parallel.py", line 1042, in __call__
    self.retrieve()
  File "c:\repos\GeneticEngineeringAttributionChallenge\env\lib\site-packages\joblib\parallel.py", line 921, in retrieve
    sel

## INCISO

In [13]:
labs = pd.read_csv('../data/raw/train_labels.csv').columns[1:]

lab_pos = dict()
i = 0
for lab in labs:
    lab_pos[lab]=i
    i+=1

def get_targets(y):
    targets = []
    for lab in y:
        tmp = np.zeros(len(labs))
        idx = lab_pos[lab]
        tmp[idx] = 1
        targets.append(tmp)
    targets = torch.tensor(targets)
    print(targets.shape)
    return targets

In [14]:
%%time
tmp = [torch.from_numpy(arr).float() for arr in X_train.values]
train_dataset = torch.utils.data.TensorDataset(torch.stack(tmp),get_targets(y_train))
tmp = [torch.from_numpy(arr).float() for arr in X_valid.values]
valid_dataset = torch.utils.data.TensorDataset(torch.stack(tmp),get_targets(y_valid))
tmp = [torch.from_numpy(arr).float() for arr in test.values]
test_dataset = torch.utils.data.TensorDataset(torch.stack(tmp))

torch.Size([60702, 1314])
torch.Size([6745, 1314])
Wall time: 54.5 s


In [21]:
train_dataloader = torch.utils.data.DataLoader(
    dataset = train_dataset, 
    batch_size = 8, 
    sampler = torch.utils.data.RandomSampler(train_dataset)
)

In [22]:
valid_dataloader = torch.utils.data.DataLoader(
    dataset = valid_dataset, 
    batch_size = 8, 
    sampler = torch.utils.data.SequentialSampler(valid_dataset)
)

In [23]:
test_dataloader = torch.utils.data.DataLoader(
    dataset = test_dataset, 
    batch_size = 8, 
    sampler = torch.utils.data.SequentialSampler(test_dataset)
)

In [18]:
print(X_train.shape)
print(X_valid.shape)
print(test.shape)

print(y_train.shape)
print(y_valid.shape)

(60702, 9110)
(6745, 9110)
(18816, 9110)
(60702,)
(6745,)


In [27]:
import torch.nn as nn
"""
minus target, sequence and sequence_id
(67447, 43)40
(18816, 42)40
(66739, 6571)6570
(18606, 6571)6570
(67447, 2501)2500
(18816, 2501)2500
"""
class Blast_conv(nn.Module):

    def __init__(self):
        super().__init__()

        self.folder = '../models/Blast_conv/'

        self.input_shape = 6570
        self.out_shape = 1314

        self.stride = 5
        self.c1_out = 2
        self.num_feats = 5
        self.idx_start_blast = 40
        self.idx_end_blast = self.idx_start_blast + self.input_shape

        self.hidden_size = self.c1_out*self.out_shape
        self.intermediate_size = self.hidden_size * 2

        self.c1 = torch.nn.Conv1d(1, self.c1_out, self.num_feats, self.stride, padding = 0)
        self.br_c1 = nn.BatchNorm1d(self.hidden_size)
        self.dp_c1 = nn.Dropout(0.3)
 
        self.l1 = nn.Linear(self.hidden_size,self.intermediate_size)
        self.br_l1 = nn.BatchNorm1d(self.intermediate_size)
        self.ac_l1 = nn.LeakyReLU()
        self.dp_l1 = nn.Dropout(0.3)
   
        self.cls = nn.Linear(self.intermediate_size,self.out_shape)
        
    def forward(self,batch, device):

        inputs = batch[0]
        inputs = inputs[:,self.idx_start_blast:self.idx_end_blast].unsqueeze(1).to(device)

        x = self.c1(inputs).flatten(1)
        x = self.br_c1(x)
        x = self.dp_c1(x)

        x = self.l1(x)
        x = self.br_l1(x)
        x = self.ac_l1(x)
        x = self.dp_l1(x)

        x = self.cls(x)
        
        return x.cpu().float()

class TokeNet(nn.Module):

    def __init__(self):

        super().__init__()

        self.folder = '../models/TokeNet/'

        self.input_shape = 2500
        self.out_shape = 1314

        self.idx_start_tokens = 6570 + 40
        self.idx_end_tokens = self.idx_start_tokens + self.input_shape

        self.hidden_size = self.input_shape * 2
 
        self.l1 = nn.Linear(self.input_shape,self.hidden_size)
        self.br_l1 = nn.BatchNorm1d(self.hidden_size)
        self.ac_l1 = nn.LeakyReLU()
        self.dp_l1 = nn.Dropout(0.3)
        
        self.l2 = nn.Linear(self.hidden_size,self.hidden_size)
        self.br_l2 = nn.BatchNorm1d(self.hidden_size)
        self.ac_l2 = nn.LeakyReLU()
        self.dp_l2 = nn.Dropout(0.3)
        
        self.l3 = nn.Linear(self.hidden_size,self.hidden_size)
        self.br_l3 = nn.BatchNorm1d(self.hidden_size)
        self.ac_l3 = nn.LeakyReLU()
        self.dp_l3 = nn.Dropout(0.3)
        """
        self.l4 = nn.Linear(self.hidden_size,self.hidden_size)
        self.br_l4 = nn.BatchNorm1d(self.hidden_size)
        self.ac_l4 = nn.LeakyReLU()
        self.dp_l4 = nn.Dropout(0.3)

        self.l5 = nn.Linear(self.hidden_size,self.hidden_size)
        self.br_l5 = nn.BatchNorm1d(self.hidden_size)
        self.ac_l5 = nn.LeakyReLU()
        self.dp_l5 = nn.Dropout(0.3)
        """
        self.cls = nn.Linear(self.hidden_size,self.out_shape)
        
    def forward(self,batch, device):

        inputs = batch[0]
        inputs = inputs[:,self.idx_start_tokens:].to(device)

        x = self.l1(inputs)
        x = self.br_l1(x)
        x = self.ac_l1(x)
        x = self.dp_l1(x)
        
        x = self.l2(x)
        x = self.br_l2(x)
        x = self.ac_l2(x)
        x = self.dp_l2(x)
        
        x = self.l3(x)
        x = self.br_l3(x)
        x = self.ac_l3(x)
        x = self.dp_l3(x)
        """
        x = self.l4(x)
        x = self.br_l4(x)
        x = self.ac_l4(x)
        x = self.dp_l4(x)

        x = self.l5(x)
        x = self.br_l5(x)
        x = self.ac_l5(x)
        x = self.dp_l5(x)
        """
        x = self.cls(x)
        
        return x.cpu().float()

class Sauron(nn.Module):

    def __init__(self):

        super().__init__()

        self.folder = '../models/Sauron/'

        self.input_shape = 9000
        self.out_shape = 1314

        self.input_size = 40 + 2500*2 + 4*1314
        self.hidden_size = 5000

        self.blast_conv = Blast_conv()#torch.load('../models/Blast_conv/50.ckpt')
        self.blast_conv.cls = nn.Identity()
        #for p in self.blast_conv.parameters():
           # p.requires_grad = False

        self.tokenet = TokeNet()#torch.load('../models/TokeNet/47.ckpt')
        self.tokenet.cls = nn.Identity()
        #for p in self.tokenet.parameters():
            #p.requires_grad = False
 
        self.l1 = nn.Linear(self.input_size,self.hidden_size)
        self.br_l1 = nn.BatchNorm1d(self.hidden_size)
        self.ac_l1 = nn.LeakyReLU()
        self.dp_l1 = nn.Dropout(0.3)
        
        self.l2 = nn.Linear(self.hidden_size,self.hidden_size)
        self.br_l2 = nn.BatchNorm1d(self.hidden_size)
        self.ac_l2 = nn.LeakyReLU()
        self.dp_l2 = nn.Dropout(0.2)
        """
        self.l3 = nn.Linear(self.hidden_size,self.hidden_size)
        self.br_l3 = nn.BatchNorm1d(self.hidden_size)
        self.ac_l3 = nn.LeakyReLU()
        self.dp_l3 = nn.Dropout(0.2)
        """
        self.cls = nn.Linear(self.hidden_size,self.out_shape)
        

    def forward(self,batch, device):

        inputs = batch[0]
        x1 = inputs[:,0:40].to(device)

        x2 = self.blast_conv(batch,device).to(device)
        x3 = self.tokenet(batch,device).to(device)

        x = torch.cat([x1,x2,x3],dim=1)

        x = self.l1(x)
        x = self.br_l1(x)
        x = self.ac_l1(x)
        x = self.dp_l1(x)
        
        x = self.l2(x)
        x = self.br_l2(x)
        x = self.ac_l2(x)
        x = self.dp_l2(x)
        """
        x = self.l3(x)
        x = self.br_l3(x)
        x = self.ac_l3(x)
        x = self.dp_l3(x)
        """
        x = self.cls(x)
        
        return x.cpu().float()



In [32]:
device = torch.device('cuda')
print(torch.cuda.get_device_name(0))
print(torch.cuda.get_device_properties(0))
model = Sauron().to(device)
scaler = torch.cuda.amp.GradScaler()

GeForce GTX 1070 with Max-Q Design
_CudaDeviceProperties(name='GeForce GTX 1070 with Max-Q Design', major=6, minor=1, total_memory=8192MB, multi_processor_count=16)


In [33]:
from topk.svm import SmoothTopkSVM, MaxTopkSVM 
from sklearn import metrics
from sklearn.utils.class_weight import compute_class_weight

weights = torch.from_numpy(compute_class_weight(class_weight='balanced',classes=labs,y=y))
epochs = 90

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-3)
#optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,patience=5,mode='max')

criterion = torch.nn.BCEWithLogitsLoss(pos_weight=weights)#pos_weight=weights
#criterion = MaxTopkSVM (1314,k=10,alpha=1)

folder = model.folder

In [34]:
print('TRAINING...')

training_stats = []

total_steps = 0

optimizer.zero_grad() 

with tqdm(total=epochs,leave=False) as pbar:
  for epoch_i in range(0, epochs):
    total_train_loss = 0
    model.train()

    #optimizer.zero_grad()
    """
    if epoch_i == 20:
      criterion = MaxTopkSVM(1314,alpha=1,k=10)
    """
    logits = []
    ground_truth = []
    for step, batch in tqdm(enumerate(train_dataloader),total=len(train_dataloader),leave=False):

        optimizer.zero_grad()

        b_labels = batch[1]
        with torch.cuda.amp.autocast(enabled=False):
          b_logits = model(batch,device)
        
        loss = criterion(b_logits,b_labels.detach()).cuda()
        """
        if epoch_i < 20:
          loss = criterion(b_logits,b_labels.detach()).cuda()
        else:
          loss = criterion(b_logits,torch.argmax(b_labels.detach(),dim=1)).cuda()
        """

        scaler.scale(loss).backward()
        total_train_loss += loss.item()
        
        logits.extend(b_logits.detach().numpy())
        ground_truth.extend(np.argmax(b_labels.detach().numpy(),axis=1))    

        scaler.step(optimizer)
        scaler.update()
        total_steps+=1

    y_top10_idx = np.argpartition(logits, -10, axis=1)[:, -10:]
    y_labels = np.argmax(logits,axis=1)

    train_top10 = top10_accuracy_scorer(ground_truth,y_top10_idx)
    train_acc = round(metrics.accuracy_score(ground_truth,y_labels),3)
    avg_train_loss = total_train_loss/len(train_dataloader)

    model.eval()

    total_dev_loss = 0

    logits = []
    ground_truth = []
            
    for step, batch in enumerate(valid_dataloader):

        inputs = batch[0].to(device)
    
        b_labels = batch[1]
        with torch.cuda.amp.autocast(enabled=False):
          with torch.no_grad():
            b_logits = model(batch,device)
        loss = criterion(b_logits,b_labels.detach()).cuda()
        """
        if epoch_i < 20:
          loss = criterion(b_logits,b_labels.detach()).cuda()
        else:
          loss = criterion(b_logits,torch.argmax(b_labels.detach(),dim=1)).cuda()
        """
        scaler.scale(loss)
        total_dev_loss += loss.item()

        logits.extend(b_logits.float().detach().numpy())
        ground_truth.extend(np.argmax(b_labels.detach().numpy(),axis=1))


    y_top10_idx = np.argpartition(logits, -10, axis=1)[:, -10:]
    y_labels = np.argmax(logits,axis=1)

    test_top10 = top10_accuracy_scorer(ground_truth,y_top10_idx)
    test_acc = round(metrics.accuracy_score(ground_truth,y_labels),3)
    avg_dev_loss = total_dev_loss/len(valid_dataloader)

    scheduler.step(test_top10)
 
    training_stats.append(
        {
            'train_loss': avg_train_loss,
            'dev_loss': avg_dev_loss,
            'train_acc': train_acc,
            'train_top10':train_top10,
            'dev_acc': test_acc,
            'dev_top10': test_top10
        }
      )
    
    torch.save(model,folder+str(epoch_i)+'.ckpt')
    
    pbar.update(1)

    print('\nEpoch: ',epoch_i,' train_loss ',avg_train_loss,
            ' dev_loss ',avg_dev_loss,
            ' train acc ',train_acc,
            ' train_top10 ',train_top10,
            ' dev_acc ',test_acc,
            ' dev_top10 ', test_top10
          )

# Show training results
col_names = ['train_loss','train_acc','train_top10','dev_loss', 'dev_acc','dev_top10']
training_stats = pd.DataFrame(training_stats,columns=col_names)
make_plot(training_stats)

TRAINING...


HBox(children=(FloatProgress(value=0.0, max=90.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=7588.0), HTML(value='')))


Epoch:  0  train_loss  0.006889829545553445  dev_loss  0.005974528589323489  train acc  0.024  train_top10  0.07324305624196896  dev_acc  0.068  dev_top10  0.14114158636026686


HBox(children=(FloatProgress(value=0.0, max=7588.0), HTML(value='')))

KeyboardInterrupt: 

In [31]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()

## SVC

In [46]:
model.cls = nn.Identity()

In [63]:
def getFeatures(dataloader):
    X = []
    y = []
    model.eval()
    for step, batch in tqdm(enumerate(dataloader),total=len(dataloader),leave=False):
        with torch.no_grad():
            with torch.cuda.amp.autocast(enabled=True):
                label = batch[1].detach().numpy()

                x = model(batch,device).cpu().detach().numpy()

                X.extend(x)
                y.extend(label)
    
    return np.array(X), np.array(y)

In [64]:
X_train, y_train = getFeatures(train_dataloader)
X_test, y_test = getFeatures(valid_dataloader)

HBox(children=(FloatProgress(value=0.0, max=475.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=53.0), HTML(value='')))

In [65]:
def t_labels(y):
    tmp = []
    for l in y:
        tmp.append(labs[np.argmax(l)])
    return np.array(tmp)

y_train = t_labels(y_train)
y_test = t_labels(y_test)

In [66]:
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [67]:
model1 = RandomForestClassifier()

model1.fit(X_train, y_train)

In [None]:
preds = model1.predict(X_train)

acc = accuracy_score(y_train,preds)
f1 = f1_score(y_train,preds,average='macro')

print('ACC: ',acc)
print('F1: ', f1)

In [None]:
preds = model1.predict(X_test)

acc = accuracy_score(y_test,preds)
f1 = f1_score(y_test,preds,average='macro')

print('ACC: ',acc)
print('F1: ', f1)

## SUBMISSION

In [47]:
model = torch.load('../models/Sauron/25.ckpt').to(device)
model

Sauron(
  (blast_conv): Blast_conv(
    (c1): Conv1d(1, 2, kernel_size=(5,), stride=(5,))
    (br_c1): BatchNorm1d(2628, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (dp_c1): Dropout(p=0.2, inplace=False)
    (l1): Linear(in_features=2628, out_features=5256, bias=True)
    (br_l1): BatchNorm1d(5256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (ac_l1): LeakyReLU(negative_slope=0.01)
    (dp_l1): Dropout(p=0.2, inplace=False)
    (cls): Identity()
  )
  (tokenet): TokeNet(
    (l1): Linear(in_features=2500, out_features=5000, bias=True)
    (br_l1): BatchNorm1d(5000, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (ac_l1): LeakyReLU(negative_slope=0.01)
    (dp_l1): Dropout(p=0.2, inplace=False)
    (cls): Identity()
  )
  (l1): Linear(in_features=10296, out_features=10296, bias=True)
  (br_l1): BatchNorm1d(10296, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (ac_l1): LeakyReLU(negative_slope=0.01)
  (

In [28]:
model.eval()

logits = []
           
for step, batch in enumerate(test_dataloader):
    
    with torch.cuda.amp.autocast():
        with torch.no_grad():
            b_logits = model(batch,device)
            logits.extend(b_logits.float())

In [29]:
probas = nn.Softmax(dim=1)(torch.stack(logits)).detach().numpy()
probas.shape

(18816, 1314)

In [30]:
submission_format = pd.read_csv('../data/raw/submission_format.csv', index_col='sequence_id')

In [31]:
assert submission_format.shape == probas.shape
assert (labs == submission_format.columns).all()

In [32]:
my_submission = pd.DataFrame(data=probas, 
                             columns=labs, 
                             index=submission_format.index)

In [33]:
my_submission.head()

Unnamed: 0_level_0,00Q4V31T,012VT4JK,028IO5W2,03GRNN7N,03Y3W51H,09MQV1TY,0A4AHRCT,0A9M05NC,0B9GCUVV,0CL7QVG8,...,ZQNGGY33,ZSHS4VJZ,ZT1IP3T6,ZU6860XU,ZU6TVFFU,ZU75P59K,ZUI6TDWV,ZWFD8OHC,ZX06ZDZN,ZZJVE4HO
sequence_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
E0VFT,3.478298e-35,3.414078e-29,5.97574e-34,7.448131e-37,6.236244e-36,2.220257e-33,2.4311450000000002e-27,2.508318e-27,2.879273e-32,3.858228e-34,...,6.5630669999999995e-34,6.638447e-36,9.164094e-35,1.417228e-31,3.69706e-32,2.5264820000000003e-31,3.676009e-31,2.708901e-35,2.5516559999999998e-30,2.538643e-25
TTRK5,2.331653e-33,7.911661e-39,6.080588000000001e-23,1.112369e-32,2.47457e-27,5.092798e-33,4.844973e-27,1.197339e-23,9.353e-30,5.034968e-31,...,1.5759e-41,1.58194e-28,3.297046e-28,2.259915e-33,2.174637e-29,1.035901e-25,5.705366000000001e-31,2.361252e-27,7.668246000000001e-39,2.824361e-31
2Z7FZ,2.559014e-25,1.208792e-25,1.5946700000000001e-27,3.6866079999999997e-26,2.78699e-29,7.155356999999999e-19,4.279096e-22,1.495801e-24,9.952289000000001e-33,1.781663e-30,...,3.0216009999999998e-24,1.421314e-26,2.366701e-25,1.079008e-27,2.130436e-23,1.592272e-24,3.443529e-25,4.2013970000000005e-27,1.0179050000000001e-25,5.172437e-21
VJI6E,4.3360909999999994e-38,3.0127920000000002e-43,1.2652860000000001e-33,3.8265869999999996e-38,2.162204e-42,6.381429e-40,6.695681e-32,2.827487e-37,5.0891820000000005e-25,2.246151e-35,...,7.086933e-31,1.401298e-45,1.5003039999999999e-30,3.589339e-35,1.1166110000000001e-33,4.067254e-35,3.975409e-31,2.709375e-35,5.380131e-32,1.09304e-32
721FI,2.195451e-31,6.848312e-25,3.3952670000000003e-28,3.6196870000000002e-31,1.224524e-30,7.651352e-28,3.4977750000000003e-25,6.554407e-31,6.47998e-29,3.740214e-34,...,9.739225999999999e-21,9.282143000000001e-29,2.592318e-30,6.926968000000001e-27,6.897903e-29,4.310105e-26,2.655379e-26,1.277509e-21,1.827762e-29,1.238589e-32


In [34]:
my_submission.to_csv('../submissions/submission_mlp_sau_bl.csv')