In [1]:
import pandas as pd 
import numpy as np 

from sklearn.model_selection import train_test_split

from tqdm import tqdm

from sklearn.metrics import accuracy_score, f1_score

In [2]:
train = pd.read_csv('../data/processed/train.csv')
test = pd.read_csv('../data/processed/test.csv')
print('Train: ',train.shape)
print('Test: ',test.shape)

Train:  (67447, 43)
Test:  (18816, 42)


train_ngram_features = pd.read_csv('../data/features/ngram/5_ngram_train.csv')
test_ngram_features = pd.read_csv('../data/features/ngram/5_ngram_test.csv')

print('Train: ',train_ngram_features.shape)
print('Test: ',test_ngram_features.shape)

In [3]:
FOLD = 10
df = pd.DataFrame()
for f_idx in tqdm(range(1,FOLD+1)):
    path = '../data/features/blast/'+str(FOLD)+'/dev_result_'+str(f_idx)+'.csv'
    new_df = pd.read_csv(path)
    df = pd.concat([df, new_df], ignore_index=True,axis=0)
del new_df
print(df.shape)

100%|██████████| 10/10 [06:15<00:00, 37.51s/it](66739, 14455)



In [4]:
df.sequence_id.duplicated().any()

False

In [5]:
%%time
#df = df.groupby('sequence_id', as_index=False).mean()
#print(df.shape)

Wall time: 0 ns


In [6]:
#00Q4V31Thits,00Q4V31Tidentity,00Q4V31Talignment length,00Q4V31Tmismatches,00Q4V31Tgap opens,00Q4V31Tq. start,00Q4V31Tq. end,00Q4V31Ts. start,00Q4V31Ts. end,00Q4V31Tevalue,00Q4V31Tbit score,
print(df.shape)
columnVals = df.columns.map(lambda x: ('hits' in x) | ('bit score in x' in x) | ('sequence_id' in x))
#columnVals = df.columns.map(lambda x: ('hits' in x) | ('identity' in x) | ('alignment length' in x) | ('mismatches' in x) | ('gap opens' in x) | ('bit score in x' in x) | ('sequence_id' in x))
df = df.loc[:,columnVals]
df.shape

(66739, 14455)


(66739, 1315)

In [7]:
%%time
blast = pd.merge(train,df,how='left',on='sequence_id')
blast.shape

Wall time: 2.63 s


(67447, 1357)

In [8]:
blast.fillna(0,inplace=True)
#blast.dropna(inplace=True)

In [9]:
def top10_accuracy_scorer(estimator, X, y):

    #probas = estimator.predict_proba(X)
    
    top10_idx = np.argpartition(probas, -10, axis=1)[:, -10:]
    
    top10_preds = estimator.classes_[top10_idx]

    mask = top10_preds == np.reshape(np.array(y.values.ravel()),(y.shape[0],1))
    
    top_10_accuracy = mask.any(axis=1).mean()
 
    return top_10_accuracy

In [75]:
# Rename our feature array
#train.drop(['sequence','sequence_id'],inplace=True,axis=1)
blast.drop(['sequence_id','sequence'],inplace=True,axis=1)

KeyError: "['sequence_id' 'sequence'] not found in axis"

In [77]:
X = blast.drop('target',inplace=False,axis=1)
y = blast['target']

In [78]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=420,stratify=y)

In [79]:
import torch

In [80]:
labs = pd.read_csv('../data/raw/train_labels.csv').columns[1:]

lab_pos = dict()
i = 0
for lab in labs:
    lab_pos[lab]=i

def get_targets(y):
    targets = []
    for lab in y:
        tmp = np.zeros(len(labs))
        idx = lab_pos[lab]
        tmp[idx] = 1
        targets.append(tmp)
    targets = torch.tensor(targets)
    print(targets.shape)
    return targets

In [81]:
X_train = torch.tensor(X_train.values.astype(np.float32))
y_train = get_targets(y_train)

torch.Size([53957, 1314])


In [82]:
X_test = torch.tensor(X_test.values.astype(np.float32)) 
y_test = get_targets(y_test) 

torch.Size([13490, 1314])


In [83]:
train_dataset = torch.utils.data.TensorDataset(X_train, y_train) 
train_dataloader = torch.utils.data.DataLoader(dataset = train_dataset, 
    batch_size = 16, 
    sampler = torch.utils.data.RandomSampler(train_dataset)
)

In [84]:
test_dataset = torch.utils.data.TensorDataset(X_test, y_test) 
test_dataloader = torch.utils.data.DataLoader(dataset = test_dataset, 
    batch_size = 32, 
    sampler = torch.utils.data.SequentialSampler(test_dataset)
)

In [85]:
print(X_train.shape)
print(X_test.shape)

print(y_train.shape)
print(y_test.shape)

torch.Size([53957, 1354])
torch.Size([13490, 1354])
torch.Size([53957, 1314])
torch.Size([13490, 1314])


In [95]:
import torch.nn as nn

class Net(nn.Module):

    def __init__(self):
        super().__init__()

        self.ac = nn.ReLU()
        
        self.l1 = nn.Linear(1354,1354*2)
        self.l2 = nn.Linear(1354*2,1354*2)
        self.l3 = nn.Linear(1354*2,1354*2)
        self.l4 = nn.Linear(1354*2,1354*2)
        self.l5 = nn.Linear(1354*2,1314)
    
    def forward(self,inputs):

        x = self.l1(inputs)
        x = self.ac(x)

        x = self.l2(x)
        x = self.ac(x)

        x = self.l3(x)
        x = self.ac(x)

        x = self.l4(x)
        x = self.ac(x)

        x = self.l5(x)

        return x.cpu()


In [103]:
epochs = 50

device = torch.device('cuda')

model = Net()
model.to(device)

learning_rate = 2e-2
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

criterion = torch.nn.BCEWithLogitsLoss()

In [104]:
for epoch_i in range(epochs):
    print('EPOCH: ',epoch_i)
    ground_truth = []
    preds = []
    total_loss=0
    print('Training...')
    for step, batch in tqdm(enumerate(train_dataloader),total=len(train_dataloader)):

        inputs = batch[0].to(device)
        b_target = batch[1]

        b_logits = model(inputs)

        loss=criterion(b_target,b_logits)
        total_loss+=loss.item()

        preds.extend(nn.Sigmoid()(b_logits).detach().numpy())
        ground_truth.extend(b_target.detach().numpy())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    acc = accuracy_score(ground_truth,preds)
    f1 = f1_score(ground_truth,preds,average='macro')
    #top = top10_accuracy_scorer(model, X_train, y_train)

    print('Loss: ',total_loss/len(train_dataloader))
    print('ACC: ',acc)
    print('F1: ', f1)
    #print('TOP-10: ',top)

0%|          | 1/3373 [00:00<06:47,  8.26it/s]EPOCH:  0
Training...
100%|██████████| 3373/3373 [04:48<00:00, 11.71it/s]


ValueError: Classification metrics can't handle a mix of multilabel-indicator and continuous-multioutput targets