In [2]:
import csv
import numpy as np
from sklearn.metrics import accuracy_score, log_loss
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression


# Read sequences
sequences = list()
with open('sequences.txt', 'r') as f:
    for line in f:
        sequences.append(line[:-1])

# Split data into training and test sets
sequences_train = list()
sequences_test = list()
proteins_test = list()
y_train = list()
with open('graph_labels.txt', 'r') as f:
    for i,line in enumerate(f):
        t = line.split(',')
        if len(t[1][:-1]) == 0:
            proteins_test.append(t[0])
            sequences_test.append(sequences[i])
        else:
            sequences_train.append(sequences[i])
            y_train.append(int(t[1][:-1]))

# Map sequences to 
vec = TfidfVectorizer(analyzer='char', ngram_range=(1, 3))
X_train = vec.fit_transform(sequences_train)
X_test = vec.transform(sequences_test)

# Train a logistic regression classifier and use the classifier to
# make predictions
clf = LogisticRegression(solver='liblinear')
clf.fit(X_train, y_train) 

y_pred_proba = clf.predict_proba(X_test)

# Write predictions to a file
with open('sample_submission_seq.csv', 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    lst = list()
    for i in range(18):
        lst.append('class'+str(i))
    lst.insert(0, "name")
    writer.writerow(lst)
    for i, protein in enumerate(proteins_test):
        lst = y_pred_proba[i,:].tolist()
        lst.insert(0, protein)
        writer.writerow(lst)

In [24]:
print(clf.predict_proba(X_train).shape)
print(y_pred_proba.shape)

(4888, 18)
(1223, 18)


In [19]:
import csv
import time
import numpy as np
import scipy.sparse as sp
from sklearn.metrics import accuracy_score, log_loss

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim

# Read sequences
sequences = list()
with open('sequences.txt', 'r') as f:
    for line in f:
        sequences.append(line[:-1])

# Split data into training and test sets
sequences_train = list()
sequences_test = list()
proteins_test = list()
y_train = list()
with open('graph_labels.txt', 'r') as f:
    for i,line in enumerate(f):
        t = line.split(',')
        if len(t[1][:-1]) == 0:
            proteins_test.append(t[0])
            sequences_test.append(sequences[i])
        else:
            sequences_train.append(sequences[i])
            y_train.append(int(t[1][:-1]))

In [4]:
def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """
    Function that converts a Scipy sparse matrix to a sparse Torch tensor
    """
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    return torch.sparse.FloatTensor(indices, values, shape)

In [5]:
import numpy as np
print(np.shape(X_train))
print(sequences_train[0].todense())
print(X_train[0])
tensor = sparse_mx_to_torch_sparse_tensor(X_train)
print(tensor)

(4888, 8466)


AttributeError: 'str' object has no attribute 'todense'

# First approach encoding with one hot

In [5]:
import torch.nn.functional as F
from torch import optim
import torch.nn as nn
import torch

class ProtCNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, batch_size, dropout=0.5):
        super(ProtCNN, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                          embedding_dim=embed_dim,
                                          padding_idx=0,
                                          max_norm=5.0)
        self.bn1 = nn.BatchNorm1d(1)
        self.bn2 = nn.BatchNorm1d(128)
        self.dropout = nn.Dropout(dropout)
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=128, kernel_size=1, stride=1, dilation=1, padding='same')
        self.conv2 = nn.Conv1d(in_channels=128, out_channels=128, kernel_size=1, stride=1, dilation=2, padding='same')
        self.conv3 = nn.Conv1d(in_channels=128, out_channels=128, kernel_size=3, stride=1, dilation=3, padding='same')
        self.maxpool1d = nn.MaxPool1d(kernel_size=3, stride=None, padding=0, dilation=1, return_indices=False, ceil_mode=False)
        self.relu = nn.ReLU()
        self.fc = nn.Linear(128, 18)
        
        
    def residual_block(self, x_in):
        """
        _data: input
        _filters: convolution filters
        _d_rate: dilation rate
        """

        shortcut = x_in
        
        x = self.bn1(x_in)
        x = self.relu(x)
        x = self.conv2(x)

        #bottleneck convolution
        x = self.bn2(x)
        x = self.relu(x)
        out = self.conv3(x)

        #skip connection
        out += shortcut
        return x
    
    def forward(self, x_in):
        x = self.embedding(x_in)
        x_reshaped = x.permute(0, 2, 1)
        x = self.conv1(x_reshaped)
        x = residual_block(x)
        #x = residual_block(x)
        x = self.maxpool1d(x)
        out = self.dropout(x)
       
        # softmax classifier
        out =  F.log_softmax(out, dim=1)
        
        return out


In [6]:
# Initializes device
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Hyperparameters
epochs = 10
batch_size = 64
learning_rate = 0.01

# Extract dataset

# Read sequences
sequences = list()
with open('sequences.txt', 'r') as f:
    for line in f:
        sequences.append(line[:-1])

# Split data into training and test sets
sequences_train = list()
sequences_test = list()
proteins_test = list()
y_train = list()
with open('graph_labels.txt', 'r') as f:
    for i,line in enumerate(f):
        t = line.split(',')
        if len(t[1][:-1]) == 0:
            proteins_test.append(t[0])
            sequences_test.append(sequences[i])
        else:
            sequences_train.append(sequences[i])
            y_train.append(int(t[1][:-1]))

# Map sequences to 
vec = TfidfVectorizer(analyzer='char', ngram_range=(1, 3))
X_train = vec.fit_transform(sequences_train)
X_test = vec.transform(sequences_test)

N_train = X_train.shape[0]
N_test = X_test.shape[0]

# Initializes model and optimizer
model = ProtCNN(batch_size, dropout=0.5).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
loss_function = nn.CrossEntropyLoss()


In [7]:
# Eval
for epoch in range(epochs):
    t = time.time()
    model.train()
    
    train_loss = 0
    correct = 0
    count = 0
    for i in range(0, N_train, batch_size):

        X_batch = X_train[i:min(N_train, i+batch_size)]
        X_batch = sparse_mx_to_torch_sparse_tensor(X_batch).to(device)
        y_batch = torch.LongTensor(y_train[i:min(N_train, i+batch_size)]).to(device)

        optimizer.zero_grad()
        output = model(X_batch)
        print(output.size(), y_batch.size())
        loss = loss_function(output, y_batch)
        train_loss += loss.item() * output.size(0)
        count += output.size(0)
        preds = output.max(1)[1].type_as(y_batch)
        correct += torch.sum(preds.eq(y_batch).double())
        loss.backward()
        optimizer.step()
    
    if epoch % 10 == 0:
        print('Epoch: {:04d}'.format(epoch+1),
              'loss_train: {:.4f}'.format(train_loss / count),
              'acc_train: {:.4f}'.format(correct / count),
              'time: {:.4f}s'.format(time.time() - t))
        
print('Optimization finished!')

RuntimeError: Given groups=1, weight of size [128, 1, 1], expected input[1, 64, 8466] to have 1 channels, but got 64 channels instead

In [None]:
# Evaluates the model
model.eval()
test_loss = 0
correct = 0
count = 0
for i in range(0, N_test, batch_size):
    adj_batch = list()
    idx_batch = list()
    y_batch = list()

    ############## Task 7
    
    ##################
    g_batch = G_test[i:i+batch_size]
    adj_batch = sp.block_diag([nx.adjacency_matrix(g) for g in g_batch])
    adj_batch = sparse_mx_to_torch_sparse_tensor(adj_batch)

    for j, G in enumerate(g_batch):
        idx_batch.extend(G.number_of_nodes()*[j])
    idx_batch = torch.LongTensor(idx_batch).to(device)
    features_batch = torch.ones((len(idx_batch), 1)).to(device)
    y_batch = torch.LongTensor(y_test[i:i+batch_size]).to(device)
    ##################

    output = model(features_batch, adj_batch, idx_batch)
    loss = loss_function(output, y_batch)
    test_loss += loss.item() * output.size(0)
    count += output.size(0)
    preds = output.max(1)[1].type_as(y_batch)
    correct += torch.sum(preds.eq(y_batch).double())

print('loss_test: {:.4f}'.format(test_loss / count),
      'acc_test: {:.4f}'.format(correct / count),
      'time: {:.4f}s'.format(time.time() - t))
