In [1]:
import copy
import os
import pickle
import sys

import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics

In [2]:
!conda install --yes --prefix {sys.prefix} pytorch torchvision torchaudio -c pytorch

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [3]:
%pip install pytorch-crf

Note: you may need to restart the kernel to use updated packages.


In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torchcrf import CRF

In [5]:
def load_data(train_label_file, test_label_file, train_data_file, test_data_file):
    PATH_TO_SAVED_DATA = os.getcwd() + "/Processed Data/"

    # Load Data
    training_labels = pickle.load(open(PATH_TO_SAVED_DATA + train_label_file, "rb"))
    testing_labels = pickle.load(open(PATH_TO_SAVED_DATA + test_label_file, "rb"))
    training_data = pickle.load(open(PATH_TO_SAVED_DATA + train_data_file, "rb"))
    testing_data = pickle.load(open(PATH_TO_SAVED_DATA + test_data_file, "rb"))
    
    # Convert labels to same format as data
    training_labels = np.asarray([[label] for label in training_labels])
    testing_labels = np.asarray([[label] for label in testing_labels])
    
    # Duplicate labels to the length of the sequence
    # Turns the label into a "tag"
    training_labels = np.repeat(training_labels, len(training_data[0]), axis=1)
    testing_labels = np.repeat(testing_labels, len(testing_data[0]), axis=1)
    
    linear_training_data = training_data
    linear_testing_data = testing_data
    
    training_data = np.expand_dims(training_data, axis=2)
    testing_data = np.expand_dims(testing_data, axis=2)
    
    # Convert data and labels to tensors
    dtype = torch.FloatTensor
    training_data = Variable(torch.from_numpy(training_data).type(dtype), requires_grad=False)
    training_labels = Variable(torch.from_numpy(training_labels).type(dtype), requires_grad=False)
    testing_data = Variable(torch.from_numpy(testing_data).type(dtype), requires_grad=False)
    testing_labels = Variable(torch.from_numpy(testing_labels).type(dtype), requires_grad=False)
    
    linear_training_data = Variable(torch.from_numpy(linear_training_data).type(dtype), requires_grad=False)
    linear_testing_data = Variable(torch.from_numpy(linear_testing_data).type(dtype), requires_grad=False)

    #testing_data = 
    #training_labels = 
    #testing_labels = 
    
    # Print confirmation of data and label size
    print("Training labels size:", training_labels.size())
    print("Testing labels size:", testing_labels.size())
    print("Training data size:", training_data.size())
    print("Testing data size:", testing_data.size())
    print("Linear training data size:", linear_training_data.size())
    print("Linear testing data size:", linear_testing_data.size())
    
    
    return training_labels, testing_labels, training_data, testing_data, linear_training_data, linear_testing_data

In [85]:
train_labels, test_labels, train_data, test_data, lin_train_data, lin_test_data = load_data("train_labels.p", 
                                                                                            "test_labels.p", 
                                                                                            "train_data.p", 
                                                                                            "test_data.p")

#lin_train_data = lin_train_data.narrow(1, 0, 15)
#lin_test_data = lin_test_data.narrow(1, 0, 15)

#train_labels = train_labels.narrow(1, 0, 15)
#test_labels = test_labels.narrow(1, 0, 15)

print(lin_train_data)
print(train_labels)
print(lin_test_data)
print(test_labels)
print(lin_train_data.shape)
print(lin_test_data.shape)

Training labels size: torch.Size([3496, 105])
Testing labels size: torch.Size([1498, 105])
Training data size: torch.Size([3496, 105, 1])
Testing data size: torch.Size([1498, 105, 1])
Linear training data size: torch.Size([3496, 105])
Linear testing data size: torch.Size([1498, 105])
tensor([[0.0760, 0.0000, 0.4354,  ..., 0.0156, 0.0625, 0.0313],
        [0.0760, 0.0000, 0.4354,  ..., 0.0625, 0.0313, 0.1250],
        [0.0760, 0.0000, 0.4354,  ..., 0.0625, 0.0781, 0.0625],
        ...,
        [0.0640, 0.0000, 0.4644,  ..., 0.0664, 0.0859, 0.1172],
        [0.0580, 1.0000, 0.4634,  ..., 0.2344, 0.1875, 0.0781],
        [0.0000, 1.0000, 0.4495,  ..., 0.1758, 0.1641, 0.1328]])
tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
tensor([[0.0590, 1.0000, 0.5003,  ..., 0.1250, 0.2188, 0.1563],

In [1]:
def graphResults(epoch, loss, f1, auc):
    """Graph the loss, f1 score, and AUC over each epoch during training.
    
    Keyword arguments:
    epoch -- the number of training iterations
    loss -- array of size [epoch] containing the loss value of each iteration
    f1 -- array of size [epoch] contianing the f1 value of each iteration
    auc -- array of size [epoch] contianing the auc of each iteration
    """
    plt.figure()
    iterations = np.arange(epoch)
    plt.plot(iterations, loss)
    plt.title("Loss During Training")

    plt.figure()
    iterations = np.arange(epoch)
    plt.plot(iterations, f1)
    plt.title("F1 During Training")
    
    plt.figure()
    iterations = np.arange(epoch)
    plt.plot(iterations, auc)
    plt.title("AUC During Training")

    print("Best F1 Epoch:", list(f1).index(max(f1)))
    print("Best AUC Epoch:", list(auc).index(max(auc)))
    

def test_crf(model, emissions, test_labels):
    y_pred = model.predict(emissions, test_labels)
    y_pred = np.asarray([((sum(i) / 105) * 65) for i in y_pred])
    y_score = y_pred
    y_pred = y_pred >= 0.5
    y_pred = y_pred.astype(int)
    y_true = np.asarray(test_labels.detach().numpy())
    y_true = [i[0] for i in y_true]
    
    return y_true, y_pred, y_score

In [9]:
class CRF_Model(nn.Module):
    def __init__(self, batch_size, seq_len, num_tags):
        super().__init__()
        self.batch_size = batch_size
        self.seq_len = seq_len
        self.num_tags = num_tags
        
        # Linear model to calculate emissions
        self.linear = nn.Linear(self.seq_len, self.seq_len)
        self.sigmoid = nn.Sigmoid()
        # CRF tagger
        self.crf = CRF(num_tags, batch_first=True)
    
    def predict(self, x_lin, y):
        y_pred = self.linear(x_lin)
        y_pred = self.sigmoid(y_pred)
        
        emissions = torch.unsqueeze(y_pred, 2)
        
        # Convert emissions final dimension to 2
        # where the final dimension: [P(y=0|x), P(y=1|x)]
        emissions = torch.cat((emissions, y.unsqueeze(2)), dim=-1)
        for i in range(len(emissions)):
            for j in range(len(emissions[i])):
                if emissions[i][j][0] < torch.Tensor([0.5]):
                    emissions[i][j][1] = torch.sub(torch.Tensor([1]), emissions[i][j][0])
                else:
                    emissions[i][j][1] = emissions[i][j][0]
                    emissions[i][j][0] = torch.sub(torch.Tensor([1]), emissions[i][j][1])
        
        #y = y.type(torch.LongTensor)
        
        y_pred = self.crf.decode(emissions)
        return y_pred

    def loss(self, x_lin, y):
        y_pred = self.linear(x_lin)
        y_pred = self.sigmoid(y_pred)
        
        emissions = torch.unsqueeze(y_pred, 2)
        
        # Convert emissions final dimension to 2
        # where the final dimension: [P(y=0|x), P(y=1|x)]
        emissions = torch.cat((emissions, y.unsqueeze(2)), dim=-1)
        for i in range(len(emissions)):
            for j in range(len(emissions[i])):
                if emissions[i][j][0] < torch.Tensor([0.5]):
                    emissions[i][j][1] = torch.sub(torch.Tensor([1]), emissions[i][j][0])
                else:
                    emissions[i][j][1] = emissions[i][j][0]
                    emissions[i][j][0] = torch.sub(torch.Tensor([1]), emissions[i][j][1])
        
        y = y.type(torch.LongTensor)
        
        log_likelihood = self.crf.forward(emissions, y)
        return -log_likelihood

In [156]:
batch_size = len(train_data)
seq_len = len(train_data[0])
num_tags = len(train_data[0][0]) + 1

model = CRF_Model(batch_size, seq_len, num_tags)

models = []

optimizer = torch.optim.SGD(model.parameters(), lr=1e-6)


for epoch in range(3):
    # Forward pass to compute predictions using training_data
    #print("Forward Pass")
    loss = model.loss(lin_train_data, train_labels)
    
    if epoch % 1 == 0:
        print(epoch, loss.item())
        
    y_true, y_pred, y_score = test_crf(model, lin_test_data, test_labels)
    confusion_matrix = metrics.confusion_matrix(y_true, y_pred, labels=[1,0])
    print("Confusion Matrix:\n", confusion_matrix)
    recall = metrics.recall_score(y_true, y_pred)
    print("Recall:", recall)
    accuracy = metrics.accuracy_score(y_true, y_pred)
    print("Accuracy: ", accuracy)
    auc = metrics.roc_auc_score(y_true, y_score)
    print("AUC:      ", auc)
    
    models.append(copy.deepcopy(model))

    # Zero gradients, perform backward pass, and update weights
    #print("Zero Gradients")
    model.zero_grad()
    #print("Backward Pass")
    loss.backward()
    #print("Update Weights")
    optimizer.step()

0 295987.90625
Confusion Matrix:
 [[749   0]
 [745   4]]
Recall: 1.0
Accuracy:  0.5026702269692924
AUC:       0.4993342257856937
1 21597935616.0
Confusion Matrix:
 [[  0 749]
 [  0 749]]
Recall: 0.0
Accuracy:  0.5
AUC:       0.5
2 44510953472.0
Confusion Matrix:
 [[749   0]
 [749   0]]
Recall: 1.0
Accuracy:  0.5
AUC:       0.5


In [155]:
saved_models.append(models[0])
len(saved_models)

15

In [201]:
y_true, y_pred, y_score = test_crf(saved_models[14], lin_test_data, test_labels)

In [202]:
#confusion_matrix = metrics.confusion_matrix(y_true, y_pred, labels=[1,0])
#print("Confusion Matrix:\n", confusion_matrix)

recall = metrics.recall_score(y_true, y_pred)
print("Recall:   ", recall)

accuracy = metrics.accuracy_score(y_true, y_pred)
print("Accuracy: ", accuracy)

#precision = metrics.precision_score(y_true, y_pred)
#print("Precision:", precision)

#f1 = metrics.f1_score(y_true, y_pred)
#print("F1:       ", f1)

auc = metrics.roc_auc_score(y_true, y_score)
print("AUC:      ", auc)

Recall:    0.7797062750333779
Accuracy:  0.6608811748998665
AUC:       0.6822875538546278


In [203]:
CRF_recall.append(recall)
CRF_accuracy.append(accuracy)
CRF_auc.append(auc)

len(CRF_recall)

15

In [204]:
path = os.getcwd() + "/Results/"
pickle.dump(saved_models, open(path + "CRF_Models.p", "wb"))
pickle.dump(CRF_recall, open(path + "CRF_recall.p", "wb"))
pickle.dump(CRF_accuracy, open(path + "CRF_accuracy.p", "wb"))
pickle.dump(CRF_auc, open(path + "CRF_auc.p", "wb"))