In [1]:
# Record which interpreter and Python version executed this notebook
import sys
print(sys.executable)
print(sys.version)
print(sys.version_info)

/usr/local/bin/python3.8
3.8.0 (v3.8.0:fa919fdf25, Oct 14 2019, 10:23:27) 
[Clang 6.0 (clang-600.0.57)]
sys.version_info(major=3, minor=8, micro=0, releaselevel='final', serial=0)


In [2]:
# !pip install tensorboard --user
# !pip install numpy 
# !pip install pandas
# !pip install torch
# !pip install scikit-learn
# !pip install tqdm
# !pip install --upgrade pip
# !pip install nltk

In [2]:
from datetime import datetime

import nltk
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from nltk import word_tokenize
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
# from torch.utils.tensorboard import SummaryWriter
# from query import get_embedding_model, get_keyed_word_vectors_pickle

In [3]:

def get_word_embedding(model, word: str):
    """
    Look up the embedding of a single word.

    Three kinds of ``model`` are supported and tried in this order:

    1. a full embedding model exposing ``model.wv[word]`` (e.g. FastText),
    2. a subscriptable store such as keyed word vectors (``model[word]``),
    3. a plain callable (``model(word)``).

    :param model: embedding source in one of the forms above
    :param word: token to look up
    :return: whatever the underlying lookup returns for ``word``
    """
    try:  # if loaded directly from embedding model, e.g., FastText
        return model.wv[word]
    except AttributeError:  # if we use a pseudo-model, Keyed Word Vectors over Vocabulary
        try:
            return model[word]
        except Exception:
            # Only ordinary lookup failures fall through to the callable form; a bare
            # ``except`` would also swallow KeyboardInterrupt / SystemExit.
            return model(word)
            


def get_sentence_tensor(embedding_model, sentence: str, seq_len: int = 50):
    """
    Assemble a sentence tensor by directly loading word embeddings from a pre-trained embedding model.

    The sentence is tokenized with ``word_tokenize`` and at most ``seq_len`` tokens are embedded.

    :param embedding_model: embedding source accepted by ``get_word_embedding``
    :param sentence: raw sentence text
    :param seq_len: maximum number of tokens to keep
    :return: ``torch.FloatTensor`` built from the stacked token embeddings
             (presumably ``(n_tokens, embedding_dim)`` when each embedding is a 1-D vector - TODO confirm)
    """
    # Slice rather than break on ``i > seq_len``: that comparison let seq_len + 1 tokens through.
    tokens = word_tokenize(sentence)[:max(seq_len, 0)]
    sent_arr = [get_word_embedding(embedding_model, word) for word in tokens]
    return torch.FloatTensor(np.array(sent_arr))

In [4]:
class EarlyTrainingStop:
    """
    Early stopping of training when validation loss stops improving.

    The best loss seen so far is kept in ``validation_loss``. A call to ``early_stop`` counts as
    "no improvement" when the best loss is less than or equal to the new loss plus ``delta``.
    Once more than ``patience`` consecutive calls show no improvement, ``early_stop`` returns True.
    """

    def __init__(self, validation_loss: float = np.inf, delta: float = 0.0, counter: int = 0, patience: int = 1):
        """
        :param validation_loss: initial best validation loss
        :param delta: tolerance added to the new loss before comparing with the best loss
        :param counter: initial number of consecutive non-improving checks
        :param patience: number of consecutive non-improving checks tolerated before stopping
        """
        self.validation_loss = validation_loss
        self.delta = delta
        self.counter = counter
        self.patience = patience

    def early_stop(self, validation_loss: float) -> bool:
        """
        Register a new validation loss and report whether training should stop.

        :param validation_loss: validation loss of the latest epoch
        :return: True when the patience is exhausted, otherwise False (never None)
        """
        if self.validation_loss <= validation_loss + self.delta:
            self.counter += 1
            return self.counter > self.patience
        # Improvement: remember the new best loss and restart the patience window
        self.counter = 0
        self.validation_loss = validation_loss
        return False

In [5]:
# pad a batch of sentence tensors
def pad_batch(batch_sent_arr):
    """
    Zero-pad every sentence tensor of a batch to the length of the longest one.

    :param batch_sent_arr: list of 2-D sentence tensors (rows are tokens)
    :return: list of float32 tensors that all have as many rows as the longest input
    """
    longest = max(sent.shape[0] for sent in batch_sent_arr)

    def _pad(sent):
        # Start from an all-zero canvas and copy the real rows into its top part
        canvas = torch.zeros(longest, sent.shape[1], dtype=torch.float32)
        canvas[:sent.shape[0]] = sent
        return canvas

    return [_pad(sent) for sent in batch_sent_arr]


def batch_str_to_batch_tensors(sentence_list, embedding_model, seq_len: int = 50):
    """
    Turn a batch of raw sentences into one stacked batch tensor.

    Each sentence is cast to ``str``, embedded via ``get_sentence_tensor``, padded to the longest
    sentence of the batch with ``pad_batch`` and finally stacked along a new leading dimension.
    """
    sentence_tensors = []
    for raw_sentence in sentence_list:
        sentence_tensors.append(
            get_sentence_tensor(embedding_model=embedding_model, sentence=str(raw_sentence), seq_len=seq_len))
    # equal lengths are required before stacking
    return torch.stack(pad_batch(sentence_tensors))

In [6]:
class LSTM(nn.Module):
    """
    (Bi)LSTM sentence classifier: an ``nn.LSTM`` encoder followed by a linear layer and softmax.

    ``forward`` classifies each sequence from the LSTM output at its last time step.
    """

    def __init__(self, input_size=300, hidden_size=256, num_layers=2, label_size=2, bidirectional=True,
                 batch_first=True):
        """
        :param input_size: size of each input (word embedding) vector
        :param hidden_size: LSTM hidden state size per direction
        :param num_layers: number of stacked LSTM layers
        :param label_size: number of output classes
        :param bidirectional: whether the LSTM runs in both directions
        :param batch_first: whether inputs are laid out as (batch, seq, feature)
        """
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # Store the actual arguments (``bidirectional`` used to be hard-coded to True here)
        self.bidirectional = bidirectional
        self.batch_first = batch_first
        dt = datetime.now().strftime("%Y-%m-%d-%H-%M")
        self.name = f'LSTM_bin_classifier-{dt}.pt'
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers,
                            bidirectional=bidirectional, batch_first=batch_first)
        # Number of directions; the LSTM output feature size is D * hidden_size
        self.D = 2 if bidirectional else 1
        self.hidden2label = nn.Linear(in_features=self.D * hidden_size, out_features=label_size)

    def forward(self, sent):
        """
        :param sent: batch of embedded sentences, laid out according to ``batch_first``
        :return: class probabilities of shape (batch, label_size); each row sums to 1
        """
        out, _ = self.lstm(sent)
        # Pick the last time step along the sequence axis, wherever that axis is.
        # NOTE(review): with zero-padded batches the last step may be a padding position - confirm intended.
        out = out[:, -1, :] if self.batch_first else out[-1]
        out = self.hidden2label(out)
        return F.softmax(out, dim=1)

In [7]:
from sklearn.utils import resample

In [None]:
class FNS2021(Dataset):
    def __init__(self, file: str, type: str = 'training', train_ratio: float = 0.9, random_state: int = 1,
                 downsample_rate: float = None):
        """
        Custom class for FNS 2021 Competition to load training and validation data.
        Original validation data is used as testing.

        :param file: path of the CSV corpus; the code reads its ``sent`` and ``label`` columns
        :param type: 'training', 'validation' or 'testing'. 'testing' keeps every row; the other two
                     take their side of a stratified train/validation split of the file
        :param train_ratio: share of rows assigned to the training side of the split
        :param random_state: seed for the split and for downsampling
        :param downsample_rate: if given, share of 0-labeled training rows to drop (training only)
        :raises ValueError: if ``type`` is not one of the three supported values
        """
        # Fail early: an unknown type used to surface later as an AttributeError on sent_labels_df
        if type not in ('training', 'validation', 'testing'):
            raise ValueError(f"type must be 'training', 'validation' or 'testing', got {type!r}")
        self.total_data_df = pd.read_csv(file).drop(columns=['Unnamed: 0'], errors='ignore')
        # Keep the original row position as a 'sent_index' column so rows stay traceable after splitting
        self.total_data_df.index.name = 'sent_index'
        self.total_data_df.reset_index(inplace=True)
        if type == 'testing':
            selected_df = self.total_data_df
        else:
            train_df, validation_df = train_test_split(self.total_data_df, test_size=1 - train_ratio,
                                                       random_state=random_state,
                                                       stratify=self.total_data_df.label)
            if type == 'training':
                if downsample_rate is not None:
                    train_df = self.downsample(df=train_df, rate=downsample_rate, random_state=random_state)
                selected_df = train_df
            else:
                selected_df = validation_df
        # __getitem__ relies on a contiguous 0-based index. A non-inplace reset avoids mutating
        # a slice of total_data_df.
        self.sent_labels_df = selected_df.reset_index(drop=True)

    def downsample(self, df: pd.DataFrame, rate: float = 0.5, random_state: int = 1):
        """
        Drop a share ``rate`` of the 0-labeled rows and keep every 1-labeled row.

        :param df: frame with ``label`` and ``sent_index`` columns
        :param rate: fraction of 0-labeled rows to remove (values outside [0, 1] are rejected by ``resample``)
        :param random_state: seed for the sampling
        :return: the reduced frame, ordered by ``sent_index``
        """
        summary_df = df.loc[df['label'] == 1]
        non_summary_df = df.loc[df['label'] == 0]
        # Sample WITHOUT replacement: with replacement the kept subset would contain duplicated rows
        non_summary_df = resample(non_summary_df,
                                  replace=False,
                                  n_samples=int(len(non_summary_df) * (1 - rate)),
                                  random_state=random_state)
        df = pd.concat([summary_df, non_summary_df]).sort_values(['sent_index'])
        # TODO: Downsample only when report data is predominantly 0-labeled
        return df

    def __len__(self):
        """Number of rows in the selected split."""
        return len(self.sent_labels_df)

    def __getitem__(self, idx):
        """Return the ``(sent, label)`` pair stored at index label ``idx``."""
        sent = self.sent_labels_df.loc[idx, 'sent']
        label = self.sent_labels_df.loc[idx, 'label']
        return sent, label

In [None]:
# Paths are relative to the notebook's working directory
root = '..'
config = {'batch_size': 16}
print('Loading Training Data')
data_filename = 'training_corpus_2023-02-07 16-33.csv'
training_data = FNS2021(file=f'{root}/tmp/{data_filename}', type='training', downsample_rate=None)  # downsample_rate=None: no downsampling is applied
# NOTE: `config` is a dict, so the batch size is read with config['batch_size']
# train_dataloader = DataLoader(training_data, batch_size=config['batch_size'], drop_last=True)

In [None]:
# Full corpus as read from the CSV, with the added `sent_index` column (before any split)
training_data.total_data_df

In [None]:
# Rows selected for the 'training' split
training_data.sent_labels_df

In [None]:
print('Loading Validation Data')
# FNS2021 chooses its split through `type`; it has no `training` keyword, so passing
# `training=False` raised a TypeError.
validation_data = FNS2021(file=f'{root}/tmp/{data_filename}', type='validation',
                          downsample_rate=None)  # use all validation data
# validation_dataloader = DataLoader(validation_data, batch_size=config['batch_size'], drop_last=True)

In [None]:
# Rows selected for the validation split
validation_data.sent_labels_df

In [None]:
# Leakage check: validation rows whose sent_index also occurs in the training split
# (should be empty if the two splits are disjoint)
validation_data.sent_labels_df.loc[validation_data.sent_labels_df.sent_index.isin(training_data.sent_labels_df.sent_index)]

In [None]:
# Raw held-out corpus (the file named "validation_corpus"), loaded directly as a DataFrame
testing_df = pd.read_csv('../tmp/validation_corpus_2023-02-07 16-33.csv')
testing_df

In [None]:
# NOTE(review): the boolean mask is built from validation_data.sent_labels_df (read from the
# training corpus file) but applied to testing_df (read from the validation corpus file), so the
# two indexes need not line up - confirm what this check is meant to compare.
testing_df.loc[validation_data.sent_labels_df.sent_index.isin(training_data.sent_labels_df.sent_index)]

In [None]:
# NOTE: this first model/model_path pair is overwritten right below and never used
model = LSTM()
model_path = '../tmp/FNS-biLSTM-classification.h5'

# Model actually evaluated: hidden size 128, weights from the checkpoint below
model = LSTM(hidden_size=128)
model_path = '../tmp/model-0.0005-128-0.9-2023-02-21-11-29.h5'

# strict=False: missing or unexpected checkpoint keys are not raised as errors, so a mismatching
# checkpoint can leave layers at their random initialization - inspect the returned key lists.
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')), strict=False)
model.eval()

In [None]:
# NOTE(review): get_embedding_model is not defined in the cells visible here, and its import from
# `query` is commented out in the import cell - TODO restore it before running this cell.
embedding_model = get_embedding_model()

In [None]:
# The original competition validation corpus serves as the test set (type='testing' keeps every row)
print('Loading Testing Data')
testing_data = FNS2021(file=f'../tmp/validation_corpus_2023-02-07 16-33.csv', type='testing')  # use all testing data

In [21]:
# Debug stub: replaces the embedding model with a callable returning one random 0-dim tensor per
# call, ignoring its argument. NOTE: this overwrites any embedding_model assigned earlier.
embedding_model = lambda x: torch.rand(1)[0]

In [22]:
# Sanity check of the stub: the argument is ignored and a random scalar tensor comes back
embedding_model(1)

tensor(0.9775)

In [None]:
from sklearn.metrics import confusion_matrix
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 when the denominator is zero."""
    return float(numerator) / float(denominator) if denominator else 0.0


def test(model, embedding_model, test_dataloader, seq_len, device):
    """
    Evaluate ``model`` on ``test_dataloader`` and print running metrics after every batch.

    Label 1 (the summary class) is treated as the positive class for precision and recall.

    :param model: classifier returning per-class scores of shape (batch, n_classes)
    :param embedding_model: embedding source passed on to ``batch_str_to_batch_tensors``
    :param test_dataloader: iterable yielding ``(sentences, labels)`` batches
    :param seq_len: maximum number of tokens per sentence
    :param device: device the batch tensors and labels are moved to
    :return: ``(last_acc, last_acc_1)`` - mean of the per-batch total accuracies and mean of the
             per-batch summary-class accuracies (zero tensors if the dataloader is empty)
    """
    running_acc = 0.0  # Sum of per-batch accuracies over both classes
    running_acc_1 = 0.0  # Sum of per-batch accuracies on the summary class
    # Defined up front so an empty dataloader returns zeros instead of raising UnboundLocalError
    last_acc = torch.tensor(0.0)
    last_acc_1 = torch.tensor(0.0)

    # Cumulative confusion matrix over all batches seen so far.
    # scikit-learn convention: rows are true labels, columns are predicted labels.
    cm = np.zeros((2, 2), dtype=np.int64)

    model.eval()
    with torch.no_grad():
        for i, (test_data, test_labels) in enumerate(test_dataloader):
            batch_sent_tensor = batch_str_to_batch_tensors(sentence_list=test_data, embedding_model=embedding_model,
                                                           seq_len=seq_len).to(device)
            target = test_labels.long().to(device)
            predicted = model(batch_sent_tensor)
            # Calculate and record per-batch accuracy
            winners = predicted.argmax(dim=1)  # each sentence has p0 and p1 probabilities with p0 + p1 = 1
            corrects = (winners == target)  # match predicted output labels with observed labels
            accuracy = corrects.sum().float() / float(target.size(0))
            running_acc += accuracy
            summary_winners = (corrects * (target == 1)).float()
            # max(..., 1) avoids dividing by zero when a batch holds no summary sentence
            summary_winners_perc = summary_winners.sum() / max((target == 1).sum(), 1)
            running_acc_1 += summary_winners_perc.sum()
            # labels=[0, 1] keeps the matrix 2x2 even if a batch contains a single class;
            # adding per-batch matrices equals the matrix of all labels seen so far
            cm += confusion_matrix(target.cpu().numpy(), winners.cpu().numpy(), labels=[0, 1])
            tn, fp, fn, tp = cm.ravel()
            # fpr / tnr are kept for the (currently commented-out) wandb logging below
            fpr = _safe_ratio(fp, fp + tn)  # false positive rate
            tnr = _safe_ratio(tn, tn + fp)  # true negative rate
            precision = _safe_ratio(tp, tp + fp)
            recall = _safe_ratio(tp, tp + fn)

            last_acc = running_acc / (i + 1)  # total accuracy per batch
            last_acc_1 = running_acc_1 / (i + 1)  # summary sent accuracy per batch
            print('Testing total accuracy: {} summary accuracy: {} precision: {}, recall {}'.format(last_acc, last_acc_1, precision, recall))
        # wandb.log({
        #     "Total Testing Accuracy": last_acc,
        #     "Summary Testing Accuracy": last_acc_1,
        #     "Recall": recall,
        #     "Precision": precision,
        #     'False Positive Rate': fpr,
        #     'True Negative Rate': tnr,
        # })
    return last_acc, last_acc_1

In [None]:
print('Loading Testing Data')
testing_data = FNS2021(file=f'../tmp/validation_corpus_2023-02-07 16-33.csv', type='testing')  # use all testing data

# Restrict the test set to selected reports. The index is reset because FNS2021.__getitem__ looks
# rows up with `.loc[idx]` for idx in 0..len-1, which needs a contiguous 0-based index after filtering.
testing_data.sent_labels_df = testing_data.sent_labels_df.loc[
    testing_data.sent_labels_df.report.isin([31938])].reset_index(drop=True)
# testing_data.sent_labels_df = testing_data.sent_labels_df.loc[testing_data.sent_labels_df.report.isin([31938, 31509, 30830, 31290, 32148, 31333, 31469, 30777, 30950,
#        32809, 33054, 32376, 33097, 32389, 33083, 31681, 32149, 31440])]

In [None]:
# Distribution of the number of sentences per report in the test corpus
testing_df.groupby('report').sent.count().reset_index().sent.describe()

In [None]:
# Report ids remaining in the (filtered) test set
testing_data.sent_labels_df.report.unique()

In [None]:
# batch_size equals the dataset size, so the loader yields one batch holding the whole test set
testing_dataloader = DataLoader(testing_data, batch_size=len(testing_data))

In [None]:
# Evaluate on CPU with sentences truncated to seq_len=100 tokens
test(model=model, embedding_model=embedding_model, test_dataloader=testing_dataloader, seq_len=100, device='cpu')

In [None]:
from tqdm import tqdm

In [None]:
# Per-report scores collected by the evaluation loop below.
# NOTE: re-run this cell before re-running the loop, otherwise entries are appended twice.
accuracies = []

In [None]:
# Score the model report by report; every report is fed to the model as one batch.
model.eval()  # inference mode is loop-invariant, so it is set once
for i in tqdm(testing_df.report.unique()):
    report = testing_df.loc[testing_df.report == i]

    batch = batch_str_to_batch_tensors(list(report.sent), embedding_model, 100)
    target = torch.tensor(list(report.label))
    with torch.no_grad():
        predicted = model(batch)

        winners = predicted.argmax(dim=1)  # each sentence has p0 and p1 probabilities with p0 + p1 = 1
        corrects = (winners == target)  # match predicted output labels with observed labels
        # share of correctly classified sentences in this report
        running_acc = corrects.sum().float() / float(target.size(0))
        # share of summary (label 1) sentences classified correctly; max(..., 1) avoids 0-division
        summary_winners = (corrects * (target == 1)).float()
        running_acc_1 = summary_winners.sum() / max((target == 1).sum(), 1)

        # labels=[0, 1] keeps the matrix 2x2 even when a report contains (or the model predicts)
        # only one class; otherwise cm[1][1] raises IndexError
        cm = confusion_matrix(target.cpu().numpy(), winners.cpu().numpy(), labels=[0, 1])
        cm_acc = (cm[0][0] + cm[1][1]) / cm.sum()

        print(i, '--->', running_acc, cm_acc, running_acc_1)
        accuracies.append({
            'report': i,
            'total_acc': running_acc,
            'summary_acc': running_acc_1
            })

In [None]:
# Collect the per-report scores into a DataFrame
res = pd.DataFrame(accuracies)
# The scores are 0-dim tensors; convert them to NumPy values before saving
res.total_acc = [x.numpy() for x in res.total_acc]
res.summary_acc = [x.numpy() for x in res.summary_acc]
# Written to the current working directory
res.to_csv('testing_scores.csv')
res

In [None]:
# The previous cell already converted the scores to NumPy values, and NumPy values have no
# `.numpy()` method, so repeating that call raised AttributeError. `float()` accepts 0-dim
# tensors as well as NumPy scalars, which makes this cell safe to (re-)run.
res.total_acc = [float(x) for x in res.total_acc]
res.summary_acc = [float(x) for x in res.summary_acc]
res

In [None]:
# 5%|▌         | 1/20 [00:27<08:44, 27.59s/it]31938 ---> tensor(0.7549) tensor(0.8587)
#  10%|█         | 2/20 [01:00<09:09, 30.54s/it]31509 ---> tensor(0.7957) tensor(0.9464)
#  15%|█▌        | 3/20 [01:31<08:46, 30.99s/it]30830 ---> tensor(0.7778) tensor(0.9239)
#  20%|██        | 4/20 [01:43<06:11, 23.22s/it]31290 ---> tensor(0.9528) tensor(0.9028)
#  25%|██▌       | 5/20 [03:55<15:40, 62.73s/it]32148 ---> tensor(0.7466) tensor(0.9663)
#  30%|███       | 6/20 [04:24<11:55, 51.12s/it]31333 ---> tensor(0.8972) tensor(0.9735)
#  35%|███▌      | 7/20 [06:08<14:51, 68.57s/it]31469 ---> tensor(0.7980) tensor(0.8135)
#  35%|███▌      | 7/20 [07:06<13:11, 60.89s/it]

In [None]:
# Raw training corpus, loaded to compare its report ids with the test corpus
train_df = pd.read_csv('../tmp/training_corpus_2023-02-07 16-33.csv')

In [None]:
# Leakage check: test rows whose report id also occurs in the training corpus
testing_df.loc[testing_df.report.isin(train_df.report.unique())]