In [1]:
# CNN
import torch.nn.functional as F
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.backends.cudnn as cudnn

# others
import numpy as np
import matplotlib.pyplot as plt
import time
import os
from PIL import Image
from tempfile import TemporaryDirectory
import time

# dataset
import torchvision
from torchvision import datasets, models, transforms
from torchvision.datasets import Flowers102
from sklearn.metrics import f1_score


# read file 
import pandas as pd

# label
from scipy.io import loadmat
import json
from tqdm import tqdm
from itertools import islice
from typing import List, Dict, Tuple

# TweetEval emotion recognition dataset (unused — the loading cells below are commented out)

In [None]:
# root = '../../Data/tweeteval/datasets/emotion/'
# mapping_file = os.path.join(root, 'mapping.txt')
# test_labels_file = os.path.join(root, 'test_labels.txt')
# test_text_file = os.path.join(root, 'test_text.txt')
# train_labels_file = os.path.join(root, 'train_labels.txt')
# train_text_file = os.path.join(root, 'train_text.txt')
# val_labels_file = os.path.join(root, 'val_labels.txt')
# val_text_file = os.path.join(root, 'val_text.txt')

In [None]:
# mapping_pd = pd.read_csv(mapping_file, sep='\t', header=None)
# test_label_pd = pd.read_csv(test_labels_file, sep='\t', header=None)
# test_dataset = open(test_text_file).read().split('\n')[:-1] # remove last empty line 
# train_label_pd = pd.read_csv(train_labels_file, sep='\t', header=None)
# train_dataset = open(train_text_file).read().split('\n')[:-1] # remove last empty line
# val_label_pd = pd.read_csv(val_labels_file, sep='\t', header=None)
# val_dataset = open(val_text_file).read().split('\n')[:-1] # remove last empty line

# Preprocess training data
- Given: patient notes with annotated character ranges and feature labels.
- Transform into: the feature label plus the list of note tokens, each with a flag [does the token describe the label?].

In [None]:
# All competition CSV files live under one data root.
root = './data/'
(
    features_path,
    patient_notes_path,
    sample_submission_path,
    test_path,
    train_path,
) = (
    os.path.join(root, csv_name)
    for csv_name in (
        'features.csv',
        'patient_notes.csv',
        'sample_submission.csv',
        'test.csv',
        'train.csv',
    )
)

# Read the three tables used below; row 0 of each file holds the column names.
features = pd.read_csv(
    features_path,
    sep=',',
    header=0,
)
patient_notes = pd.read_csv(
    patient_notes_path,
    sep=',',
    header=0,
)
train_raw = pd.read_csv(
    train_path,
    sep=',',
    header=0,
)


In [None]:
# Show the rows of `features` whose text is exactly "Female" (display only, nothing is assigned).
# unusual_numbers = features["feature_num"].value_counts()[features["feature_num"].value_counts() != 1]
# unusual_numbers
features[features["feature_text"] == "Female"]
# features["feature_num"] == 

## intro 
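- `case_num`: 0–9; identifies the case a row belongs to (TODO: confirm the exact grouping semantics).
- `pn_num`: the id of the patient note in patient_notes.csv; the note text itself is in the `pn_history` column.
- `feature_num`: the id of the feature in features.csv; the feature description is in the `feature_text` column.
- `location`: the character ranges (`start end` pairs) of the note text that describe the feature; may be empty.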

In [None]:
import re
def df_string2list_of_ints(df_string: str) -> List[Tuple[int, ...]]:
    """
    Parse a stringified list of character ranges into tuples of ints.

    The input looks like ``"['696 724', '10 20;30 40']"``: entries are
    separated by ``,`` or ``;`` and each entry holds whitespace-separated
    integers.

    Args:
        df_string (str): Cell value of the ``location`` column.

    Returns:
        list of tuples of ints: One tuple per entry, e.g. ``[(696, 724)]``.
        An empty or whitespace-only list yields ``[]``.

    Raises:
        ValueError: If an entry contains something that is not an integer.
    """
    body = df_string.strip().strip("[]")
    ranges = []
    for entry in re.split(",|;", body):
        # Drop surrounding blanks and quotes left over from the list repr.
        entry = entry.strip().strip("'\"").strip()
        if not entry:
            # Empty list ("[]", "[ ]") or a stray separator: nothing to parse.
            continue
        # split() without an argument tolerates repeated blanks between numbers.
        ranges.append(tuple(int(num_as_str) for num_as_str in entry.split()))
    return ranges

In [None]:
# Display the raw training annotations that were read from train.csv.
train_raw

In [None]:
# Attach the feature description and the note text to every annotation row
# (two left joins keyed on the ids shared with `train_raw`).
data_merged = (
    train_raw
    .merge(features, how='left', on=['feature_num', 'case_num'])
    .merge(patient_notes, how='left', on=['pn_num', 'case_num'])
)
# Replace the stringified `location` values with parsed lists of integer tuples.
data_merged["location"] = data_merged["location"].apply(df_string2list_of_ints)
data_merged.head()

In [None]:
# Keep only the three columns used downstream: feature text, note text and parsed locations.
train = data_merged[["feature_text", "pn_history", "location", ]]
train.head()

In [None]:
# filter training data with no location
# (rows whose parsed `location` list is empty are dropped)
train = train[train["location"].apply(lambda row: len(row) != 0)]

In [None]:
# Report the current number of rows in `train`.
print(f'Size of dataset= {len(train)}')

## Tokenization
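- Use spaCy to split the notes into words. (Note: the code cells below currently tokenize with `BertTokenizerFast` from `transformers` instead.)

Before you start using spaCy, install it and download the English model: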
```
conda install -c conda-forge spacy
python -m spacy download en_core_web_sm
```

In [None]:
from typing import Iterable
# from pytorch_pretrained_bert import BertTokenizer

# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [None]:
from transformers import BertTokenizerFast
# Load the pretrained 'bert-base-uncased' fast tokenizer; later cells call it with
# return_offsets_mapping=True to get the character range of every token.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
# tokenizer.encode_plus("hello i am Drunk", return_offsets_mapping=True, add_special_tokens=True)

In [None]:
from os.path import join as pathjoin
import pickle
# Directory (relative to the working directory) where tokenization results are pickled.
cache_dir = "cache"

In [None]:
# Example call: display the encoding (including the offset mapping) of a short sentence.
tokenizer.encode_plus("hello i am Drunk", return_offsets_mapping=True, add_special_tokens=True)

In [None]:
from pytorch_pretrained_bert.modeling import BertModel
# Name of the pretrained weights handed to BertModel.from_pretrained.
# The parentheses do not create a tuple; this is a plain string.
BERT_FP = ('bert-base-uncased')

In [None]:
def get_bert_embed_matrix():
    """
    Load the pretrained BERT model named by ``BERT_FP`` and return a weight matrix.

    The matrix is taken from the first child module of the model's first
    child module (presumably the word-embedding layer of the embedding
    block — this relies on the library's module registration order).

    Returns:
        np.ndarray: The ``weight`` of that module as a NumPy array.
    """
    model = BertModel.from_pretrained(BERT_FP)
    # First child of the model, then first child of that module.
    embeddings_module = list(model.children())[0]
    word_embedding_layer = list(embeddings_module.children())[0]
    return word_embedding_layer.weight.data.numpy()

In [None]:
# Extract the embedding weight matrix once and keep it as a NumPy array.
embedding_matrix = get_bert_embed_matrix()

In [None]:
# Display the (rows, columns) shape of the embedding matrix.
embedding_matrix.shape

In [None]:
from functools import cache, lru_cache

def embed_seq(s: Iterable[int], matrix=None):
    """
    Look up the embedding vector of every token id in ``s``.

    Args:
        s (iterable of ints): Token ids, used as row indices.
        matrix (np.ndarray, optional): Embedding table of shape
            [vocab_size, dim]. Defaults to the module-level ``embedding_matrix``.

    Returns:
        np.ndarray [len(s), dim]: One embedding row per token id, in the
        dtype of the embedding table.
    """
    table = embedding_matrix if matrix is None else matrix
    ids = np.asarray([int(word_id) for word_id in s], dtype=np.intp)
    # Multiplying a one-hot vector with the table just selects one row, so
    # index the rows directly instead of doing a full matrix product per token.
    return table[ids]


@lru_cache(maxsize=10000)
def embed_word(word_id: int):
    """
    Return the embedding row of a single token id from ``embedding_matrix``.

    Results are memoised, so repeated ids return the same array object.
    The row is copied so the cached value is not a view into the table.
    """
    return embedding_matrix[word_id].copy()


def onehot_word(a: int, vocab_size: int = 30522):
    """
    Build a one-hot integer vector with a single 1 at index ``a``.

    Args:
        a (int): Index to set.
        vocab_size (int): Length of the vector; the default 30522 is the
            size previously hard-coded here.

    Returns:
        np.ndarray [vocab_size]: Integer vector, all zeros except ``oh[a] == 1``.
    """
    oh = np.zeros(vocab_size, dtype=int)
    oh[a] = 1
    return oh

In [None]:
from typing import Dict, List

# Tokenize every distinct patient note once and cache the result on disk.
# Structure: note text -> list of {"word_id", "embedded", "range"} dicts, one per token.
cache_file = pathjoin(cache_dir, "tokenized_pn_histories.pkl")
if os.path.isfile(cache_file):
    # NOTE(review): pickle.load can execute arbitrary code from the file;
    # only load caches that this notebook wrote itself.
    with open(cache_file, "rb") as f:
        tokenized_pn_histories = pickle.load(f)
    print("Tokenized patient histories loaded from cache.")
else:
    print("Found no cached tokenized patient histories. Tokenizing...")
    tokenized_pn_histories: Dict[str, List[Dict]] = {}
    for pn_history in tqdm(train["pn_history"]):
        # The same note appears once per annotated feature; tokenize it only once.
        if pn_history in tokenized_pn_histories:
            continue
        indexed_words = []

        tokenized = tokenizer.encode_plus(pn_history, return_offsets_mapping=True, add_special_tokens=True)

        for word, offset_mapping in zip(tokenized["input_ids"], tokenized["offset_mapping"]):
            embedded_token = embed_word(word)

            indexed_words.append({
                "word_id": word,
                "embedded": embedded_token,
                "range": offset_mapping
            })

        tokenized_pn_histories[pn_history] = indexed_words
    # Create the cache directory first; otherwise the write below fails on a
    # fresh checkout and the tokenization work above is lost.
    os.makedirs(cache_dir, exist_ok=True)
    with open(cache_file, "wb") as f:
        pickle.dump(tokenized_pn_histories, f)


Data structure:
tokenized_pn_histories
pn_history text -> [tokens]
token -> {'word_id', 'embedded', 'range'}


In [None]:
# Display the second token entry (index 1) of the first tokenized note.
# list(tokenized_pn_histories.values())[0][0].keys()
list(tokenized_pn_histories.values())[0][1]

In [None]:
from typing import Dict, List

# Tokenize every distinct feature text once and cache the result on disk.
# Structure: feature text -> list of {"word_id", "embedded"} dicts, one per token.
cache_file = pathjoin(cache_dir, "tokenized_features.pkl")
if os.path.isfile(cache_file):
    # NOTE(review): pickle.load can execute arbitrary code from the file;
    # only load caches that this notebook wrote itself.
    with open(cache_file, "rb") as f:
        tokenized_features = pickle.load(f)
    print("Tokenized features loaded from cache.")
else:
    print("Found no cached tokenized features. Tokenizing...")
    # Values are lists of per-token dicts (not strings).
    tokenized_features: Dict[str, List[Dict]] = {}
    for feature_text in tqdm(train["feature_text"]):
        # The same feature text occurs in many rows; tokenize it only once.
        if feature_text in tokenized_features:
            continue
        indexed_words = []

        tokenized = tokenizer.encode_plus(feature_text, add_special_tokens=True)

        for word in tokenized["input_ids"]:
            embedded_token = embed_word(word)

            indexed_words.append({
                "word_id": word,
                "embedded": embedded_token,
            })

        tokenized_features[feature_text] = indexed_words
    # Create the cache directory first; otherwise the write below fails on a
    # fresh checkout and the tokenization work above is lost.
    os.makedirs(cache_dir, exist_ok=True)
    with open(cache_file, "wb") as f:
        pickle.dump(tokenized_features, f)


In [None]:
# Display the second token entry (index 1) of the first tokenized feature text.
list(tokenized_features.values())[0][1]

- Follow the example described here. Use the same architecture, but:
  - only use the last output of the LSTM in the loss function
  - use an embedding dim of 128
  - use a hidden dim of 256.  

## Get feature-relevancy of tokens via char ranges

In [None]:
# Build one training datapoint per annotation row: the note tokens, a 0/1
# relevance score per token, the feature tokens and the raw character ranges.
train_data_preprocessed = dict()
for i, (feature_text, pn_history, location) in tqdm(train.iterrows(), total=len(train)):
    tokenized_history = tokenized_pn_histories[pn_history]
    tokens_with_scores = []
    for token in tokenized_history:
        token_start, token_end = token["range"]
        # Token offsets and annotated ranges are treated as half-open
        # [start, end) character ranges, the same convention spans_to_binary
        # uses in this notebook (binary[start:end]).
        token_length = token_end - token_start
        percentages = []
        for feature_relevant_range in location:
            range_start, range_end = feature_relevant_range[0], feature_relevant_range[1]

            if token_length <= 0:
                # Zero-width tokens (e.g. the special tokens added by the
                # tokenizer) cover no characters and are never relevant.
                percentage_of_token_in_range = 0.0
            else:
                overlap = max(min(token_end, range_end) - max(token_start, range_start), 0)
                percentage_of_token_in_range = overlap / token_length
            percentages.append(percentage_of_token_in_range)

        # A token counts as relevant when more than 90% of its characters lie
        # inside at least one annotated range.
        tokens_with_scores.append({"token": token,
                                   "score": int(max(percentages, default=0.0) > 0.9)})

    train_data_preprocessed[i] = {
                                    "pn_history_tokens": [ts["token"] for ts in tokens_with_scores],
                                    "scores": torch.tensor([ts["score"] for ts in tokens_with_scores]),
                                    "feature_tokens": tokenized_features[feature_text],
                                    "locations": location
                                   }
        

In [None]:
# Count the datapoints in which no token is marked relevant (all scores are 0) ...
num_no_positives = sum([1 for dp in train_data_preprocessed.values() if sum(dp["scores"]) == 0])
print(f"filtering {num_no_positives} out of {len(train_data_preprocessed)} datapoints because they don't contain any positive scores.")
# ... and drop them, keeping the original keys of the remaining datapoints.
train_data_preprocessed = {key: dp for key, dp in train_data_preprocessed.items() if sum(dp["scores"]) != 0}

# Structure of the Model
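Layers in the LSTM model:
1. Embed the feature tokens.
2. Run an LSTM over the feature embeddings and keep its last output as a constant-size feature vector.

3. Concatenate that feature vector with every note-token embedding and pass the sequence to a 2nd LSTM; a linear layer followed by a sigmoid then scores each token.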
	

In [None]:
# Token embedding size, taken from the second dimension of the embedding matrix.
EMBEDDING_DIM = embedding_matrix.shape[1]
# Hidden-state size of the LSTM that runs over the note tokens.
HIDDEN_DIM = 256

In [None]:
class LSTMTokenScorer(nn.Module):
    """
    Scores every token of a patient note for relevance to one feature.

    A first LSTM condenses the feature tokens into a single vector. That
    vector is concatenated with each note-token embedding and a second LSTM
    runs over the note; a linear layer plus sigmoid turns each hidden state
    into a score in [0, 1].

    Args:
        embedding_dim (int): Size of the token embeddings fed to the model.
        hidden_dim (int): Hidden size of the LSTM that runs over the note.
        dropout (float): Passed to both ``nn.LSTM`` layers. NOTE: PyTorch
            applies LSTM dropout only between stacked layers, so with these
            single-layer LSTMs a non-zero value has no effect.
    """

    def __init__(self, embedding_dim, hidden_dim, dropout=0.0):
        super().__init__()

        self.pn_history_hidden_dim = hidden_dim

        # Feature encoder: output size equals the embedding size, so the
        # condensed feature is one tensor of size [embedding_dim].
        self.feature_lstm = nn.LSTM(embedding_dim, embedding_dim, bidirectional=False, dropout=dropout)

        # Note encoder: each input row is [feature vector, token embedding].
        self.total_lstm = nn.LSTM(embedding_dim * 2, self.pn_history_hidden_dim, bidirectional=False, dropout=dropout)

        self.hidden2score = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, pn_history, feature):
        """
        Args:
            pn_history (Tensor [num_note_tokens, embedding_dim]): Embedded note tokens.
            feature (Tensor [num_feature_tokens, embedding_dim]): Embedded feature tokens.

        Returns:
            Tensor [num_note_tokens]: Per-token probabilities. The sigmoid is
            already applied, so use a probability-based loss (e.g.
            ``nn.BCELoss``), not a logits-based one.
        """
        # Run the feature through its LSTM as a sequence with batch size 1.
        feature_lstm_out, _ = self.feature_lstm(feature.view(len(feature), 1, -1))
        # Last time step, only batch element -> [embedding_dim].
        feature_reduced = feature_lstm_out[-1, 0]
        # Duplicate the feature vector so there is one copy per note token.
        feature_multiplied = feature_reduced.repeat((len(pn_history), 1))

        pn_history_and_features = torch.cat((feature_multiplied, pn_history), dim=1)

        pn_history_reduced, _ = self.total_lstm(pn_history_and_features)
        # Drop only the trailing size-1 dimension; a bare squeeze() would also
        # collapse a one-token note to a 0-d tensor.
        pred_score_raw = self.hidden2score(pn_history_reduced).squeeze(-1)
        pred_score = self.sigmoid(pred_score_raw)
        return pred_score

In [None]:
# all_scores = [d["scores"].numpy() for d in train_data_preprocessed.values()]
# avg_neg_div_pos = np.mean([(scores.shape[0] - np.sum(scores)) / np.sum(scores) for scores in all_scores])

In [None]:
# Show the fields of one datapoint. Take the first remaining entry rather than
# key 0, which is missing if that row was filtered out earlier.
next(iter(train_data_preprocessed.values())).keys()

In [None]:
# feature_tokens = train_data_preprocessed[0]["feature_tokens"]
# pn_history_tokens = train_data_preprocessed[0]["pn_history_tokens"]

# feature_tensor = torch.tensor(np.array([t["embedded"] for t in feature_tokens]), dtype=torch.float)
# pn_history_tensor = torch.tensor(np.array([t["embedded"] for t in pn_history_tokens]), dtype=torch.float)

# model(pn_history_tensor, feature_tensor)

In [None]:
# From https://www.kaggle.com/theoviel/evaluation-metric-folds-baseline

def micro_f1(preds, truths):
    """
    Micro-averaged F1 over a collection of binary arrays.

    All instances are flattened into one long prediction vector and one long
    ground-truth vector before the score is computed.

    Args:
        preds (list of lists of ints): Binary predictions, one list per instance.
        truths (list of lists of ints): Binary ground truths, one list per instance.

    Returns:
        float: F1 score of the flattened predictions against the flattened truths.
    """
    flat_preds = np.concatenate(preds)
    flat_truths = np.concatenate(truths)
    return f1_score(y_true=flat_truths, y_pred=flat_preds)


def spans_to_binary(spans, length=None):
    """
    Converts spans to a binary array indicating whether each character is in a span.

    Spans are half-open: ``(start, end)`` marks positions ``start .. end - 1``.

    Args:
        spans (list of lists of two ints): Spans.
        length (int, optional): Length of the output array. Defaults to the
            largest value in ``spans``, or 0 when ``spans`` is empty.

    Returns:
        np array [length]: Binarized spans.
    """
    if length is None:
        # np.max raises on an empty sequence, so handle "no spans" explicitly.
        length = int(np.max(spans)) if len(spans) else 0
    binary = np.zeros(length)
    for start, end in spans:
        binary[start:end] = 1
    return binary


def span_micro_f1(preds: List[List[Tuple[int, int]]], truths: List[List[Tuple[int, int]]]):
    """
    Micro-averaged F1 computed on character spans.

    Each (prediction, truth) pair is binarized to a common length and the
    resulting arrays are scored together. Pairs in which both sides are
    empty are left out.

    Args:
        preds (list of lists of two ints): Prediction spans.
        truths (list of lists of two ints): Ground truth spans.

    Returns:
        float: f1 score.
    """
    def _largest_value(spans):
        # Largest value found in the spans, or 0 when there are none.
        return np.max(spans) if len(spans) else 0

    bin_preds, bin_truths = [], []
    for pred, truth in zip(preds, truths):
        if len(pred) or len(truth):
            # Binarize both sides to the same length so they line up.
            length = max(_largest_value(pred), _largest_value(truth))
            bin_preds.append(spans_to_binary(pred, length))
            bin_truths.append(spans_to_binary(truth, length))
    return micro_f1(bin_preds, bin_truths)


In [None]:
# Sanity check: identical prediction and truth spans.
span_micro_f1([[(1,3)]], [[(1,3)]])

In [None]:
def scored_ranges2spans(rnges: List[Tuple[int, int]], scores: List[float], thresh: float = 0.9) -> List[Tuple[int, int]]:
    """
    Merge consecutive high-scoring token ranges into spans.

    A span starts at the first token whose score exceeds ``thresh`` and is
    extended by every directly following token that also exceeds it; the
    first token at or below the threshold closes the span.

    Args:
        rnges (list of two-int tuples): Character range ``(start, end)`` of each token.
        scores (list of floats): One score per token, aligned with ``rnges``.
        thresh (float): Scores strictly greater than this count as positive.

    Returns:
        list of two-int tuples: ``(start of first token, end of last token)``
        for every run of positive tokens, in order.
    """
    spans: List[Tuple[int, int]] = []
    active_span = None  # [start, end] of the span being built, if any
    for rng, score in zip(rnges, scores):
        if score > thresh:
            if active_span is None:
                active_span = [rng[0], rng[1]]
            else:
                # Previous tokens are already part of the span: extend it.
                active_span[1] = rng[1]
        elif active_span is not None:
            spans.append((active_span[0], active_span[1]))
            active_span = None
    # Close a span that runs up to the last token.
    if active_span is not None:
        spans.append((active_span[0], active_span[1]))
    return spans
        

In [None]:
# Ad-hoc check: is row index 3172 still among the preprocessed datapoints?
3172 in train_data_preprocessed.keys()

In [None]:
import random
logfile_name = "training_log.txt"
from sklearn.metrics import f1_score

def log(logtext: str = "") -> None:
    """Echo ``logtext`` to stdout and append it, followed by a newline, to ``logfile_name``."""
    print(logtext)
    with open(logfile_name, mode="a", encoding="utf8") as log_handle:
        line = str(logtext) + "\n"
        log_handle.write(line)
    

def train_model(model: LSTMTokenScorer, criterion, optimizer, scheduler, num_epochs=1):
    """
    Train ``model`` on ``train_data_preprocessed``, one datapoint per optimizer step.

    Args:
        model: Token scorer, called as ``model(pn_history_tensor, feature_tensor)``.
            Its outputs are rounded to 0/1 for the F1 statistics.
        criterion: Loss, called as ``criterion(outputs, scores)`` on float tensors.
        optimizer: Optimizer; stepped after every datapoint.
        scheduler: Learning-rate scheduler; stepped once per training epoch.
        num_epochs (int): Number of passes over the data.

    Returns:
        tuple: ``(model, losses, f1s)`` — the model with the best checkpoint
        loaded, and the per-datapoint losses and F1 scores in visiting order.
    """
    since = time.time()
    # Only a training phase exists so far; append 'test' here once held-out
    # data is wired into the loop.
    phases = ['train']
    # Checkpoints are selected on the last phase of an epoch. Selecting on
    # 'test' while that phase never runs would never update the checkpoint,
    # and the initial weights would be reloaded at the end.
    checkpoint_phase = phases[-1]
    with TemporaryDirectory() as tempdir:
        best_model_params_path = os.path.join(tempdir, 'best_model_params.pt')

        torch.save(model.state_dict(), best_model_params_path)
        best_f1 = 0.0
        best_loss = float('inf')

        losses = []
        f1s = []
        for epoch in range(num_epochs):
            log(f'Epoch {epoch}/{num_epochs - 1}')
            log('-' * 10)

            for phase in phases:
                if phase == 'train':
                    model.train()
                else:
                    model.eval()

                running_loss = 0.0
                running_loss_average = 0.0
                running_f1_total = 0.0
                running_f1_average = 0.0
                num_non_zero_outputs_in_epoch = 0.0

                # Visit the datapoints in a fresh random order every epoch.
                data_ids = list(train_data_preprocessed.keys())
                random.shuffle(data_ids)

                # Iterate over data.
                for i, data_id in enumerate(data_ids):
                    datum_preprocessed = train_data_preprocessed[data_id]
                    # zero the parameter gradients
                    optimizer.zero_grad()

                    pn_history_tokens = datum_preprocessed["pn_history_tokens"]
                    scores = datum_preprocessed["scores"]
                    feature_tokens = datum_preprocessed["feature_tokens"]

                    feature_tensor = torch.tensor(np.array([t["embedded"] for t in feature_tokens]), dtype=torch.float)
                    pn_history_tensor = torch.tensor(np.array([t["embedded"] for t in pn_history_tokens]), dtype=torch.float)

                    # track history only in train
                    with torch.set_grad_enabled(phase == 'train'):
                        outputs = model(pn_history_tensor, feature_tensor)

                        # Number of tokens predicted positive after rounding.
                        num_non_zero_outputs = np.count_nonzero(outputs.detach().numpy().round().astype(int))
                        num_non_zero_outputs_in_epoch += num_non_zero_outputs

                        loss = criterion(outputs.float(), scores.float())

                        # backward + optimize only if in training phase
                        if phase == 'train':
                            loss.backward()
                            optimizer.step()

                    try:
                        f1 = f1_score(scores.int(), outputs.detach().round().int())
                    except Exception:
                        log("F1 score calc failed:")
                        log("Scores:")
                        log(scores.int())
                        log("\nOutputs")
                        log(outputs.detach().round().int())
                        log("\n")
                        # Re-raise as-is so the original type and traceback survive.
                        raise
                    # statistics
                    running_loss += loss.item()
                    running_loss_average = running_loss / (i + 1)
                    losses.append(loss.item())
                    running_f1_total += f1
                    running_f1_average = running_f1_total / (i + 1)
                    f1s.append(f1)

                    if i % 1000 == 0:
                        log(f"Epoch {epoch}, i={i}, avg. loss={running_loss_average}, avg. F1={running_f1_average}, nonzero outputs={num_non_zero_outputs_in_epoch}")
                        log("Number of nonzero outputs (in sample prediction):")
                        log(num_non_zero_outputs)

                if phase == 'train':
                    scheduler.step()

                # Loss is the sum over the epoch; F1 is the mean over datapoints.
                epoch_loss = running_loss
                epoch_f1 = running_f1_average
                log(f'{phase} Loss: {epoch_loss:.4f} F1: {epoch_f1:.4f} Time elapsed: {round((time.time() - since))} sec.')

                # Keep the weights of the epoch with the lowest loss.
                if phase == checkpoint_phase and epoch_loss < best_loss:
                    best_f1 = epoch_f1
                    best_loss = epoch_loss
                    torch.save(model.state_dict(), best_model_params_path)

            log()

        time_elapsed = time.time() - since
        log(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
        log(f'Best {checkpoint_phase} F1: {best_f1:.4f}')

        # load best model weights
        model.load_state_dict(torch.load(best_model_params_path))
    return model, losses, f1s


In [None]:
# Count positive (1) and negative (0) token labels over all datapoints to
# measure the class imbalance.
data_ids = list(train_data_preprocessed.keys())
neg_values = 0.0
pos_values = 0.0
for i, data_id in tqdm(enumerate(data_ids)):
    datum_preprocessed = train_data_preprocessed[data_id]

    pn_history_tokens = datum_preprocessed["pn_history_tokens"]
    scores = datum_preprocessed["scores"]
    # Non-zero labels are the positives; the remaining tokens are negatives.
    pos_values += np.count_nonzero(scores.numpy().round().astype(int))
    neg_values += (scores.shape[0] - np.count_nonzero(scores.numpy().round().astype(int)))

# Negatives per positive; used as the positive-class weight in the loss.
neg_pos_ratio = neg_values / pos_values
neg_pos_ratio

In [None]:
from itertools import product

def make_weighted_bce(pos_weight):
    """
    Build a binary cross-entropy loss for probability inputs that up-weights positives.

    LSTMTokenScorer.forward already applies a sigmoid, so a logits-based loss
    such as ``nn.BCEWithLogitsLoss`` would squash the outputs a second time.
    This loss works on the probabilities directly and weights every positive
    target by ``pos_weight`` (negatives keep weight 1), which is the
    weighting ``BCEWithLogitsLoss(pos_weight=...)`` applies.

    Args:
        pos_weight (Tensor [1]): Weight of the positive class.

    Returns:
        callable: ``loss(probabilities, targets) -> scalar Tensor`` (mean over tokens).
    """
    def weighted_bce(probabilities, targets):
        sample_weights = torch.where(targets > 0.5, pos_weight, torch.ones_like(targets))
        return F.binary_cross_entropy(probabilities, targets, weight=sample_weights)

    return weighted_bce


# clear logs (the context manager closes the file handle right away)
with open(logfile_name, "w", encoding="utf8"):
    pass

for lr in [0.001]:
    # make model with the embedding and hidden sizes configured above
    model = LSTMTokenScorer(EMBEDDING_DIM, HIDDEN_DIM)
    # make positive class more valuable
    pos_weight = torch.full((1,), neg_pos_ratio)
    loss_function = make_weighted_bce(pos_weight)

    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

    log(f"Starting model training for lr={lr}...")
    model, losses, f1s = train_model(model, loss_function, optimizer, exp_lr_scheduler, num_epochs=10)