In [1]:
# CNN
import torch.nn.functional as F
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.backends.cudnn as cudnn

# others
import numpy as np
import matplotlib.pyplot as plt
import time
import os
from PIL import Image
from tempfile import TemporaryDirectory
import time

# dataset
import torchvision
from torchvision import datasets, models, transforms
from torchvision.datasets import Flowers102
from sklearn.metrics import f1_score


# read file 
import pandas as pd

# label
from scipy.io import loadmat
import json
from tqdm import tqdm
from itertools import islice
from typing import List, Dict, Tuple

# Use TweetEval emotion recognition dataset 

In [2]:
# root = '../../Data/tweeteval/datasets/emotion/'
# mapping_file = os.path.join(root, 'mapping.txt')
# test_labels_file = os.path.join(root, 'test_labels.txt')
# test_text_file = os.path.join(root, 'test_text.txt')
# train_labels_file = os.path.join(root, 'train_labels.txt')
# train_text_file = os.path.join(root, 'train_text.txt')
# val_labels_file = os.path.join(root, 'val_labels.txt')
# val_text_file = os.path.join(root, 'val_text.txt')

In [3]:
# mapping_pd = pd.read_csv(mapping_file, sep='\t', header=None)
# test_label_pd = pd.read_csv(test_labels_file, sep='\t', header=None)
# test_dataset = open(test_text_file).read().split('\n')[:-1] # remove last empty line 
# train_label_pd = pd.read_csv(train_labels_file, sep='\t', header=None)
# train_dataset = open(train_text_file).read().split('\n')[:-1] # remove last empty line
# val_label_pd = pd.read_csv(val_labels_file, sep='\t', header=None)
# val_dataset = open(val_text_file).read().split('\n')[:-1] # remove last empty line

# Preprocess training data
- Given: Notes with ranges and labels
- Transform into label + lists of tokens with [does token describe label]

In [4]:
root = './data/'
features_path = os.path.join(root, 'features.csv')
patient_notes_path = os.path.join(root, 'patient_notes.csv')
sample_submission_path = os.path.join(root, 'sample_submission.csv')
test_path = os.path.join(root, 'test.csv')
train_path = os.path.join(root, 'train.csv')
features = pd.read_csv(features_path, sep=',', header=0)
patient_notes = pd.read_csv(patient_notes_path, sep=',', header=0)
train_raw = pd.read_csv(train_path, sep=',', header=0)


In [5]:
# unusual_numbers = features["feature_num"].value_counts()[features["feature_num"].value_counts() != 1]
# unusual_numbers
features[features["feature_text"] == "Female"]
# features["feature_num"] == 

Unnamed: 0,feature_num,case_num,feature_text
25,112,1,Female
34,208,2,Female
66,407,4,Female
70,501,5,Female
99,700,7,Female
110,802,8,Female
139,913,9,Female


## intro 
- `case_num`: 0~9, each num belongs their groups ... ? 
- `pn_num`: the id in patient_notes.csv which is 'pn_history', present the note of each case 
- `feature_num`: the id in features.csv which is 'feature_num', present the feature of each case 
- `location`: 

In [6]:
import re
def df_string2list_of_ints(df_string: str):
    df_string = df_string.strip("[]")
    if df_string == "":
        return []
    entries = re.split(",|;", df_string)
    entries = [entry.strip(" '") for entry in entries]
    ranges = [tuple(int(num_as_str) for num_as_str in entry.split(" ")) for entry in entries]
    return ranges

In [7]:
train_raw

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location
0,00016_000,0,16,0,['dad with recent heart attcak'],['696 724']
1,00016_001,0,16,1,"['mom with ""thyroid disease']",['668 693']
2,00016_002,0,16,2,['chest pressure'],['203 217']
3,00016_003,0,16,3,"['intermittent episodes', 'episode']","['70 91', '176 183']"
4,00016_004,0,16,4,['felt as if he were going to pass out'],['222 258']
...,...,...,...,...,...,...
14295,95333_912,9,95333,912,[],[]
14296,95333_913,9,95333,913,[],[]
14297,95333_914,9,95333,914,['photobia'],['274 282']
14298,95333_915,9,95333,915,['no sick contacts'],['421 437']


In [8]:
data_merged = train_raw.merge(features, on=['feature_num', 'case_num'], how='left')
data_merged = data_merged.merge(patient_notes, on=['pn_num', 'case_num'], how='left')
data_merged["location"] = data_merged["location"].apply(df_string2list_of_ints)
data_merged.head()

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history
0,00016_000,0,16,0,['dad with recent heart attcak'],"[(696, 724)]",Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...
1,00016_001,0,16,1,"['mom with ""thyroid disease']","[(668, 693)]",Family-history-of-thyroid-disorder,HPI: 17yo M presents with palpitations. Patien...
2,00016_002,0,16,2,['chest pressure'],"[(203, 217)]",Chest-pressure,HPI: 17yo M presents with palpitations. Patien...
3,00016_003,0,16,3,"['intermittent episodes', 'episode']","[(70, 91), (176, 183)]",Intermittent-symptoms,HPI: 17yo M presents with palpitations. Patien...
4,00016_004,0,16,4,['felt as if he were going to pass out'],"[(222, 258)]",Lightheaded,HPI: 17yo M presents with palpitations. Patien...


In [9]:
train = data_merged[["feature_text", "pn_history", "location", ]]
train.head()

Unnamed: 0,feature_text,pn_history,location
0,Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...,"[(696, 724)]"
1,Family-history-of-thyroid-disorder,HPI: 17yo M presents with palpitations. Patien...,"[(668, 693)]"
2,Chest-pressure,HPI: 17yo M presents with palpitations. Patien...,"[(203, 217)]"
3,Intermittent-symptoms,HPI: 17yo M presents with palpitations. Patien...,"[(70, 91), (176, 183)]"
4,Lightheaded,HPI: 17yo M presents with palpitations. Patien...,"[(222, 258)]"


In [10]:
# filter training data with no location
train = train[train["location"].apply(lambda row: len(row) != 0)]

In [11]:
print(f'Size of dataset= {len(train)}')

Size of dataset= 9901


## Tokenization
- Use spaCy to split the notes into words.

Before start using spaCy
```
conda install -c conda-forge spacy
python -m spacy download en_core_web_sm
```

In [12]:
import spacy 
from collections import Counter

# use spacy to tokenize the sentence with english model 
nlp = spacy.load("en_core_web_sm")


In [13]:
from typing import List, Iterable

def build_vocab_from_lines(lines: Iterable[str]):
    text_to_count_tokens = ' '.join(lines)
    doc = nlp(text_to_count_tokens)
    # Get the most frequent words, filtering out stop words and punctuation.
    word_freq = Counter(token.text.lower() for token in doc if \
                        not token.is_punct and \
                            not token.is_stop and \
                                not token.is_space)
    return word_freq.most_common()

In [14]:
from os.path import join as pathjoin
import pickle
cache_dir = "cache"

In [15]:
from typing import Dict, List

cache_file = pathjoin(cache_dir, "tokenized_pn_histories.pkl")
if os.path.isfile(cache_file):
    with open(cache_file, "rb") as f:
        tokenized_pn_histories = pickle.load(f)
    print("Tokenized patient histories loaded from cache.")
else:
    print("Found no cached tokenized patient histories. Tokenizing...")
    tokenized_pn_histories: Dict[str, List[str]] = {}
    for pn_history in tqdm(train["pn_history"]):
        indexed_words = []
        if pn_history in tokenized_pn_histories:
            continue
        for token in nlp(pn_history):
            if not token.is_punct and not token.is_stop and not token.is_space:
                word = token.text.lower()
                start_idx = token.idx
                end_idx = token.idx + len(token.text)

                indexed_words.append({
                    "word": word,
                    "start": start_idx,
                    "end": end_idx
                })
                    
        tokenized_pn_histories[pn_history] = indexed_words
    with open(cache_file, "wb") as f:
        pickle.dump(tokenized_pn_histories, f)


Found no cached tokenized patient histories. Tokenizing...


100%|██████████| 9901/9901 [00:48<00:00, 204.94it/s]


In [16]:
from typing import Dict, List

cache_file = pathjoin(cache_dir, "tokenized_features.pkl")
if os.path.isfile(cache_file):
    with open(cache_file, "rb") as f:
        tokenized_features = pickle.load(f)
    print("Tokenized features loaded from cache.")
else:
    print("Found no cached tokenized features. Tokenizing...")
    tokenized_features: Dict[str, List[str]] = {}
    for feature_text in tqdm(train["feature_text"]):
        indexed_words = []
        if feature_text in tokenized_features:
            continue
        for token in nlp(feature_text):
            if not token.is_punct and not token.is_stop and not token.is_space:
                word = token.text.lower()
                start_idx = token.idx
                end_idx = token.idx + len(token.text)

                indexed_words.append({
                    "word": word,
                    "start": start_idx,
                    "end": end_idx
                })
                
        tokenized_features[feature_text] = indexed_words
    with open(cache_file, "wb") as f:
        pickle.dump(tokenized_features, f)


Found no cached tokenized features. Tokenizing...


100%|██████████| 9901/9901 [00:02<00:00, 4810.98it/s]


- Follow the example described here. Use the same architecture, but:
  - only use the last output of the LSTM in the loss function
  - use an embedding dim of 128
  - use a hidden dim of 256.  

## Get feature-relevancy of tokens via char ranges

In [25]:
train_data_preprocessed = dict()
for i, (feature_text, pn_history, location) in tqdm(train.iterrows()):
    tokenized_history = tokenized_pn_histories[pn_history]
    tokens_with_scores = []
    for token in tokenized_history:
        for feature_relevant_range in location:
            token_start, token_end = token["start"], token["end"]
            range_start, range_end = feature_relevant_range[0], feature_relevant_range[1]
            
            percentage_of_token_in_range = max(min(token_end, range_end)+1 - max(token_start, range_start), 0) / (token_end+1 - token_start)
            # if percentage_of_token_in_range > 0:
            #     print(percentage_of_token_in_range, token, feature_relevant_range)
            tokens_with_scores.append({"token": token, "score": int(percentage_of_token_in_range > 0.9)})
    
    train_data_preprocessed[i] = {
                                    "pn_history_tokens": [ts["token"] for ts in tokens_with_scores],
                                    "scores": torch.tensor([ts["score"] for ts in tokens_with_scores]),
                                    "feature_tokens": tokenized_features[feature_text],
                                    "locations": location
                                   }
        

0it [00:00, ?it/s]

9901it [00:05, 1753.05it/s]


In [26]:
train_data_preprocessed[0]

{'pn_history_tokens': [{'word': 'hpi', 'start': 0, 'end': 3},
  {'word': '17yo', 'start': 5, 'end': 9},
  {'word': 'm', 'start': 10, 'end': 11},
  {'word': 'presents', 'start': 12, 'end': 20},
  {'word': 'palpitations', 'start': 26, 'end': 38},
  {'word': 'patient', 'start': 40, 'end': 47},
  {'word': 'reports', 'start': 48, 'end': 55},
  {'word': '3', 'start': 56, 'end': 57},
  {'word': '4', 'start': 58, 'end': 59},
  {'word': 'months', 'start': 60, 'end': 66},
  {'word': 'intermittent', 'start': 70, 'end': 82},
  {'word': 'episodes', 'start': 83, 'end': 91},
  {'word': 'heart', 'start': 96, 'end': 101},
  {'word': 'beating', 'start': 102, 'end': 109},
  {'word': 'pounding', 'start': 110, 'end': 118},
  {'word': 'chest', 'start': 129, 'end': 134},
  {'word': '2', 'start': 137, 'end': 138},
  {'word': 'days', 'start': 139, 'end': 143},
  {'word': 'ago', 'start': 144, 'end': 147},
  {'word': 'soccer', 'start': 157, 'end': 163},
  {'word': 'game', 'start': 164, 'end': 168},
  {'word': 'e

In [27]:
num_no_positives = sum([1 for dp in train_data_preprocessed.values() if sum(dp["scores"]) == 0])
print(f"filtering {num_no_positives} out of {len(train_data_preprocessed)} datapoints because they don't contain any positive scores.")
train_data_preprocessed = {key: dp for key, dp in train_data_preprocessed.items() if sum(dp["scores"]) != 0}

filtering 77 out of 9901 datapoints because they don't contain any positive scores.


# Structure of the Model
Layers in LSTM Model:
1. embed feature tokens
2. lstm feature -> constant size vector

3. pass to 2nd lstm
	

In [29]:
EMBEDDING_DIM = 128
HIDDEN_DIM = 256

In [None]:
class LSTMTokenScorer(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, pn_hist_vocab_size, feature_vocab_size, dropout=0.0):
        super(LSTMTokenScorer, self).__init__()

        self.pn_history_hidden_dim = hidden_dim

        self.feature_embeddings = nn.Embedding(feature_vocab_size, embedding_dim)
        self.feature_lstm = nn.LSTM(embedding_dim, embedding_dim, dropout=dropout) # the feature is now one tensor of size [embedding_dim].

        self.pn_history_embeddings = nn.Embedding(pn_hist_vocab_size, embedding_dim)
        
        self.total_lstm = nn.LSTM(embedding_dim * 2, self.pn_history_hidden_dim, dropout=dropout)
        
        self.hidden2score = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()


    def forward(self, pn_history, feature):
        feature_embeds = self.feature_embeddings(feature)
        feature_lstm_out, _ = self.feature_lstm(feature_embeds.view(len(feature), 1, -1)) # the feature is now one tensor of size [embedding_dim].
        feature_reduced = torch.squeeze(feature_lstm_out[-1]) #.view(1, -1)
        feature_multiplied = feature_reduced.repeat((len(pn_history), 1)) # duplicate feature vector to be same size as embedded pn_history vector.

        pn_history_embeds = self.pn_history_embeddings(pn_history)
        pn_history_and_features = torch.concat((feature_multiplied, pn_history_embeds), dim=1)

        pn_history_reduced, _ = self.total_lstm(pn_history_and_features)
        pred_score_raw = torch.squeeze(self.hidden2score(pn_history_reduced))
        pred_score = self.sigmoid(pred_score_raw)
        return pred_score

In [None]:
all_scores = [d["scores"].numpy() for d in train_data_preprocessed.values()]
avg_neg_div_pos = np.mean([(scores.shape[0] - np.sum(scores)) / np.sum(scores) for scores in all_scores])

In [None]:
# make model with vocab sizes, including placeholder indices
model = LSTMTokenScorer(EMBEDDING_DIM, HIDDEN_DIM, len(pn_history_vocab)+1, len(feature_vocab)+1)
loss_function = nn.BCELoss()
# loss_function = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(avg_neg_div_pos))
# loss_function = lambda pred, target, vec_size: nn.functional.binary_cross_entropy_with_logits(pred.float(), target.float(), pos_weight=torch.full((vec_size,), avg_neg_div_pos))
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

In [None]:
feature_tokens = train_data_preprocessed[0]["feature_tokens"]
pn_history_tokens = train_data_preprocessed[0]["pn_history_tokens"]

model(pn_history_tokens, feature_tokens)
# model()

In [None]:
# def one_hot_encode(val):
#     if val == 0:
#         return torch.tensor([1, 0], dtype=torch.float)
#     elif val == 1:
#         return torch.tensor([0, 1], dtype=torch.float)
#     raise Exception("one hot encode got invalid value.")

In [None]:
# From https://www.kaggle.com/theoviel/evaluation-metric-folds-baseline

def micro_f1(preds, truths):
    """
    Micro f1 on binary arrays.

    Args:
        preds (list of lists of ints): Predictions.
        truths (list of lists of ints): Ground truths.

    Returns:
        float: f1 score.
    """
    # Micro : aggregating over all instances
    preds = np.concatenate(preds)
    truths = np.concatenate(truths)
    return f1_score(truths, preds)


def spans_to_binary(spans, length=None):
    """
    Converts spans to a binary array indicating whether each character is in the span.

    Args:
        spans (list of lists of two ints): Spans.

    Returns:
        np array [length]: Binarized spans.
    """
    length = np.max(spans) if length is None else length
    binary = np.zeros(length)
    for start, end in spans:
        binary[start:end] = 1
    return binary


def span_micro_f1(preds: List[List[Tuple[int, int]]], truths: List[List[Tuple[int, int]]]):
    """
    Micro f1 on spans.

    Args:
        preds (list of lists of two ints): Prediction spans.
        truths (list of lists of two ints): Ground truth spans.

    Returns:
        float: f1 score.
    """
    bin_preds = []
    bin_truths = []
    for pred, truth in zip(preds, truths):
        if not len(pred) and not len(truth):
            continue
        length = max(np.max(pred) if len(pred) else 0, np.max(truth) if len(truth) else 0)
        bin_preds.append(spans_to_binary(pred, length))
        bin_truths.append(spans_to_binary(truth, length))
    return micro_f1(bin_preds, bin_truths)


In [None]:
span_micro_f1([[(1,3)]], [[(1,3)]])

In [None]:
import random
logfile_name = "training_log.txt"

def log(logtext: str = "") -> None:
    print(logtext)
    with open(logfile_name, "a", encoding="utf8") as f:
        f.write(str(logtext) + "\n")
    

def train_model(model: LSTMTokenScorer, criterion, optimizer, scheduler, num_epochs=1):
    since = time.time()
    with TemporaryDirectory() as tempdir:
        best_model_params_path = os.path.join(tempdir, 'best_model_params.pt')

        torch.save(model.state_dict(), best_model_params_path)
        best_acc = 0.0
        best_loss = 9999999999999

        for epoch in range(num_epochs):
            log(f'Epoch {epoch}/{num_epochs - 1}')
            log('-' * 10)

            # Each epoch has a training and validation phase
            for phase in ['train']: #, 'test'
                if phase == 'train':
                    model.train()
                else: 
                    model.eval()
                
                running_loss = 0.0
                running_corrects = 0

                batch = random.choices(list(train_data_preprocessed.values()), k=64)

                # Iterate over data.
                for i, datum_preprocessed in enumerate(batch):
                    # zero the parameter gradients
                    optimizer.zero_grad()

                    pn_history_tokens = datum_preprocessed["pn_history_tokens"]
                    scores = datum_preprocessed["scores"]
                    feature_tokens = datum_preprocessed["feature_tokens"]

                    # track history only in train
                    with torch.set_grad_enabled(phase == 'train'):
                        outputs = model(pn_history_tokens, feature_tokens)
                        loss = criterion(outputs.float(), scores.float())

                        pred = (outputs > 0.9).int()
                        # backward + optimize only if in training phase
                        if phase == 'train':
                            loss.backward()
                            optimizer.step()

                    # statistics
                    running_loss += loss.item()
                    if torch.equal(pred, scores):
                        running_corrects += 1
                    
                    if i == len(batch) - 1 and epoch % 20 == 19:
                        log("LSTM output:")
                        log(outputs)
                        log("Truth:")
                        log(scores)

                if phase == 'train':
                    scheduler.step()

                epoch_loss = running_loss # / dataset_sizes[phase]
                epoch_acc = running_corrects # / dataset_sizes[phase]
                log(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f} Time elapsed: {round((time.time() - since))} sec.')
                
                # deep copy the model
                if phase == 'test' and epoch_loss < best_loss: #epoch_acc > best_acc:
                    best_acc = epoch_acc
                    best_loss = epoch_loss
                    torch.save(model.state_dict(), best_model_params_path)

            log()

        time_elapsed = time.time() - since
        log(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
        log(f'Best val Acc: {best_acc:4f}')

        # load best model weights
        model.load_state_dict(torch.load(best_model_params_path))
    return model

open(logfile_name, "w", encoding="utf8")
    
model = train_model(model, loss_function, optimizer, exp_lr_scheduler, num_epochs=100)

In [None]:
a = torch.tensor([0, 0, 0], dtype=torch.float)
b = torch.tensor([0, 0, 1], dtype=torch.float)
c = torch.tensor([1, 1, 0], dtype=torch.float)
d = torch.tensor([0, 1, 0], dtype=torch.float)

nn.functional.binary_cross_entropy_with_logits(a, b, pos_weight=torch.full((3,), 2))