In [1]:
# CNN
import torch.nn.functional as F
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.backends.cudnn as cudnn

# others
import numpy as np
import matplotlib.pyplot as plt
import time
import os
from PIL import Image
from tempfile import TemporaryDirectory
import time

# dataset
import torchvision
from torchvision import datasets, models, transforms
from torchvision.datasets import Flowers102

# read file 
import pandas as pd

# label
from scipy.io import loadmat
import json
from tqdm import tqdm
from itertools import islice

# Use TweetEval emotion recognition dataset 

In [2]:
# root = '../../Data/tweeteval/datasets/emotion/'
# mapping_file = os.path.join(root, 'mapping.txt')
# test_labels_file = os.path.join(root, 'test_labels.txt')
# test_text_file = os.path.join(root, 'test_text.txt')
# train_labels_file = os.path.join(root, 'train_labels.txt')
# train_text_file = os.path.join(root, 'train_text.txt')
# val_labels_file = os.path.join(root, 'val_labels.txt')
# val_text_file = os.path.join(root, 'val_text.txt')

In [3]:
# mapping_pd = pd.read_csv(mapping_file, sep='\t', header=None)
# test_label_pd = pd.read_csv(test_labels_file, sep='\t', header=None)
# test_dataset = open(test_text_file).read().split('\n')[:-1] # remove last empty line 
# train_label_pd = pd.read_csv(train_labels_file, sep='\t', header=None)
# train_dataset = open(train_text_file).read().split('\n')[:-1] # remove last empty line
# val_label_pd = pd.read_csv(val_labels_file, sep='\t', header=None)
# val_dataset = open(val_text_file).read().split('\n')[:-1] # remove last empty line

# Preprocess training data
- Given: Notes with ranges and labels
- Transform into label + lists of tokens with [does token describe label]

In [4]:
root = './data/'
features_path = os.path.join(root, 'features.csv')
patient_notes_path = os.path.join(root, 'patient_notes.csv')
sample_submission_path = os.path.join(root, 'sample_submission.csv')
test_path = os.path.join(root, 'test.csv')
train_path = os.path.join(root, 'train.csv')
features = pd.read_csv(features_path, sep=',', header=0)
patient_notes = pd.read_csv(patient_notes_path, sep=',', header=0)
train_raw = pd.read_csv(train_path, sep=',', header=0)


In [5]:
# unusual_numbers = features["feature_num"].value_counts()[features["feature_num"].value_counts() != 1]
# unusual_numbers
features[features["feature_text"] == "Female"]
# features["feature_num"] == 

Unnamed: 0,feature_num,case_num,feature_text
25,112,1,Female
34,208,2,Female
66,407,4,Female
70,501,5,Female
99,700,7,Female
110,802,8,Female
139,913,9,Female


## intro 
- `case_num`: 0~9, each num belongs their groups ... ? 
- `pn_num`: the id in patient_notes.csv which is 'pn_history', present the note of each case 
- `feature_num`: the id in features.csv which is 'feature_num', present the feature of each case 
- `location`: 

In [6]:
import re
def df_string2list_of_ints(df_string: str):
    df_string = df_string.strip("[]")
    if df_string == "":
        return []
    entries = re.split(",|;", df_string)
    entries = [entry.strip(" '") for entry in entries]
    ranges = [tuple(int(num_as_str) for num_as_str in entry.split(" ")) for entry in entries]
    return ranges

In [7]:
train_raw

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location
0,00016_000,0,16,0,['dad with recent heart attcak'],['696 724']
1,00016_001,0,16,1,"['mom with ""thyroid disease']",['668 693']
2,00016_002,0,16,2,['chest pressure'],['203 217']
3,00016_003,0,16,3,"['intermittent episodes', 'episode']","['70 91', '176 183']"
4,00016_004,0,16,4,['felt as if he were going to pass out'],['222 258']
...,...,...,...,...,...,...
14295,95333_912,9,95333,912,[],[]
14296,95333_913,9,95333,913,[],[]
14297,95333_914,9,95333,914,['photobia'],['274 282']
14298,95333_915,9,95333,915,['no sick contacts'],['421 437']


In [8]:
data_merged = train_raw.merge(features, on=['feature_num', 'case_num'], how='left')
data_merged = data_merged.merge(patient_notes, on=['pn_num', 'case_num'], how='left')
data_merged["location"] = data_merged["location"].apply(df_string2list_of_ints)
data_merged.head()

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history
0,00016_000,0,16,0,['dad with recent heart attcak'],"[(696, 724)]",Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...
1,00016_001,0,16,1,"['mom with ""thyroid disease']","[(668, 693)]",Family-history-of-thyroid-disorder,HPI: 17yo M presents with palpitations. Patien...
2,00016_002,0,16,2,['chest pressure'],"[(203, 217)]",Chest-pressure,HPI: 17yo M presents with palpitations. Patien...
3,00016_003,0,16,3,"['intermittent episodes', 'episode']","[(70, 91), (176, 183)]",Intermittent-symptoms,HPI: 17yo M presents with palpitations. Patien...
4,00016_004,0,16,4,['felt as if he were going to pass out'],"[(222, 258)]",Lightheaded,HPI: 17yo M presents with palpitations. Patien...


In [9]:
train = data_merged[["feature_text", "pn_history", "location", ]]
train.head()

Unnamed: 0,feature_text,pn_history,location
0,Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...,"[(696, 724)]"
1,Family-history-of-thyroid-disorder,HPI: 17yo M presents with palpitations. Patien...,"[(668, 693)]"
2,Chest-pressure,HPI: 17yo M presents with palpitations. Patien...,"[(203, 217)]"
3,Intermittent-symptoms,HPI: 17yo M presents with palpitations. Patien...,"[(70, 91), (176, 183)]"
4,Lightheaded,HPI: 17yo M presents with palpitations. Patien...,"[(222, 258)]"


In [10]:
# filter training data with no location
train = train[train["location"].apply(lambda row: len(row) != 0)]

In [11]:
print(f'Size of dataset= {len(train)}')

Size of dataset= 9901


## Tokenization
- Use spaCy to split the notes into words.

Before start using spaCy
```
conda install -c conda-forge spacy
python -m spacy download en_core_web_sm
```

In [12]:
import spacy 
from collections import Counter

# use spacy to tokenize the sentence with english model 
nlp = spacy.load("en_core_web_sm")


In [13]:
from typing import List, Iterable

def build_vocab_from_lines(lines: Iterable[str]):
    text_to_count_tokens = ' '.join(lines)
    doc = nlp(text_to_count_tokens)
    # Get the most frequent words, filtering out stop words and punctuation.
    word_freq = Counter(token.text.lower() for token in doc if \
                        not token.is_punct and \
                            not token.is_stop and \
                                not token.is_space)
    return word_freq.most_common()

In [14]:
# Create vocabulary by getting the most common words across (unique) patient histories
import pickle
import os
from os.path import join as pathjoin

cache_dir = "cache"
cache_file = pathjoin(cache_dir, "vocab.pkl")
if os.path.isfile(cache_file):
    with open(cache_file, "rb") as f:
        pn_history_vocab = pickle.load(f)
    print("Vocabulary loaded from cache.")
else:
    print("Found no cached vocabulary. Creating...")
    
    most_common_words = build_vocab_from_lines(train["pn_history"].drop_duplicates())[:5000]
    
    pn_history_vocab = {word[0]: idx for idx, word in enumerate(most_common_words)}
    with open(cache_file, "wb") as f:
        pickle.dump(pn_history_vocab, f)

print("Top 10 words: ", ", ".join(list(pn_history_vocab)[:10]))
# [(k, v) for k, v in vocab.items() if v == 0]

Found no cached vocabulary. Creating...
Top 10 words:  pain, 2, denies, ago, 3, pmh, months, changes, 4, use


In [15]:
# Create vocabulary by getting the most common words across (unique) patient histories
import pickle
import os
from os.path import join as pathjoin

cache_dir = "cache"
cache_file = pathjoin(cache_dir, "feature_vocab.pkl")
if os.path.isfile(cache_file):
    with open(cache_file, "rb") as f:
        feature_vocab = pickle.load(f)
    print("Feature vocabulary loaded from cache.")
else:
    print("Found no cached feature vocabulary. Creating...")
    
    most_common_words = build_vocab_from_lines(train["feature_text"].drop_duplicates())[:5000]
    
    feature_vocab = {word[0]: idx for idx, word in enumerate(most_common_words)}
    with open(cache_file, "wb") as f:
        pickle.dump(feature_vocab, f)

print("Top 10 words: ", ", ".join(list(feature_vocab)[:10]))
# [(k, v) for k, v in vocab.items() if v == 0]

Found no cached feature vocabulary. Creating...
Top 10 words:  symptoms, ago, year, history, use, months, pain, family, recent, irregular


In [16]:
from typing import Dict, List

placeholder_index = 5000

cache_file = pathjoin(cache_dir, "tokenized_pn_histories.pkl")
if os.path.isfile(cache_file):
    with open(cache_file, "rb") as f:
        tokenized_pn_histories = pickle.load(f)
    print("Tokenized patient histories loaded from cache.")
else:
    print("Found no cached tokenized patient histories. Tokenizing...")
    tokenized_pn_histories: Dict[str, List[str]] = {}
    for pn_history in tqdm(train["pn_history"]):
        indexed_words = []
        if pn_history in tokenized_pn_histories:
            continue
        for token in nlp(pn_history):
            if not token.is_punct and not token.is_stop and not token.is_space:
                word = token.text.lower()
                start_idx = token.idx
                end_idx = token.idx + len(token.text)

                word_as_number = pn_history_vocab[word] if word in pn_history_vocab else placeholder_index
                
                indexed_words.append({
                    "word_idx": word_as_number,
                    "start": start_idx,
                    "end": end_idx
                })
                    
        tokenized_pn_histories[pn_history] = indexed_words
    with open(cache_file, "wb") as f:
        pickle.dump(tokenized_pn_histories, f)


Found no cached tokenized patient histories. Tokenizing...


  0%|          | 0/9901 [00:00<?, ?it/s]

100%|██████████| 9901/9901 [00:25<00:00, 387.39it/s]


In [23]:
from typing import Dict, List

placeholder_index = len(feature_vocab)

cache_file = pathjoin(cache_dir, "tokenized_features.pkl")
if os.path.isfile(cache_file):
    with open(cache_file, "rb") as f:
        tokenized_features = pickle.load(f)
    print("Tokenized features loaded from cache.")
else:
    print("Found no cached tokenized features. Tokenizing...")
    tokenized_features: Dict[str, List[str]] = {}
    for feature_text in tqdm(train["feature_text"]):
        indexed_words = []
        if feature_text in tokenized_features:
            continue
        for token in nlp(feature_text):
            if not token.is_punct and not token.is_stop and not token.is_space:
                word = token.text.lower()
                word_as_number = feature_vocab[word] if word in feature_vocab else placeholder_index
                
                indexed_words.append(word_as_number)
                    
        tokenized_features[feature_text] = indexed_words
    with open(cache_file, "wb") as f:
        pickle.dump(tokenized_features, f)


Found no cached tokenized features. Tokenizing...


100%|██████████| 9901/9901 [00:00<00:00, 10313.41it/s]


In [24]:
tokenized_features

{'Family-history-of-MI-OR-Family-history-of-myocardial-infarction': [7,
  3,
  70,
  7,
  3,
  71,
  72],
 'Family-history-of-thyroid-disorder': [7, 3, 31, 73],
 'Chest-pressure': [15, 74],
 'Intermittent-symptoms': [32, 0],
 'Lightheaded': [75],
 'Adderall-use': [76, 4],
 'heart-pounding-OR-heart-racing': [16, 77, 16, 33],
 'Few-months-duration': [5, 11],
 '17-year': [78, 2],
 'Male': [79],
 'Shortness-of-breath': [17, 12],
 'No-hair-changes-OR-no-nail-changes-OR-no-temperature-intolerance': [80,
  34,
  81,
  34,
  82,
  83],
 'Caffeine-use': [18, 4],
 'No-vaginal-discharge': [35, 84],
 'Not-sexually-active': [36, 37],
 '20-year': [85, 2],
 'Recurrent-bouts-over-past-6-months': [86, 87, 88, 5],
 'Right-sided-LQ-abdominal-pain-OR-Right-lower-quadrant-abdominal-pain': [38,
  89,
  90,
  39,
  6,
  38,
  91,
  92,
  39,
  6],
 'No-urinary-symptoms': [93, 0],
 'Normal-LMP-2-weeks-ago-OR-Normal-last-menstrual-period-2-weeks-ago': [19,
  40,
  13,
  1,
  19,
  20,
  21,
  13,
  1],
 '8-to-

- Follow the example described here. Use the same architecture, but:
  - only use the last output of the LSTM in the loss function
  - use an embedding dim of 128
  - use a hidden dim of 256.  

## Get feature-relevancy of tokens via char ranges

In [25]:
train_data_preprocessed = dict()
for i, (feature_text, pn_history, location) in tqdm(train.iterrows()):
    tokenized_history = tokenized_pn_histories[pn_history]
    tokens_with_scores = []
    for token in tokenized_history:
        for feature_relevant_range in location:
            token_start, token_end = token["start"], token["end"]
            range_start, range_end = feature_relevant_range[0], feature_relevant_range[1]
            
            percentage_of_token_in_range = max(min(token_end, range_end)+1 - max(token_start, range_start), 0) / (token_end+1 - token_start)
            # if percentage_of_token_in_range > 0:
            #     print(percentage_of_token_in_range, token, feature_relevant_range)
            tokens_with_scores.append({"word": token["word_idx"], "score": int(percentage_of_token_in_range > 0.9)})
    
    train_data_preprocessed[i] = {
                                    "scored_tokens": tokens_with_scores,
                                    "feature_tokens": tokenized_features[feature_text]
                                   }
        

9901it [00:03, 2538.07it/s]


In [26]:
# print("data format:")
# train_data_preprocessed[0].keys()
train_data_preprocessed[0]["feature_tokens"]

[7, 3, 70, 7, 3, 71, 72]

# TODO Bring feature label into training data for LSTM!
- must encode the feature text into the LSTM input data to train. How to do it?
- 2 vocabs

Layers in LSTM Model:
1. embed feature tokens
2. lstm feature -> constant size vector

3. pass to 2nd lstm
	

In [27]:
EMBEDDING_DIM = 128
HIDDEN_DIM = 256


In [59]:
class LSTMTokenScorer(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, pn_hist_vocab_size, feature_vocab_size, dropout=0.0):
        super(LSTMTokenScorer, self).__init__()

        self.pn_history_hidden_dim = hidden_dim

        self.feature_embeddings = nn.Embedding(feature_vocab_size, embedding_dim)
        self.feature_lstm = nn.LSTM(embedding_dim, embedding_dim, dropout=dropout) # the feature is now one tensor of size [embedding_dim].

        self.pn_history_embeddings = nn.Embedding(pn_hist_vocab_size, embedding_dim)
        
        self.total_lstm = nn.LSTM(embedding_dim * 2, self.pn_history_hidden_dim, dropout=dropout)
        
        self.hidden2score = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()



    def forward(self, pn_history, feature):
        feature_embeds = self.feature_embeddings(feature)
        feature_lstm_out, _ = self.feature_lstm(feature_embeds.view(len(feature), 1, -1)) # the feature is now one tensor of size [embedding_dim].
        feature_reduced = feature_lstm_out[-1][0] #.view(1, -1)
        
        feature_multiplied = feature_reduced.repeat((len(pn_history), 1)) # duplicate feature vector to be same size as embedded pn_history vector.

        pn_history_embeds = self.pn_history_embeddings(pn_history)
        pn_history_and_features = torch.concat((feature_multiplied, pn_history_embeds), dim=1)
        print(pn_history_and_features.size())

        pn_history_reduced, _ = self.total_lstm(pn_history_and_features)
        print(pn_history_reduced.size())
        pred_score_raw = torch.squeeze(self.hidden2score(pn_history_reduced))
        pred_score = self.sigmoid(pred_score_raw)
        print(pred_score.size())
        return pred_score

# make model with vocab sizes, including placeholder indices
model = LSTMTokenScorer(EMBEDDING_DIM, HIDDEN_DIM, len(pn_history_vocab)+1, len(feature_vocab)+1)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)


feature_tokens = train_data_preprocessed[0]["feature_tokens"]
scored_tokens = train_data_preprocessed[0]["scored_tokens"]
pn_history_tokens = [st["word"] for st in scored_tokens]
feature_tokens = torch.tensor(feature_tokens)
pn_history_tokens = torch.tensor(pn_history_tokens)
feature_tokens

model(pn_history_tokens, feature_tokens)
# model()

torch.Size([107, 256])
torch.Size([107, 256])
torch.Size([107])


tensor([0.5014, 0.5105, 0.5071, 0.5105, 0.5323, 0.5162, 0.4995, 0.5367, 0.5266,
        0.5095, 0.5028, 0.4818, 0.4828, 0.5033, 0.5186, 0.5095, 0.5109, 0.5034,
        0.5127, 0.5118, 0.4902, 0.4751, 0.4797, 0.4935, 0.5050, 0.5163, 0.5069,
        0.5152, 0.5110, 0.5332, 0.5232, 0.5181, 0.5392, 0.5083, 0.5222, 0.5075,
        0.4836, 0.4921, 0.5332, 0.4921, 0.4878, 0.5184, 0.5092, 0.4917, 0.4806,
        0.4667, 0.4855, 0.4982, 0.4869, 0.4797, 0.4967, 0.5022, 0.4671, 0.4767,
        0.4784, 0.5073, 0.5139, 0.5001, 0.4961, 0.4930, 0.4729, 0.4856, 0.5171,
        0.5032, 0.4991, 0.5205, 0.4938, 0.5067, 0.5076, 0.5240, 0.5043, 0.4922,
        0.4686, 0.4819, 0.4888, 0.4945, 0.4631, 0.4911, 0.5203, 0.5070, 0.5049,
        0.5200, 0.4939, 0.4839, 0.4629, 0.5073, 0.5268, 0.5432, 0.5292, 0.4912,
        0.5311, 0.5200, 0.4951, 0.4980, 0.4738, 0.4842, 0.5143, 0.4822, 0.5070,
        0.4914, 0.4882, 0.5058, 0.4945, 0.4993, 0.5078, 0.5015, 0.4955],
       grad_fn=<SigmoidBackward0>)

In [47]:
x = torch.tensor([1, 2, 3])
x.repeat((2, 1))

tensor([[1, 2, 3],
        [1, 2, 3]])

In [None]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=1):
    since = time.time()
    with TemporaryDirectory() as tempdir:
        best_model_params_path = os.path.join(tempdir, 'best_model_params.pt')

        torch.save(model.state_dict(), best_model_params_path)
        best_acc = 0.0

        for epoch in range(num_epochs):
            print(f'Epoch {epoch}/{num_epochs - 1}')
            print('-' * 10)

            # Each epoch has a training and validation phase
            for phase in ['train', 'test']:
                if phase == 'train':
                    model.train()
                else: 
                    model.eval()
                
                running_loss = 0.0
                running_corrects = 0

                # Iterate over data.
                for input, label in zip(dataloaders[phase], resultloaders[phase]):
                    inputs_vector = prepare_sentence_sequence(input, word_to_ix)
                    labels_vector = one_hot_encode(label, tag_to_ix)
                    
                    # zero the parameter gradients
                    optimizer.zero_grad()

                    # forward
                    # track history if only in train
                    with torch.set_grad_enabled(phase == 'train'):
                        outputs = model(inputs_vector) # 取得針對每個emotion的預測結果tensor (e.g. tensor([[-1.3948, -1.4476, -1.3804, -1.3261]]))
                        pred = torch.argmax(outputs).item() # 取得最大值的index (e.g. 2)
                        loss = criterion(outputs[0], labels_vector) # 外面還有一層，只需取得內層 [-1.3948, -1.4476, -1.3804, -1.3261] 與 [0, 0, 1, 0] 的計算loss

                        # backward + optimize only if in training phase
                        if phase == 'train':
                            loss.backward()
                            optimizer.step()

                    # statistics
                    running_loss += loss.item()
                    if pred == label:
                        running_corrects += 1

                if phase == 'train':
                    scheduler.step()

                epoch_loss = running_loss / dataset_sizes[phase]
                epoch_acc = running_corrects / dataset_sizes[phase]
                print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f} Time elapsed: {round((time.time() - since))} sec.')
                
                # deep copy the model
                if phase == 'test' and epoch_acc > best_acc:
                    best_acc = epoch_acc
                    torch.save(model.state_dict(), best_model_params_path)

            print()

        time_elapsed = time.time() - since
        print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
        print(f'Best val Acc: {best_acc:4f}')

        # load best model weights
        model.load_state_dict(torch.load(best_model_params_path))
    return model
            

In [None]:
model = train_model(model, loss_function, optimizer, exp_lr_scheduler, num_epochs=30)