In [45]:
# CNN
import torch.nn.functional as F
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.backends.cudnn as cudnn

# others
import numpy as np
import matplotlib.pyplot as plt
import time
import os
from PIL import Image
from tempfile import TemporaryDirectory
import time

# dataset
import torchvision
from torchvision import datasets, models, transforms
from torchvision.datasets import Flowers102

# read file 
import pandas as pd

# label
from scipy.io import loadmat
import json
from tqdm import tqdm
from itertools import islice

# Use TweetEval emotion recognition dataset 

In [2]:
# root = '../../Data/tweeteval/datasets/emotion/'
# mapping_file = os.path.join(root, 'mapping.txt')
# test_labels_file = os.path.join(root, 'test_labels.txt')
# test_text_file = os.path.join(root, 'test_text.txt')
# train_labels_file = os.path.join(root, 'train_labels.txt')
# train_text_file = os.path.join(root, 'train_text.txt')
# val_labels_file = os.path.join(root, 'val_labels.txt')
# val_text_file = os.path.join(root, 'val_text.txt')

In [3]:
# mapping_pd = pd.read_csv(mapping_file, sep='\t', header=None)
# test_label_pd = pd.read_csv(test_labels_file, sep='\t', header=None)
# test_dataset = open(test_text_file).read().split('\n')[:-1] # remove last empty line 
# train_label_pd = pd.read_csv(train_labels_file, sep='\t', header=None)
# train_dataset = open(train_text_file).read().split('\n')[:-1] # remove last empty line
# val_label_pd = pd.read_csv(val_labels_file, sep='\t', header=None)
# val_dataset = open(val_text_file).read().split('\n')[:-1] # remove last empty line

# Preprocess training data
- Given: Notes with ranges and labels
- Transform into label + lists of tokens with [does token describe label]

In [4]:
root = './data/'
features_path = os.path.join(root, 'features.csv')
patient_notes_path = os.path.join(root, 'patient_notes.csv')
sample_submission_path = os.path.join(root, 'sample_submission.csv')
test_path = os.path.join(root, 'test.csv')
train_path = os.path.join(root, 'train.csv')
features = pd.read_csv(features_path, sep=',', header=0)
patient_notes = pd.read_csv(patient_notes_path, sep=',', header=0)
train_raw = pd.read_csv(train_path, sep=',', header=0)


## intro 
- `case_num`: 0~9, each num belongs their groups ... ? 
- `pn_num`: the id in patient_notes.csv which is 'pn_history', present the note of each case 
- `feature_num`: the id in features.csv which is 'feature_num', present the feature of each case 
- `location`: 

In [5]:
import re
def df_string2list_of_ints(df_string: str):
    df_string = df_string.strip("[]")
    if df_string == "":
        return []
    entries = re.split(",|;", df_string)
    entries = [entry.strip(" '") for entry in entries]
    ranges = [tuple(int(num_as_str) for num_as_str in entry.split(" ")) for entry in entries]
    return ranges

In [6]:
data_merged = train_raw.merge(features, on=['feature_num', 'case_num'], how='left')
data_merged = data_merged.merge(patient_notes, on=['pn_num', 'case_num'], how='left')
data_merged["location"] = data_merged["location"].apply(df_string2list_of_ints)
data_merged.head()

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history
0,00016_000,0,16,0,['dad with recent heart attcak'],"[(696, 724)]",Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...
1,00016_001,0,16,1,"['mom with ""thyroid disease']","[(668, 693)]",Family-history-of-thyroid-disorder,HPI: 17yo M presents with palpitations. Patien...
2,00016_002,0,16,2,['chest pressure'],"[(203, 217)]",Chest-pressure,HPI: 17yo M presents with palpitations. Patien...
3,00016_003,0,16,3,"['intermittent episodes', 'episode']","[(70, 91), (176, 183)]",Intermittent-symptoms,HPI: 17yo M presents with palpitations. Patien...
4,00016_004,0,16,4,['felt as if he were going to pass out'],"[(222, 258)]",Lightheaded,HPI: 17yo M presents with palpitations. Patien...


In [7]:
train = data_merged[["feature_num", "pn_history", "location", ]]
train.head()

Unnamed: 0,feature_num,pn_history,location
0,0,HPI: 17yo M presents with palpitations. Patien...,"[(696, 724)]"
1,1,HPI: 17yo M presents with palpitations. Patien...,"[(668, 693)]"
2,2,HPI: 17yo M presents with palpitations. Patien...,"[(203, 217)]"
3,3,HPI: 17yo M presents with palpitations. Patien...,"[(70, 91), (176, 183)]"
4,4,HPI: 17yo M presents with palpitations. Patien...,"[(222, 258)]"


In [8]:
# filter training data with no location
train = train[train["location"].apply(lambda row: len(row) != 0)]

In [9]:
print(f'Size of dataset= {len(train)}')

Size of dataset= 9901


## Tokenization
- Use spaCy to split the notes into words.

Before start using spaCy
```
conda install -c conda-forge spacy
python -m spacy download en_core_web_sm
```

In [10]:
import spacy 
from collections import Counter

# use spacy to tokenize the sentence with english model 
nlp = spacy.load("en_core_web_sm")


In [16]:
# Create vocabulary by getting the most common words across (unique) patient histories
import pickle
import os
from os.path import join as pathjoin

cache_dir = "cache"
cache_file = pathjoin(cache_dir, "vocab.pkl")
if os.path.isfile(cache_file):
    with open(cache_file, "rb") as f:
        vocab = pickle.load(f)
    print("Vocabulary loaded from cache.")
else:
    print("Found no cached vocabulary. Creating...")
    text_to_count_tokens = ' '.join(train["pn_history"].drop_duplicates())
    # use spacy to tokenize the sentence 
    doc = nlp(text_to_count_tokens)
    # Get the most frequent words, filtering out stop words and punctuation.
    word_freq = Counter(token.text.lower() for token in doc if \
                        not token.is_punct and \
                            not token.is_stop and \
                                not token.is_space)

    most_common_words = word_freq.most_common(5000)
    vocab = {word[0]: idx for idx, word in enumerate(most_common_words)}
    with open(cache_file, "wb") as f:
        pickle.dump(vocab, f)

print("Top 10 words: ", ", ".join(list(vocab)[:10]))
# [(k, v) for k, v in vocab.items() if v == 0]

Top 10 words:  pain, 2, denies, ago, 3, pmh, months, changes, 4, use


In [38]:
from typing import Dict, List

placeholder_index = 5000



cache_file = pathjoin(cache_dir, "tokenized_pn_histories.pkl")
if os.path.isfile(cache_file):
    with open(cache_file, "rb") as f:
        tokenized_pn_histories = pickle.load(f)
    print("Tokenized patient histories loaded from cache.")
else:
    print("Found no cached tokenized patient histories. Tokenizing...")
    tokenized_pn_histories: Dict[str, List[str]] = {}
    for pn_history in tqdm(train["pn_history"]):
        indexed_words = []
        if pn_history in tokenized_pn_histories:
            continue
        for token in nlp(pn_history):
            if not token.is_punct and not token.is_stop and not token.is_space:
                word = token.text.lower()
                start_idx = token.idx
                end_idx = token.idx + len(token.text)

                word_as_number = vocab[word] if word in vocab else placeholder_index
                
                indexed_words.append({
                    "word_idx": word_as_number,
                    "start": start_idx,
                    "end": end_idx
                })
                    
        tokenized_pn_histories[pn_history] = indexed_words
    with open(cache_file, "wb") as f:
        pickle.dump(tokenized_pn_histories, f)


Found no cached tokenized patient histories. Tokenizing...


100%|██████████| 9901/9901 [00:47<00:00, 208.33it/s]


- Follow the example described here. Use the same architecture, but:
  - only use the last output of the LSTM in the loss function
  - use an embedding dim of 128
  - use a hidden dim of 256.  

## Get feature-relevancy of tokens via char ranges

In [94]:
total_feature_num = lentrain["feature_num"].drop_duplicates()
# def encode_feature(feature_num: int) -> np.array:
    # return   

0          0
1          1
2          2
3          3
4          4
        ... 
12637    903
12638    904
12641    907
12673    905
13223    911
Name: feature_num, Length: 143, dtype: int64

In [89]:
train_tokens_with_scores = dict()
for i, (feature_num, pn_history, location) in tqdm(train.iterrows()):
    tokenized_history = tokenized_pn_histories[pn_history]
    tokens_with_scores = []
    for token in tokenized_history:
        for feature_relevant_range in location:
            token_start, token_end = token["start"], token["end"]
            range_start, range_end = feature_relevant_range[0], feature_relevant_range[1]
            
            percentage_of_token_in_range = max(min(token_end, range_end)+1 - max(token_start, range_start), 0) / (token_end+1 - token_start)
            # if percentage_of_token_in_range > 0:
            #     print(percentage_of_token_in_range, token, feature_relevant_range)
            tokens_with_scores.append({"feature_num": feature_num, "word": token["word_idx"], "score": int(percentage_of_token_in_range > 0.9)})
    train_tokens_with_scores[i] = tokens_with_scores
        

9901it [00:05, 1657.50it/s]


In [101]:
print("data format:")
train_tokens_with_scores[0][:5]

data format:


[{'feature_num': 0, 'word': 75, 'score': 0},
 {'feature_num': 0, 'word': 599, 'score': 0},
 {'feature_num': 0, 'word': 116, 'score': 0},
 {'feature_num': 0, 'word': 45, 'score': 0},
 {'feature_num': 0, 'word': 44, 'score': 0}]

# TODO: Bring feature label into training data for LSTM!
- must encode the feature label (one out of 143 numbers) into the LSTM input data to train. How to do it?

In [None]:
m

In [None]:
mapping = dict(zip(mapping_pd[0], mapping_pd[1]))
mapping

In [None]:
EMBEDDING_DIM = 128 # 將word轉換成維度為128的向量
HIDDEN_DIM = 256 # 在RNN或LSTM中模型中隱藏曾神經元的數量大小
word_to_ix = vocab # 詞彙表
tag_to_ix = mapping # 標籤表

In [None]:
class LSTMTagger(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size, dropout=0.0):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, dropout=dropout)

        self.hidden2score = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        # Take only the last output of the LSTM
        # last_output = lstm_out[-1].view(1, -1)  # Selecting the last output
        output = lstm_out.view(len(sentence), -1)
        tag_space = self.hidden2score(output) # 將LSTM模型的最後輸出轉換成 詞標籤 空間
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [None]:
len(word_to_ix)+1

In [None]:
# vocab_size 要添加 1 因為如果 sentence 中有出現沒在 vocab 中的單字，使用 5000 來代替，所以要加 1
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix)+1, len(tag_to_ix))
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

In [None]:
# for sentence to sequence 
def prepare_sentence_sequence(seq, to_ix):
    idx = []
    # use spacy to tokenize the sentence 
    for token in nlp(seq):
        # filter out the punctuation and stop words and space 
        if not token.is_punct and not token.is_stop and not token.is_space:
            word = token.text
            # if the token is in the top 5000 words in the vocab, add its index to the list
            if word in to_ix:
                idx.append(to_ix[word])
            else:
                # else add the index of the placeholder token
                idx.append(placeholder_index)
    return torch.tensor(idx, dtype=torch.long)

In [None]:
def one_hot_encode(val, to_ix):
    result = []
    for k, v in to_ix.items(): 
        if val == k:
            result.append(1)
        else:
            result.append(0)
    return torch.tensor(result, dtype=torch.float32)

In [None]:
print(mapping)
print(one_hot_encode(2, tag_to_ix))

In [None]:

# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i. ; j有3個、i有5個字，i, j 表示第i個字的第j個tag的分數
# Here we don't need to train, so the code is wrapped in torch.no_grad()
sentence_idx = 1
with torch.no_grad():
    inputs = prepare_sentence_sequence(train_dataset[sentence_idx], word_to_ix)
    labels = one_hot_encode(train_label_pd[0][sentence_idx], tag_to_ix)
    outputs = model(inputs)
    _, preds = torch.max(outputs, 1)
    result_idx = torch.argmax(outputs).item()
    loss = loss_function(outputs[0], labels)

    print(f'First Sentense = {train_dataset[sentence_idx]}')
    print(f'Sentense to tensor = {inputs}')
    print(f'Sentense of result to tensor = {labels}')
    print(f'tag_scores = {outputs}')
    print(f'loss = {loss}')
    print(f'preds = {preds}')
    print(f'result = {result_idx}, ans = {train_label_pd[0][sentence_idx]}')

In [None]:
dataloaders = {'train': train_dataset, 'test': test_dataset}
resultloaders = {'train': train_label_pd[0].tolist(), 'test': test_label_pd[0].tolist()}
dataset_sizes = {x: len(dataloaders[x]) for x in ['train', 'test']}
dataset_sizes

In [None]:
phase = 'train'

len(resultloaders[phase])

In [None]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=1):
    since = time.time()
    with TemporaryDirectory() as tempdir:
        best_model_params_path = os.path.join(tempdir, 'best_model_params.pt')

        torch.save(model.state_dict(), best_model_params_path)
        best_acc = 0.0

        for epoch in range(num_epochs):
            print(f'Epoch {epoch}/{num_epochs - 1}')
            print('-' * 10)

            # Each epoch has a training and validation phase
            for phase in ['train', 'test']:
                if phase == 'train':
                    model.train()
                else: 
                    model.eval()
                
                running_loss = 0.0
                running_corrects = 0

                # Iterate over data.
                for input, label in zip(dataloaders[phase], resultloaders[phase]):
                    inputs_vector = prepare_sentence_sequence(input, word_to_ix)
                    labels_vector = one_hot_encode(label, tag_to_ix)
                    
                    # zero the parameter gradients
                    optimizer.zero_grad()

                    # forward
                    # track history if only in train
                    with torch.set_grad_enabled(phase == 'train'):
                        outputs = model(inputs_vector) # 取得針對每個emotion的預測結果tensor (e.g. tensor([[-1.3948, -1.4476, -1.3804, -1.3261]]))
                        pred = torch.argmax(outputs).item() # 取得最大值的index (e.g. 2)
                        loss = criterion(outputs[0], labels_vector) # 外面還有一層，只需取得內層 [-1.3948, -1.4476, -1.3804, -1.3261] 與 [0, 0, 1, 0] 的計算loss

                        # backward + optimize only if in training phase
                        if phase == 'train':
                            loss.backward()
                            optimizer.step()

                    # statistics
                    running_loss += loss.item()
                    if pred == label:
                        running_corrects += 1

                if phase == 'train':
                    scheduler.step()

                epoch_loss = running_loss / dataset_sizes[phase]
                epoch_acc = running_corrects / dataset_sizes[phase]
                print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f} Time elapsed: {round((time.time() - since))} sec.')
                
                # deep copy the model
                if phase == 'test' and epoch_acc > best_acc:
                    best_acc = epoch_acc
                    torch.save(model.state_dict(), best_model_params_path)

            print()

        time_elapsed = time.time() - since
        print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
        print(f'Best val Acc: {best_acc:4f}')

        # load best model weights
        model.load_state_dict(torch.load(best_model_params_path))
    return model
            

In [None]:
model = train_model(model, loss_function, optimizer, exp_lr_scheduler, num_epochs=30)

In [None]:
# vocab_size 要添加 2 因為如果 sentence 中有出現沒在 vocab 中的單字，使用 5001 來代替，所以要加 1
model_LSTM = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix)+1, len(tag_to_ix), dropout=0.5)
loss_function_LSTM = nn.CrossEntropyLoss()
optimizer_LSTM = optim.SGD(model_LSTM.parameters(), lr=0.001, momentum=0.9)
exp_lr_scheduler_LSTM = lr_scheduler.StepLR(optimizer_LSTM, step_size=7, gamma=0.1)

In [None]:
modelLSTM = train_model(model_LSTM, loss_function_LSTM, optimizer_LSTM, exp_lr_scheduler_LSTM, num_epochs=30)