In [1]:
import ast
import glob
import json

import os
import re

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

os.environ["TOKENIZERS_PARALLELISM"] = "false"




In [24]:
# NOTE(review): both values are machine-specific absolute paths; adjust them before running elsewhere.
# Serialized model config that a later cell reads with torch.load.
config_path = "/Users/shannon/Library/CloudStorage/OneDrive-國立臺灣科技大學/NTUST/Germany/HKA/01_AI/Code/AILab2023/input/roberta-large/config.pth"
# Directory holding patient_notes.csv, features.csv and test.csv (passed as `root` to load_and_prepare_test).
DATA_PATH = "/Users/shannon/Library/CloudStorage/OneDrive-國立臺灣科技大學/NTUST/Germany/HKA/01_AI/Code/AILab2023/Project/data/"


In [6]:
import torch 
# Deserialize the object stored at `config_path` and print it.
# NOTE(review): torch.load unpickles Python objects, so only load files from a trusted source.
loaded_data = torch.load(config_path)
print(loaded_data)

RobertaConfig {
  "_name_or_path": "roberta-large",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.35.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}



In [10]:
class Config:
    """Static settings shared by the cells of this notebook."""

    # --- Architecture ---
    name: str = "roberta-large"  # presumably the checkpoint identifier; not read in this notebook
    num_classes: int = 1

    # --- Texts ---
    max_len: int = 310  # fixed sequence length used for truncation and padding
    precompute_tokens: bool = True  # NOTE(review): flag is not read in the visible cells


# Load data

In [26]:
def process_feature_text(text):
    """Normalize a feature description string.

    Applies three substitutions in order: ``I-year`` becomes ``1-year``,
    ``-OR-`` becomes `` or ``, and every remaining hyphen becomes a space.

    Args:
        text: Feature description to normalize.

    Returns:
        The normalized string.
    """
    # Order matters: the generic hyphen rule must run last.
    substitutions = (
        ('I-year', '1-year'),
        ('-OR-', " or "),
        ('-', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text


def clean_spaces(txt):
    """Replace every newline, tab and carriage return with one space.

    Each character is swapped one-for-one, so the length of the string is
    unchanged; runs of whitespace are not collapsed.

    Args:
        txt: Text to clean.

    Returns:
        The text with ``\\n``, ``\\t`` and ``\\r`` replaced by spaces.
    """
    return re.sub('[\n\t\r]', ' ', txt)


def load_and_prepare_test(root=""):
    """Load the test rows and attach their feature and patient-note texts.

    Reads ``patient_notes.csv``, ``features.csv`` and ``test.csv`` from
    ``root``, left-joins the features (on ``case_num``/``feature_num``) and
    the notes (on ``case_num``/``pn_num``) onto the test rows, then cleans
    the text columns.

    Args:
        root: Directory containing the three CSV files. A trailing path
            separator is optional; the default ``""`` reads from the
            current working directory.

    Returns:
        pandas.DataFrame: the merged frame with a normalized
        ``feature_text``, a stripped ``pn_history``, a ``clean_text`` column
        derived from ``pn_history``, and a ``target`` column of empty strings.
    """
    def _read(file_name):
        # os.path.join adds the separator only when `root` does not end with one.
        return pd.read_csv(os.path.join(root, file_name))

    patient_notes = _read("patient_notes.csv")
    features = _read("features.csv")
    df = _read("test.csv")

    df = df.merge(features, how="left", on=["case_num", "feature_num"])
    df = df.merge(patient_notes, how="left", on=['case_num', 'pn_num'])

    df['pn_history'] = df['pn_history'].apply(lambda x: x.strip())
    df['feature_text'] = df['feature_text'].apply(process_feature_text)

    df['feature_text'] = df['feature_text'].apply(clean_spaces)
    df['clean_text'] = df['pn_history'].apply(clean_spaces)

    # Test rows carry no annotation, so the target is an empty placeholder.
    df['target'] = ""
    return df


In [32]:
# Build the merged test frame from the CSV files under DATA_PATH and preview the first rows.
df = load_and_prepare_test(root=DATA_PATH)
df.head()


Unnamed: 0,id,case_num,pn_num,feature_num,feature_text,pn_history,clean_text,target
0,00016_000,0,16,0,Family history of MI or Family history of myoc...,HPI: 17yo M presents with palpitations. Patien...,HPI: 17yo M presents with palpitations. Patien...,
1,00016_001,0,16,1,Family history of thyroid disorder,HPI: 17yo M presents with palpitations. Patien...,HPI: 17yo M presents with palpitations. Patien...,
2,00016_002,0,16,2,Chest pressure,HPI: 17yo M presents with palpitations. Patien...,HPI: 17yo M presents with palpitations. Patien...,
3,00016_003,0,16,3,Intermittent symptoms,HPI: 17yo M presents with palpitations. Patien...,HPI: 17yo M presents with palpitations. Patien...,
4,00016_004,0,16,4,Lightheaded,HPI: 17yo M presents with palpitations. Patien...,HPI: 17yo M presents with palpitations. Patien...,


## Tokenizer

In [14]:
import numpy as np
from transformers import AutoTokenizer, RobertaTokenizerFast

# NOTE(review): machine-specific absolute path to the saved tokenizer files.
tokenizer_path = "/Users/shannon/Library/CloudStorage/OneDrive-國立臺灣科技大學/NTUST/Germany/HKA/01_AI/Code/AILab2023/input/roberta-large/tokenizers"
# Load the tokenizer from the local directory and display its repr.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
tokenizer

RobertaTokenizerFast(name_or_path='/Users/shannon/Library/CloudStorage/OneDrive-國立臺灣科技大學/NTUST/Germany/HKA/01_AI/Code/AILab2023/input/roberta-large/tokenizers', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, speci

In [21]:
# Show the special-token strings defined by the tokenizer.
tokenizer.special_tokens_map

{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>'}

In [23]:
# Attach a small lookup of special-token ids to the tokenizer object so that
# later cells can build input sequences by hand.
tokenizer.special_tokens = {
        "sep": tokenizer.sep_token_id,
        "cls": tokenizer.cls_token_id,
        "pad": tokenizer.pad_token_id,
    }
tokenizer.special_tokens

{'sep': 2, 'cls': 0, 'pad': 1}

In [30]:
# Distinct feature descriptions; each one only needs to be tokenized once.
feature_texts = df["feature_text"].unique()
feature_texts

array(['Family history of MI or Family history of myocardial infarction',
       'Family history of thyroid disorder', 'Chest pressure',
       'Intermittent symptoms', 'Lightheaded'], dtype=object)

### Tokenize feature texts

In [40]:
ids = {}  # text -> list of token ids
offsets = {}  # text -> (start, end) character span of each token in the original text

# Tokenize every unique feature text without special tokens, print the
# intermediate representations for inspection, and cache the results.
for feature_text in feature_texts:
    encoding = tokenizer(
        feature_text,
        return_token_type_ids=True,
        return_offsets_mapping=True,
        return_attention_mask=False,
        add_special_tokens=False,
    )
    print("="*50)
    print(f'Feature: {feature_text}')
    print(f'input_ids: {encoding["input_ids"]}')
    print(f'offset_mapping: {encoding["offset_mapping"]}')
    encoding_ids = encoding["input_ids"]
    tokens = tokenizer.convert_ids_to_tokens(encoding_ids)
    print(f'convert_ids_to_tokens: {tokens}')
    sen = tokenizer.convert_tokens_to_string(tokens)
    print(f'convert_tokens_to_string: {sen}')

    # Store the token ids for this text (not the `ids` dict itself).
    ids[feature_text] = encoding_ids
    offsets[feature_text] = encoding["offset_mapping"]

Feature: Family history of MI or Family history of myocardial infarction
input_ids: [27818, 750, 9, 10931, 50, 3664, 750, 9, 127, 43682, 2617, 4047, 271, 14970]
offset_mapping: [(0, 6), (7, 14), (15, 17), (18, 20), (21, 23), (24, 30), (31, 38), (39, 41), (42, 44), (44, 49), (49, 52), (53, 56), (56, 58), (58, 63)]
convert_ids_to_tokens: ['Family', 'Ġhistory', 'Ġof', 'ĠMI', 'Ġor', 'ĠFamily', 'Ġhistory', 'Ġof', 'Ġmy', 'ocard', 'ial', 'Ġinf', 'ar', 'ction']
convert_tokens_to_string: Family history of MI or Family history of myocardial infarction
Feature: Family history of thyroid disorder
input_ids: [27818, 750, 9, 33670, 8364]
offset_mapping: [(0, 6), (7, 14), (15, 17), (18, 25), (26, 34)]
convert_ids_to_tokens: ['Family', 'Ġhistory', 'Ġof', 'Ġthyroid', 'Ġdisorder']
convert_tokens_to_string: Family history of thyroid disorder
Feature: Chest pressure
input_ids: [48351, 1164]
offset_mapping: [(0, 5), (6, 14)]
convert_ids_to_tokens: ['Chest', 'Ġpressure']
convert_tokens_to_string: Chest pres

## Tokenize the text

In [41]:
# Distinct cleaned patient notes; each one only needs to be tokenized once.
texts = df["clean_text"].unique()
texts

array(['HPI: 17yo M presents with palpitations. Patient reports 3-4 months of intermittent episodes of "heart beating/pounding out of my chest." 2 days ago during a soccer game had an episode, but this time had chest pressure and felt as if he were going to pass out (did not lose conciousness). Of note patient endorses abusing adderall, primarily to study (1-3 times per week). Before recent soccer game, took adderrall night before and morning of game. Denies shortness of breath, diaphoresis, fevers, chills, headache, fatigue, changes in sleep, changes in vision/hearing, abdominal paun, changes in bowel or urinary habits.  PMHx: none Rx: uses friends adderrall FHx: mom with "thyroid disease," dad with recent heart attcak All: none Immunizations: up to date SHx: Freshmen in college. Endorses 3-4 drinks 3 nights / week (on weekends), denies tabacco, endorses trying marijuana. Sexually active with girlfriend x 1 year, uses condoms'],
      dtype=object)

In [43]:
# Tokenize every unique patient note without special tokens, print the
# intermediate representations, and add the results to the `ids` / `offsets`
# dictionaries created in an earlier cell.
for text in texts:
    encoding = tokenizer(
        text,
        return_token_type_ids=True,
        return_offsets_mapping=True,
        return_attention_mask=False,
        add_special_tokens=False,
    )
    # Derive every representation first, then report them in one place.
    encoding_ids = encoding["input_ids"]
    tokens = tokenizer.convert_ids_to_tokens(encoding_ids)
    sen = tokenizer.convert_tokens_to_string(tokens)

    print("="*50)
    print(f'Feature: {text}')
    print(f'input_ids: {encoding_ids}')
    print(f'offset_mapping: {encoding["offset_mapping"]}')
    print(f'convert_ids_to_tokens: {tokens}')
    print(f'convert_tokens_to_string: {sen}')

    ids[text] = encoding_ids
    offsets[text] = encoding["offset_mapping"]

Feature: HPI: 17yo M presents with palpitations. Patient reports 3-4 months of intermittent episodes of "heart beating/pounding out of my chest." 2 days ago during a soccer game had an episode, but this time had chest pressure and felt as if he were going to pass out (did not lose conciousness). Of note patient endorses abusing adderall, primarily to study (1-3 times per week). Before recent soccer game, took adderrall night before and morning of game. Denies shortness of breath, diaphoresis, fevers, chills, headache, fatigue, changes in sleep, changes in vision/hearing, abdominal paun, changes in bowel or urinary habits.  PMHx: none Rx: uses friends adderrall FHx: mom with "thyroid disease," dad with recent heart attcak All: none Immunizations: up to date SHx: Freshmen in college. Endorses 3-4 drinks 3 nights / week (on weekends), denies tabacco, endorses trying marijuana. Sexually active with girlfriend x 1 year, uses condoms
input_ids: [725, 11337, 35, 601, 9839, 256, 6822, 19, 8750

## Define tokenize function

In [47]:
def precompute_tokens(df, tokenizer):
    """Tokenize each distinct feature text and note text exactly once.

    Args:
        df: DataFrame with ``feature_text`` and ``clean_text`` columns.
        tokenizer: Callable that accepts a string plus the keyword options
            used below and returns a mapping containing ``input_ids`` and
            ``offset_mapping``.

    Returns:
        dict: ``{"ids": ..., "offsets": ...}``. Both inner dicts are keyed by
        the raw string (feature texts first, then note texts); ``ids`` holds
        the token ids and ``offsets`` the offset mapping reported by the
        tokenizer, both produced without special tokens.
    """
    token_ids = {}
    token_offsets = {}

    # Feature texts are processed before the notes, so dict order matches
    # the column order here.
    for column in ("feature_text", "clean_text"):
        for string in df[column].unique():
            encoded = tokenizer(
                string,
                return_token_type_ids=True,
                return_offsets_mapping=True,
                return_attention_mask=False,
                add_special_tokens=False,
            )
            token_ids[string] = encoded["input_ids"]
            token_offsets[string] = encoded["offset_mapping"]

    return {"ids": token_ids, "offsets": token_offsets}


In [53]:
# Cache the per-string token ids and offsets on the tokenizer object for the cells below.
# NOTE(review): the bare expression displays the whole cache, which makes for a very long output.
tokenizer.precomputed = precompute_tokens(df, tokenizer)
tokenizer.precomputed

{'ids': {'Family history of MI or Family history of myocardial infarction': [27818,
   750,
   9,
   10931,
   50,
   3664,
   750,
   9,
   127,
   43682,
   2617,
   4047,
   271,
   14970],
  'Family history of thyroid disorder': [27818, 750, 9, 33670, 8364],
  'Chest pressure': [48351, 1164],
  'Intermittent symptoms': [1121, 1279, 2582, 1342, 5298],
  'Lightheaded': [31091, 19279],
  'HPI: 17yo M presents with palpitations. Patient reports 3-4 months of intermittent episodes of "heart beating/pounding out of my chest." 2 days ago during a soccer game had an episode, but this time had chest pressure and felt as if he were going to pass out (did not lose conciousness). Of note patient endorses abusing adderall, primarily to study (1-3 times per week). Before recent soccer game, took adderrall night before and morning of game. Denies shortness of breath, diaphoresis, fevers, chills, headache, fatigue, changes in sleep, changes in vision/hearing, abdominal paun, changes in bowel or ur

## Prepare Dataset

In [118]:
# Walk through the encoding of a single example: the first row of the frame.
feature_text = df['feature_text'].values[0]
text = df['clean_text'].values[0]
print(feature_text)
print(text)

precomputed = tokenizer.precomputed
print("Tokenized Feature:",precomputed["ids"][feature_text])

# Special-token ids:
#   sep = separator token
#   cls = classification (start-of-sequence) token
#   pad = padding token
tokens = tokenizer.special_tokens
print("Speical Tokens:", tokens)

# "Question" part of the input: <cls> + feature tokens + <sep>.
input_ids = [tokens["cls"]]+ precomputed["ids"][feature_text]+[tokens["sep"]]
n_question_tokens = len(input_ids)
print("Input ids:", input_ids)


Family history of MI or Family history of myocardial infarction
HPI: 17yo M presents with palpitations. Patient reports 3-4 months of intermittent episodes of "heart beating/pounding out of my chest." 2 days ago during a soccer game had an episode, but this time had chest pressure and felt as if he were going to pass out (did not lose conciousness). Of note patient endorses abusing adderall, primarily to study (1-3 times per week). Before recent soccer game, took adderrall night before and morning of game. Denies shortness of breath, diaphoresis, fevers, chills, headache, fatigue, changes in sleep, changes in vision/hearing, abdominal paun, changes in bowel or urinary habits.  PMHx: none Rx: uses friends adderrall FHx: mom with "thyroid disease," dad with recent heart attcak All: none Immunizations: up to date SHx: Freshmen in college. Endorses 3-4 drinks 3 nights / week (on weekends), denies tabacco, endorses trying marijuana. Sexually active with girlfriend x 1 year, uses condoms
Tok

Concatenate the tokenized `feature_text` and the tokenized `text` into a single sequence of input ids.

In [119]:
# Build the sequence from its parts instead of extending `input_ids` in
# place, so that re-running this cell gives the same result every time.
# question = <cls> + tokenized feature + <sep>
question_ids = [tokens["cls"]] + precomputed["ids"][feature_text] + [tokens["sep"]]
n_question_tokens = len(question_ids)
# input = question + tokenized clean_text (the closing <sep> is appended in a later cell)
input_ids = question_ids + precomputed["ids"][text]
print("Input ids:", input_ids)
input_tokens = tokenizer.convert_ids_to_tokens(input_ids)
print("Tokens to string:", tokenizer.convert_tokens_to_string(input_tokens))
print("length of input_ids:", len(input_ids))

Input ids: [0, 27818, 750, 9, 10931, 50, 3664, 750, 9, 127, 43682, 2617, 4047, 271, 14970, 2, 725, 11337, 35, 601, 9839, 256, 6822, 19, 8750, 17291, 1635, 4, 27690, 690, 155, 12, 306, 377, 9, 33073, 7585, 9, 22, 12690, 4108, 73, 642, 10773, 66, 9, 127, 7050, 72, 132, 360, 536, 148, 10, 4191, 177, 56, 41, 3238, 6, 53, 42, 86, 56, 7050, 1164, 8, 1299, 25, 114, 37, 58, 164, 7, 1323, 66, 36, 24001, 45, 2217, 10146, 6514, 1825, 322, 1525, 1591, 3186, 253, 34225, 16754, 1606, 254, 1250, 6, 4212, 7, 892, 36, 134, 12, 246, 498, 228, 186, 322, 3224, 485, 4191, 177, 6, 362, 1606, 14385, 1250, 363, 137, 8, 662, 9, 177, 4, 6743, 918, 765, 1825, 9, 8016, 6, 2269, 8258, 4765, 354, 6, 10668, 3697, 6, 1855, 5622, 6, 19344, 6, 16069, 6, 1022, 11, 3581, 6, 1022, 11, 3360, 73, 700, 5867, 6, 28670, 181, 7381, 6, 1022, 11, 29928, 50, 38653, 10095, 4, 1437, 2784, 725, 1178, 35, 4146, 44681, 35, 2939, 964, 1606, 14385, 1250, 274, 725, 1178, 35, 3795, 19, 22, 23875, 36866, 2199, 60, 4252, 19, 485, 1144, 15095

In [120]:
# Make sure the input_ids is not longer than max_len
# Keep at most max_len - 1 ids, then append the closing <sep>, so the result
# never exceeds max_len.
# NOTE(review): this reassigns `input_ids` from itself, so re-running the
# cell on a short sequence appends another <sep> each time.
input_ids = input_ids[: Config.max_len - 1] + [tokens["sep"]]

print("Max len:", Config.max_len)
print("Input ids:", input_ids)
input_tokens = tokenizer.convert_ids_to_tokens(input_ids)
print("Tokens to string:", tokenizer.convert_tokens_to_string(input_tokens))
print("len of input_ids:", len(input_ids))

Max len: 310
Input ids: [0, 27818, 750, 9, 10931, 50, 3664, 750, 9, 127, 43682, 2617, 4047, 271, 14970, 2, 725, 11337, 35, 601, 9839, 256, 6822, 19, 8750, 17291, 1635, 4, 27690, 690, 155, 12, 306, 377, 9, 33073, 7585, 9, 22, 12690, 4108, 73, 642, 10773, 66, 9, 127, 7050, 72, 132, 360, 536, 148, 10, 4191, 177, 56, 41, 3238, 6, 53, 42, 86, 56, 7050, 1164, 8, 1299, 25, 114, 37, 58, 164, 7, 1323, 66, 36, 24001, 45, 2217, 10146, 6514, 1825, 322, 1525, 1591, 3186, 253, 34225, 16754, 1606, 254, 1250, 6, 4212, 7, 892, 36, 134, 12, 246, 498, 228, 186, 322, 3224, 485, 4191, 177, 6, 362, 1606, 14385, 1250, 363, 137, 8, 662, 9, 177, 4, 6743, 918, 765, 1825, 9, 8016, 6, 2269, 8258, 4765, 354, 6, 10668, 3697, 6, 1855, 5622, 6, 19344, 6, 16069, 6, 1022, 11, 3581, 6, 1022, 11, 3360, 73, 700, 5867, 6, 28670, 181, 7381, 6, 1022, 11, 29928, 50, 38653, 10095, 4, 1437, 2784, 725, 1178, 35, 4146, 44681, 35, 2939, 964, 1606, 14385, 1250, 274, 725, 1178, 35, 3795, 19, 22, 23875, 36866, 2199, 60, 4252, 19, 485

In [124]:
# Every position gets token type 0.
token_type_ids = [0] * len(input_ids)
# Offsets: the question tokens (<cls>, feature, <sep>) get the placeholder
# (0, 0); the note tokens keep their precomputed character spans.
offsets = [(0, 0)] * n_question_tokens + precomputed["offsets"][text]
print("len of offsets:", len(offsets))
# Truncate like input_ids: at most max_len - 1 entries plus a (0, 0)
# placeholder for the closing <sep>.
offsets = offsets[: Config.max_len - 1] + [(0, 0)]
print("Offsets:", offsets)
print("len of offsets:", len(offsets))

len of offsets: 247
Offsets: [(0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 1), (1, 3), (3, 4), (5, 7), (7, 9), (10, 11), (12, 20), (21, 25), (26, 29), (29, 32), (32, 38), (38, 39), (40, 47), (48, 55), (56, 57), (57, 58), (58, 59), (60, 66), (67, 69), (70, 82), (83, 91), (92, 94), (95, 96), (96, 101), (102, 109), (109, 110), (110, 111), (111, 118), (119, 122), (123, 125), (126, 128), (129, 134), (134, 136), (137, 138), (139, 143), (144, 147), (148, 154), (155, 156), (157, 163), (164, 168), (169, 172), (173, 175), (176, 183), (183, 184), (185, 188), (189, 193), (194, 198), (199, 202), (203, 208), (209, 217), (218, 221), (222, 226), (227, 229), (230, 232), (233, 235), (236, 240), (241, 246), (247, 249), (250, 254), (255, 258), (259, 260), (260, 263), (264, 267), (268, 272), (273, 277), (277, 281), (281, 285), (285, 287), (288, 290), (291, 295), (296, 303), (304, 307), (307, 312), (313, 320), (321, 324),

In [136]:
# Padding
# Right-pad all three sequences so each has exactly Config.max_len entries.
padding_length = Config.max_len - len(input_ids)
print("Padding length:", padding_length)
if padding_length > 0:
    # Fill input_ids with the <pad> token id up to max_len.
    input_ids = input_ids + ([tokens["pad"]] * padding_length)
    # Padding positions get token type 0, like every other position.
    token_type_ids = token_type_ids + ([0] * padding_length)
    # Padding positions get the (0, 0) placeholder span.
    offsets = offsets + ([(0, 0)] * padding_length)

print("Input ids:", input_ids)
print("len of input_ids:", len(input_ids))


Padding length: 0
Input ids: [0, 27818, 750, 9, 10931, 50, 3664, 750, 9, 127, 43682, 2617, 4047, 271, 14970, 2, 725, 11337, 35, 601, 9839, 256, 6822, 19, 8750, 17291, 1635, 4, 27690, 690, 155, 12, 306, 377, 9, 33073, 7585, 9, 22, 12690, 4108, 73, 642, 10773, 66, 9, 127, 7050, 72, 132, 360, 536, 148, 10, 4191, 177, 56, 41, 3238, 6, 53, 42, 86, 56, 7050, 1164, 8, 1299, 25, 114, 37, 58, 164, 7, 1323, 66, 36, 24001, 45, 2217, 10146, 6514, 1825, 322, 1525, 1591, 3186, 253, 34225, 16754, 1606, 254, 1250, 6, 4212, 7, 892, 36, 134, 12, 246, 498, 228, 186, 322, 3224, 485, 4191, 177, 6, 362, 1606, 14385, 1250, 363, 137, 8, 662, 9, 177, 4, 6743, 918, 765, 1825, 9, 8016, 6, 2269, 8258, 4765, 354, 6, 10668, 3697, 6, 1855, 5622, 6, 19344, 6, 16069, 6, 1022, 11, 3581, 6, 1022, 11, 3360, 73, 700, 5867, 6, 28670, 181, 7381, 6, 1022, 11, 29928, 50, 38653, 10095, 4, 1437, 2784, 725, 1178, 35, 4146, 44681, 35, 2939, 964, 1606, 14385, 1250, 274, 725, 1178, 35, 3795, 19, 22, 23875, 36866, 2199, 60, 4252, 19

In [142]:
# Bundle the hand-built sequences under the same keys the tokenizer output uses.
encoding = {
        "input_ids": input_ids,
        "token_type_ids": token_type_ids,
        "offset_mapping": offsets,
    }
encoding

{'input_ids': [0,
  27818,
  750,
  9,
  10931,
  50,
  3664,
  750,
  9,
  127,
  43682,
  2617,
  4047,
  271,
  14970,
  2,
  725,
  11337,
  35,
  601,
  9839,
  256,
  6822,
  19,
  8750,
  17291,
  1635,
  4,
  27690,
  690,
  155,
  12,
  306,
  377,
  9,
  33073,
  7585,
  9,
  22,
  12690,
  4108,
  73,
  642,
  10773,
  66,
  9,
  127,
  7050,
  72,
  132,
  360,
  536,
  148,
  10,
  4191,
  177,
  56,
  41,
  3238,
  6,
  53,
  42,
  86,
  56,
  7050,
  1164,
  8,
  1299,
  25,
  114,
  37,
  58,
  164,
  7,
  1323,
  66,
  36,
  24001,
  45,
  2217,
  10146,
  6514,
  1825,
  322,
  1525,
  1591,
  3186,
  253,
  34225,
  16754,
  1606,
  254,
  1250,
  6,
  4212,
  7,
  892,
  36,
  134,
  12,
  246,
  498,
  228,
  186,
  322,
  3224,
  485,
  4191,
  177,
  6,
  362,
  1606,
  14385,
  1250,
  363,
  137,
  8,
  662,
  9,
  177,
  4,
  6743,
  918,
  765,
  1825,
  9,
  8016,
  6,
  2269,
  8258,
  4765,
  354,
  6,
  10668,
  3697,
  6,
  1855,
  5622,
  6,
  19344,
  

In [143]:
# Preview of one dataset item: ids and token types as long tensors, offsets
# as a numpy array, plus the raw note text.
{
    "ids": torch.tensor(encoding["input_ids"], dtype=torch.long),
    "token_type_ids": torch.tensor(encoding["token_type_ids"], dtype=torch.long),
    "offsets": np.array(encoding["offset_mapping"]),
    "text": text,
}

{'ids': tensor([    0, 27818,   750,     9, 10931,    50,  3664,   750,     9,   127,
         43682,  2617,  4047,   271, 14970,     2,   725, 11337,    35,   601,
          9839,   256,  6822,    19,  8750, 17291,  1635,     4, 27690,   690,
           155,    12,   306,   377,     9, 33073,  7585,     9,    22, 12690,
          4108,    73,   642, 10773,    66,     9,   127,  7050,    72,   132,
           360,   536,   148,    10,  4191,   177,    56,    41,  3238,     6,
            53,    42,    86,    56,  7050,  1164,     8,  1299,    25,   114,
            37,    58,   164,     7,  1323,    66,    36, 24001,    45,  2217,
         10146,  6514,  1825,   322,  1525,  1591,  3186,   253, 34225, 16754,
          1606,   254,  1250,     6,  4212,     7,   892,    36,   134,    12,
           246,   498,   228,   186,   322,  3224,   485,  4191,   177,     6,
           362,  1606, 14385,  1250,   363,   137,     8,   662,     9,   177,
             4,  6743,   918,   765,  1825,  

## Dataset

In [None]:
import torch
from torch.utils.data import Dataset

class PatientNoteDataset(Dataset):
    """Torch dataset that encodes (feature_text, clean_text) pairs on access.

    Args:
        df: DataFrame with ``clean_text``, ``feature_text`` and ``target``
            columns.
        tokenizer: Callable invoked with the feature text and the note text
            as a pair; its result must provide ``input_ids``,
            ``token_type_ids`` and ``offset_mapping``.
        max_len: Passed to the tokenizer as ``max_length``. Sequences are
            padded to this length and only the note text is truncated.
    """

    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.max_len = max_len
        self.tokenizer = tokenizer

        self.texts = df['clean_text'].values
        self.feature_text = df['feature_text'].values
        # Not read by __getitem__, which emits a constant placeholder target.
        self.char_targets = df['target'].values.tolist()

    def __getitem__(self, idx):
        """Encode the pair at ``idx``.

        Returns:
            dict with ``ids`` and ``token_type_ids`` (long tensors),
            ``target`` (float tensor holding a single 0), ``offsets``
            (numpy array built from the tokenizer's offset mapping) and
            ``text`` (the note text).
        """
        text = self.texts[idx]
        feature_text = self.feature_text[idx]

        # Feature text is the first segment and the note the second, so
        # "only_second" truncation never shortens the feature text.
        encoding = self.tokenizer(
            feature_text,
            text,
            return_token_type_ids=True,
            return_offsets_mapping=True,
            return_attention_mask=False,
            truncation="only_second",
            max_length=self.max_len,
            padding='max_length',
        )

        return {
            "ids": torch.tensor(encoding["input_ids"], dtype=torch.long),
            "token_type_ids": torch.tensor(encoding["token_type_ids"], dtype=torch.long),
            "target": torch.tensor([0], dtype=torch.float),
            "offsets": np.array(encoding["offset_mapping"]),
            "text": text,
        }

    def __len__(self):
        """Return the number of rows in the source frame."""
        return len(self.texts)

In [None]:
# Wrap the prepared test frame; every item is encoded to Config.max_len tokens.
dataset = PatientNoteDataset(
        df,
        tokenizer,
        max_len=Config.max_len,
    )