In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules before
# each cell runs, so edits to the project's .py files are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [None]:
cd ../src

In [None]:
import os
import ast
import json
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from collections import Counter
from tqdm.notebook import tqdm

# Set TOKENIZERS_PARALLELISM to "false" for this process before any tokenizer is created.
# Presumably this silences the HuggingFace `tokenizers` fork/parallelism warning when
# used together with multi-worker data loading — confirm if the setting is still needed.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
from data.preparation import *
from data.dataset import *
from data.processing import *
from data.tokenization import *
from params import *
from training.main import k_fold
from models import NERTransformer

from utils.plot import plot_annotation
from utils.logger import prepare_log_folder, save_config, create_logger

### Save configs

In [None]:
from data.tokenization import *
from transformers import AutoConfig
import torch

# Checkpoints whose tokenizer and config should be exported under OUT_PATH.
# Uncomment entries to (re)generate them; with every entry commented out the loop does nothing.
names_to_export = [
#     "roberta-base",
#     "roberta-large",
#     "microsoft/deberta-base",
#     "microsoft/deberta-large",
#     "microsoft/deberta-v3-large",
#     "microsoft/deberta-xlarge",
#     "google/electra-large-discriminator"
]

for name in tqdm(names_to_export):
    # The output folder is named after the last path component of the checkpoint,
    # e.g. "microsoft/deberta-base" -> "deberta-base/".
    model_dir = OUT_PATH + name.split("/")[-1] + "/"

    # Tokenizer files go to <model_dir>/tokenizers/.
    tokenizer = get_tokenizer(name)
    tokenizer.save_pretrained(model_dir + "tokenizers/")

    # The config object (with hidden states enabled) is serialized with torch.save.
    config = AutoConfig.from_pretrained(name, output_hidden_states=True)
    torch.save(config, model_dir + "config.pth")

In [None]:
%%time
# Build the pseudo-label frame from the pl_6 CSV through the project helper
# `load_and_prepare_pretrain` (its processing steps are not visible in this notebook).
# NOTE(review): `df` is reassigned by a plain `pd.read_csv` of the same file in the
# next cell, so only one of the two loading cells is needed — confirm which.
df = load_and_prepare_pretrain('../output/pl_6/df_pl.csv', root=DATA_PATH)

In [None]:
%%time
# Raw read of the pl_6 pseudo-label CSV; replaces whatever `df` held before this cell.
df = pd.read_csv('../output/pl_6/df_pl.csv')

In [None]:
%%time
# Load the probability array saved next to the CSV.
# Presumably one row per row of df_pl.csv, padded to a fixed width — TODO confirm.
probs = np.load('../output/pl_6/probs.npy')

In [None]:
%%time
# Attach each row of `probs` to its record as the `soft_target` column.
# `list(probs)` yields one NumPy row view per record; the previous `probs.tolist()`
# boxed every single probability into a Python float, which is very slow and
# memory-hungry for a large matrix. Downstream slicing / `np.array(...)` works the
# same on array rows as on nested lists. Requires len(probs) == len(df).
df['soft_target'] = list(probs)

In [None]:
%%time
# Cut every probability vector down to the character length of its note and
# store the result as a NumPy array (one array per row).
trimmed_targets = [
    np.array(scores[:len(note)])
    for scores, note in zip(df['soft_target'], df['pn_history'])
]
df['soft_target'] = trimmed_targets

In [None]:
# Register `progress_apply` on pandas objects so the mapping below shows a progress bar.
tqdm.pandas()

# Hard targets: True wherever the soft target is strictly above 0.5.
df['target'] = df['soft_target'].progress_apply(
    lambda scores: np.greater(np.array(scores), 0.5)
)

In [None]:
# Small inspection subset: rows whose pn_num is below 10.
# reset_index() (without drop) keeps the previous index as an `index` column.
low_pn_mask = df['pn_num'].lt(10)
df_ = df.loc[low_pn_mask].reset_index()

# Convert each boolean character target into spans with the project helper.
df_['span'] = df_['target'].progress_apply(char_target_to_span)

In [None]:
# Render the annotations of the subset for the value 4.
# NOTE(review): whether 4 is a row position or a pn_num depends on `plot_annotation`,
# which is not visible here — confirm against utils.plot.
plot_annotation(df_, 4)

In [None]:
# df = pd.read_csv('../output/pl_5/df_pl.csv')
# probs = np.zeros((len(df), 950))

# for i in range(len(df)):
#     if i % 25000 == 0:
#         print(i)
#     x = np.array(ast.literal_eval(re.sub('\n', ',', df['probs'][i])))[:, 0]
#     probs[i, :len(x)] = x
    
# np.save('../output/pl_5/probs.npy', probs)
# df[['id', 'case_num', 'pn_num', 'feature_num', 'feature_text', 'pn_history',
#        'ft_ref', 'text_ref']].to_csv('../output/pl_5/df_pl_.csv')

## Data

In [None]:
# Load the raw patient notes and record the character count of every note.
df = pd.read_csv(DATA_PATH + "patient_notes.csv")
df['len'] = df['pn_history'].map(len)

# Number of notes loaded.
df.shape[0]

In [None]:
# Keep only the notes that are exactly 950 characters long and renumber the rows.
# (950 looks like the maximum note length — TODO confirm.)
is_full_length = df['len'].eq(950)
df = df.loc[is_full_length].reset_index(drop=True)

# Number of notes that remain.
df.shape[0]

In [None]:
# Per case: the largest pn_num in the group, reduced modulo 10000.
pn_by_case = df.groupby('case_num')['pn_num']
dfgm = pn_by_case.agg(lambda nums: np.max(nums.to_numpy()) % 10000)

In [None]:
# Per case: how many rows (notes) belong to it.
dfgl = df.groupby('case_num')['pn_num'].agg(lambda nums: nums.shape[0])

In [None]:
# Display the note text stored under index label 2.
df.loc[2, 'pn_history']

In [None]:
# Strip leading/trailing whitespace from each note, then pass it through the
# project's `clean_spaces` helper.
df['clean_text'] = df['pn_history'].map(lambda note: clean_spaces(note.strip()))

In [None]:
# Rows where the first character of the cleaned text differs from the first
# character of the raw note.
first_char_changed = np.array(
    [cleaned[0] != raw[0] for cleaned, raw in zip(df["clean_text"], df["pn_history"])],
    dtype=bool,
)
dfc = df[first_char_changed]

In [None]:
# Reload `df` through the project helper `load_and_prepare`, replacing the
# patient-notes frame used by the cells above. Later cells read a `target` column from
# this frame; the helper's other outputs are not visible in this notebook.
df = load_and_prepare(root=DATA_PATH)

In [None]:
# Keep the first row for every distinct note text; later repeats are discarded.
# The original index labels are kept (no reset here).
is_repeat = df.duplicated(subset='pn_history', keep='first')
df = df[~is_repeat]

In [None]:
# Character length of every remaining note.
df['len'] = df['pn_history'].map(len)

# df = df[df['len'] == 950].reset_index(drop=True)

In [None]:
# Recompute the note lengths, then count the distinct notes that are exactly
# 950 characters long.
df['len'] = df['pn_history'].map(len)
full_length_notes = df.loc[df['len'] == 950, 'pn_history']
full_length_notes.nunique(dropna=False)

In [None]:
# The 50 most frequent note lengths with their counts.
df['len'].value_counts().head(50)

In [None]:
# Notes that change when stripped, i.e. that start or end with whitespace.
stripped_notes = df['pn_history'].map(lambda note: note.strip())
dfd = df[df['pn_history'].ne(stripped_notes)]

In [None]:
# Of those, keep rows where any of the last five target positions is positive,
# then renumber the rows and store the note length.
hits_tail = dfd['target'].map(lambda tgt: tgt[-5:].max() > 0)
dfd = dfd[hits_tail].reset_index(drop=True)
dfd['len'] = dfd['pn_history'].map(len)


In [None]:
# Display `dfd` with a fresh 0..n-1 index. The result is not assigned, so `dfd` itself
# is unchanged by this cell.
dfd.reset_index(drop=True)

In [None]:
# Load the fold table stored under OUT_PATH (its columns are not visible in this notebook).
folds = pd.read_csv(OUT_PATH + "folds.csv")

In [None]:
# Checkpoint identifier handed to `get_tokenizer` in the next cell.
name = "microsoft/deberta-v3-base"

In [None]:
# Build the tokenizer for `name` with the project helper. The meaning of `precompute`,
# `df` and `add_special_tokens` is defined in data.tokenization (not visible here).
tokenizer = get_tokenizer(name, precompute=True, df=df, add_special_tokens=True)

In [None]:
# Sanity check: tokenize a sample feature phrase and display the resulting encoding.
tokenizer('family history of mi or family history of myocardial infarction')

In [None]:
# Recompute the note lengths and keep only the notes that are exactly 950 characters long.
df['len'] = df['pn_history'].apply(len)

# drop=True discards the old index instead of inserting it as an `index` column.
# Without it, re-running this cell fails with "cannot insert index, already exists",
# and the frame carries a stray column. This matches the earlier 950-character
# filter cell, which also uses drop=True.
df = df[df['len'] == 950].reset_index(drop=True)

In [None]:
# Plot the annotations for every distinct pn_num, separated by an empty line.
for pn_num in df['pn_num'].unique():
    plot_annotation(df, pn_num)
    print()

In [None]:
# lens = []

# for i in tqdm(range(len(df))):
#     lens.append(
#         len(tokenizer(
#             clean_spaces(df['feature_text'][i].lower()),
#             clean_spaces(df['pn_history'][i].lower()),
#         )['input_ids'])
#     )
# np.max(lens)

In [None]:
# Wrap the frame and tokenizer in the project's dataset class.
# max_len=310 is presumably the maximum token length per sample — TODO confirm in data.dataset.
dataset = PatientNoteDataset(df, tokenizer, max_len=310)

In [None]:
# Distribution of the number of whitespace-separated words per note.
word_counts = df['pn_history'].map(lambda note: len(note.split()))
sns.displot(word_counts)

In [None]:
# from data.loader import define_loaders

# train_loader, val_loader = define_loaders(dataset, dataset, val_bs=2)
# for batch in val_loader:
#     print(batch['offsets'], batch['offsets'].size())
#     break

In [None]:
# for i in tqdm(range(len(dataset))):
#     data = dataset[i]
    
#     assert len(data['text']) == np.max(data['offsets']), i

# Model

In [None]:
# Instantiate the project's NERTransformer for the "roberta-base" checkpoint.
# NOTE(review): the tokenizer built earlier in this notebook uses
# "microsoft/deberta-v3-base" — confirm the mismatch is intentional.
model = NERTransformer("roberta-base")

In [None]:
# Display the model's repr as the cell output.
model