In [None]:
# Environment setup (shell escapes run in the notebook VM):
#   - install transformers pinned to 4.5.0 and the `datasets` package,
#   - clone the BART-MIMIC-CXR repository into /content,
#   - install the requirements of its seq2seq example scripts.
# NOTE(review): `datasets` is installed without a version pin — consider pinning it for reproducibility.
!pip install transformers==4.5.0
!pip install datasets
!git clone https://github.com/Shaumik-Ashraf/BART-MIMIC-CXR.git
!pip install -r /content/BART-MIMIC-CXR/transformers/seq2seq/requirements.txt
import torch
from transformers import BartModel, BartTokenizer, BartForConditionalGeneration, BartConfig
from transformers import Trainer, TrainingArguments
from transformers.models.bart.modeling_bart import shift_tokens_right
import csv
import numpy as np
import pandas as pd
import os

# Use the first CUDA GPU when torch reports one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

Collecting transformers==4.5.0
[?25l  Downloading https://files.pythonhosted.org/packages/81/91/61d69d58a1af1bd81d9ca9d62c90a6de3ab80d77f27c5df65d9a2c1f5626/transformers-4.5.0-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.2MB 11.0MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 38.9MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/08/cd/342e584ee544d044fb573ae697404ce22ede086c9e87ce5960772084cad0/sacremoses-0.0.44.tar.gz (862kB)
[K     |████████████████████████████████| 870kB 37.4MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.44-cp37-none-any.whl size=886084 sha256=582

In [None]:
# Mount Google Drive at /content/drive (this import only exists on Google Colab).
from google.colab import drive
drive.mount('/content/drive')
# Project folder on the mounted Drive.
# NOTE(review): the cells visible here repeat this path as a literal instead of using `directory`.
directory = '/content/drive/MyDrive/NLP in Health'

Mounted at /content/drive


In [None]:
def load_file(filename):
	"""
	Read a CSV file with pandas and hand its contents back as a NumPy array.

	param: filename - path to the csv file to read
	return: 2D numpy array holding every row/column of the csv (header row excluded)
	"""
	print(f"Loading data from {filename}...")
	frame = pd.read_csv(filename)
	print("Done.")
	# np.array(...) copies the DataFrame values into a plain ndarray.
	return np.array(frame)

def load_bart(model_name='facebook/bart-large-cnn', tokenizer_name='facebook/bart-large'):
	"""
	loads pretrained BART model and tokenizer

	params: model_name - pretrained BART huggingface transformer download path (or local
	                     checkpoint directory), default: facebook/bart-large-cnn
		    tokenizer_name - pretrained BART huggingface tokenizer download path (or local
		                     directory), default: facebook/bart-large
	return: (model, tokenizer)
	"""
	print(f"Loading pretrained model {model_name}...")
	# Honor the caller's choice: the previous version always loaded the default
	# checkpoint regardless of `model_name`, while still logging the requested name.
	model = BartForConditionalGeneration.from_pretrained(model_name)
	print("Done.")
	print(f"Loading pretrained tokenizer {tokenizer_name}...")
	tokenizer = BartTokenizer.from_pretrained(tokenizer_name)
	print("Done.")
	return (model, tokenizer)

def baseBart(article_to_summarize, model, tokenizer, max_input_length=1024, max_summary_length=25, num_beams=4):
	"""
	runs BART summarization

	params: article_to_summarize - text (string)
		    model - from load_bart()
		    tokenizer - from load_bart()
		    max_input_length - inputs longer than this many tokens are truncated, default: 1024
		    max_summary_length - max_length passed to model.generate(), default: 25
		    num_beams - beam search width passed to model.generate(), default: 4
	return: generated abstractive summary (string)
	"""
	# truncation=True makes the max_length cut-off explicit; without it the tokenizer
	# only truncates via a fallback and emits a warning on the first call.
	inputs = tokenizer(
		[article_to_summarize],
		max_length=max_input_length,
		truncation=True,
		return_tensors='pt',
	)
	# Move the inputs to wherever the model lives instead of relying on a notebook-level
	# `device` global, so the function works for CPU and GPU models alike.
	inputs = inputs.to(model.device)
	summary_ids = model.generate(
		inputs['input_ids'],
		num_beams=num_beams,
		max_length=max_summary_length,
		early_stopping=True,
	)
	# A single article was passed in, so only the first generated sequence is relevant.
	return tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)

def write_csv_row(opened_file, row, model, tokenizer):
	"""
	generates abstractive summary and writes it to a file in csv format, 1 summary per row

	params: opened_file - open File object, actually any IO stream implementing write() works
		    row - a list/array containing [<subject id>, <study id>, <text to summarize>, <ground truth summary>]
		    model - trained BART model
		    tokenizer - BART tokenizer
	returns: generated summary
	"""
	comp_summary = baseBart(row[2], model, tokenizer)
	# Let the csv module do the quoting: hand-built f-strings produced malformed rows
	# whenever a summary contained a double quote. QUOTE_ALL + "\n" keeps the same
	# layout as before (every field quoted, one row per line).
	writer = csv.writer(opened_file, quoting=csv.QUOTE_ALL, lineterminator="\n")
	writer.writerow([row[0], row[1], comp_summary, row[3]])
	return comp_summary

In [None]:
# Run configuration for the baseline summarization cell.
TEST_FILE = '/content/test.csv'
# Number of rows to summarize; -1 means "all rows".
LIMIT = -1
# The limit is embedded in the output name so runs with different limits do not overwrite each other.
SUMMARIES_FILE = f'/content/drive/MyDrive/NLP in Health/Summaries_{LIMIT}.csv'

In [None]:
# Baseline run: summarize the test set with the pretrained (not fine-tuned) BART model
# and write prediction/reference pairs to SUMMARIES_FILE.
print("==================== Start abstractive summarization ======================")

data = load_file(TEST_FILE)
model, tokenizer = load_bart()
model.to(device)

# LIMIT == -1 means "every row"; any other value keeps only the first LIMIT rows.
rows = data if LIMIT == -1 else data[:LIMIT]
total = len(rows)

# How often to report progress. max(1, ...) guards against a zero step, which used to
# raise ZeroDivisionError for very small limits (e.g. LIMIT = 3).
if LIMIT == -1:
	progress_every = 1000
elif LIMIT < 100:
	progress_every = max(1, LIMIT // 4)
else:
	progress_every = max(1, LIMIT // 8)

print(f"Writing {os.path.basename(SUMMARIES_FILE)}...")
# The context manager closes (and flushes) the file even if summarization fails midway.
with open(SUMMARIES_FILE, 'w') as f:
	f.write(f"\"subject_id\",\"study_id\",\"prediction\",\"actual\"\n")
	for i, row in enumerate(rows):
		write_csv_row(f, row, model, tokenizer)
		# Report at every step boundary and once more after the final row, so the
		# last message always shows the true total.
		if (i % progress_every == 0) or (i + 1 == total):
			print(f"Computed {i+1} summaries")

print("Done.\n")
print("==================== End abstractive summarization ======================")

Loading data from /content/test.csv...
Done.
Loading pretrained model facebook/bart-large-cnn...
Done.
Loading pretrained tokenizer facebook/bart-large...
Done.


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Writing Summaries_-1.csv...
Computed 1 summaries
Computed 1001 summaries
Computed 2001 summaries
Computed 3001 summaries
Computed 4001 summaries
Computed 5001 summaries
Computed 6001 summaries
Computed 7001 summaries
Computed 8001 summaries
Computed 9001 summaries
Computed 10001 summaries
Computed 11001 summaries
Computed 12001 summaries
Done.



In [None]:
# Run the cloned repo's run_summarization.py on the *_small CSV splits with training,
# evaluation and prediction enabled (--do_train/--do_eval/--do_predict), reading text from
# the `findings` column and summaries from `impression`; results go to ./output_small.
!python /content/BART-MIMIC-CXR/transformers/seq2seq/run_summarization.py --model_name facebook/bart-large-cnn --tokenizer_name facebook/bart-large --output_dir output_small --train_file /content/BART-MIMIC-CXR/data/train_small.csv --validation_file /content/BART-MIMIC-CXR/data/val_small.csv --test_file /content/BART-MIMIC-CXR/data/test_small.csv --text_column findings --summary_column impression --max_source_length 1024 --max_target_length 25 --num_beams 4 --do_train --do_eval --do_predict --predict_with_generate

2021-04-12 04:20:54.330449: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
04/12/2021 04:20:55 - INFO - __main__ -   Training/evaluation parameters Seq2SeqTrainingArguments(output_dir='output_small', overwrite_output_dir=False, do_train=True, do_eval=True, do_predict=True, evaluation_strategy=<IntervalStrategy.NO: 'no'>, prediction_loss_only=False, per_device_train_batch_size=8, per_device_eval_batch_size=8, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=5e-05, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=<SchedulerType.LINEAR: 'linear'>, warmup_ratio=0.0, warmup_steps=0, logging_dir='runs/Apr12_04-20-55_f15633b10ab8', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=False, logging_steps=500, save_strategy=<I

In [None]:
# Same run_summarization.py invocation as the small run, but on the full
# train/validation/test CSVs, with the output directory on the mounted Drive.
!python /content/BART-MIMIC-CXR/transformers/seq2seq/run_summarization.py --model_name facebook/bart-large-cnn --tokenizer_name facebook/bart-large --output_dir '/content/drive/MyDrive/NLP in Health/Output' --train_file '/content/BART-MIMIC-CXR/data/train.csv' --validation_file '/content/BART-MIMIC-CXR/data/validation.csv' --test_file '/content/BART-MIMIC-CXR/data/test.csv' --text_column findings --summary_column impression --max_source_length 1024 --max_target_length 25 --num_beams 4 --do_train --do_eval --do_predict --predict_with_generate

2021-04-12 04:25:46.402949: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
04/12/2021 04:25:47 - INFO - __main__ -   Training/evaluation parameters Seq2SeqTrainingArguments(output_dir='/content/drive/MyDrive/NLP in Health/Output', overwrite_output_dir=False, do_train=True, do_eval=True, do_predict=True, evaluation_strategy=<IntervalStrategy.NO: 'no'>, prediction_loss_only=False, per_device_train_batch_size=8, per_device_eval_batch_size=8, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=5e-05, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=<SchedulerType.LINEAR: 'linear'>, warmup_ratio=0.0, warmup_steps=0, logging_dir='runs/Apr12_04-25-47_f15633b10ab8', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=False, logg

In [None]:
import regex as re

# Post-process the generated summaries: drop the dangling, unfinished fragment that
# follows the last complete sentence of each line.
# NOTE(review): test_generations.txt is presumably written by the run_summarization.py
# run whose --output_dir is this Output folder — confirm.
summary_file = '/content/drive/MyDrive/NLP in Health/Output/test_generations.txt'
truncated_file = '/content/drive/MyDrive/NLP in Health/Output/truncated_generations.txt'

# Matches everything after the last period of a line, but only when that period directly
# follows a letter (so a trailing "1.5 cm" is left untouched). Written as a raw string:
# "\." in a normal string literal is an invalid escape sequence.
trailing_fragment = re.compile(r"(?<=[a-zA-Z]\.)[^.]*$")

with open(summary_file, 'r', encoding='utf-8') as f:
    summaries = [line.strip() for line in f]

# Lines without a matching period are kept unchanged, so the line count is preserved.
truncated_summaries = [trailing_fragment.sub("", text) for text in summaries]

with open(truncated_file, "w", encoding='utf-8') as f:
    f.writelines(f"{item}\n" for item in truncated_summaries)


In [None]:
# Sanity check: truncation should neither add nor drop lines, so both counts should match.
print(len(summaries), len(truncated_summaries), sep="\n")

12069
12069
