In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, one full path per line.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
# Install the third-party packages used later in this notebook (e.g. rouge_score,
# transformers and datasets are imported by the preprocessing cell).
# NOTE(review): no versions are pinned, so a re-run picks up whatever release is
# current at that time -- consider pinning for reproducibility.
!pip install rouge-score py7zr
!pip install transformers
!pip install datasets
!pip install tokenizers
!pip install evaluate
!pip install --upgrade accelerate

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting py7zr
  Downloading py7zr-0.20.5-py3-none-any.whl (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.4/66.4 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting pycryptodomex>=3.6.6 (from py7zr)
  Downloading pycryptodomex-3.18.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting pyzstd>=0.14.4 (from py7zr)
  Downloading pyzstd-0.15.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (412 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m412.3/412.3 kB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyppmd<1.1.0,>=0.18.1 (from py7zr)
  Downloading pyppmd-1.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (138 

In [2]:
# Download NLTK's 'punkt' data in a separate Python process.
# Presumably required by nltk's `sent_tokenize`, which the preprocessing cell calls.
!python -c "import nltk; nltk.download('punkt');"

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Copy everything from the attached `season-samsum` input dataset into the working
# directory (-a: archive mode, -v: print each file copied) so that `run.py` can be
# invoked from there later.
%cp -av /kaggle/input/season-samsum/* /kaggle/working/

'/kaggle/input/season-samsum/model.py' -> '/kaggle/working/model.py'
'/kaggle/input/season-samsum/run.py' -> '/kaggle/working/run.py'
'/kaggle/input/season-samsum/trainer.py' -> '/kaggle/working/trainer.py'


In [4]:
mkdir data

In [5]:
import json
from typing import Any
from functools import partial

import datasets
from transformers import PreTrainedTokenizerBase
from nltk import sent_tokenize
from rouge_score import rouge_scorer
from transformers import AutoTokenizer

# Load a subset of SAMSum: the first 5000 training, 625 validation and 625 test examples.
split_slices = {
    'train': "train[:5000]",
    'validation': "validation[:625]",
    'test': "test[:625]",
}
dataset = datasets.DatasetDict(
    dict(zip(split_slices, datasets.load_dataset('samsum', split=list(split_slices.values()))))
)

# Names of the source/target text columns and the token-length limits applied to them.
src_text_column_name = "dialogue"
tgt_text_column_name = "summary"
max_source_length = 512
max_target_length = 128

# Number of worker processes passed to `dataset.map` below.
n_proc = 40

# Since bos_token is used as the beginning of the target sequence,
# we use mask_token to mark the beginning of each source sentence.
# `bosent_token_id` must be the id the tokenizer assigns to `bosent_token`, because
# the feature extraction locates sentence starts by comparing token ids against it.
bosent_token = "<mask>"
bosent_token_id = 50264

# NOTE: this rebinds the imported `rouge_scorer` module name to a scorer instance.
rouge_scorer = rouge_scorer.RougeScorer(['rougeLsum'], use_stemmer=True)


def convert_to_features(
        examples: Any,
        tokenizer: PreTrainedTokenizerBase,
        padding: str,
        max_source_length: int,
        max_target_length: int,
        src_text_column_name: str,
        tgt_text_column_name: str,
):
    """Tokenize a batch of source/target texts and attach per-token sentence features.

    Every source text is split into sentences with ``sent_tokenize``. Each sentence
    is scored against the full target text (ROUGE-Lsum F-measure), and the sentences
    are re-joined with ``bosent_token`` so that, after tokenization, every sentence
    starts with a ``bosent_token_id`` marker token.

    Relies on the module-level ``bosent_token``, ``bosent_token_id`` and
    ``rouge_scorer`` objects.

    Args:
        examples: Batch mapping column names to lists of values.
        tokenizer: Tokenizer applied to both source and target texts.
        padding: Padding strategy forwarded to the tokenizer.
        max_source_length: Truncation length for tokenized sources.
        max_target_length: Truncation length for tokenized targets.
        src_text_column_name: Key of the source texts in ``examples``.
        tgt_text_column_name: Key of the target texts in ``examples``.

    Returns:
        The tokenizer output for the sources, extended with:

        * ``info_distribution``: per token, the ROUGE score of the sentence the
          token belongs to; ``-1`` for eos/pad tokens.
        * ``sentence_bos_index``: per token, the position of the first token of
          its sentence; ``0`` for eos/pad tokens.
        * ``sent_id``: per token, the 1-based sentence number; ``0`` for eos/pad
          tokens.
        * ``labels``: token ids of the target texts.

        Rows whose source or target is ``None`` are skipped, so the output can
        contain fewer rows than the input batch.
    """
    inputs, targets = [], []
    all_sent_rouge_scores = []
    for source, target in zip(examples[src_text_column_name], examples[tgt_text_column_name]):
        if source is None or target is None:
            continue
        input_sentences = sent_tokenize(source)
        target_text = target.strip()
        # Saliency of each source sentence = its ROUGE-Lsum F-measure against the summary.
        all_sent_rouge_scores.append([
            rouge_scorer.score(target_text, sent)['rougeLsum'].fmeasure
            for sent in input_sentences
        ])
        # TODO: joining with the literal marker string is unsafe if the text itself contains it.
        inputs.append(bosent_token.join(input_sentences))
        targets.append(target_text.replace('\n', ' ').replace('  ', ' '))
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)

    # Replace bos_token_id at the beginning of each document with bosent_token_id,
    # so the first sentence also starts with a marker.
    for token_ids in model_inputs['input_ids']:
        token_ids[0] = bosent_token_id

    # Assign a 0-based sentence id to every token; eos/pad tokens get -1.
    non_sentence_ids = (tokenizer.eos_token_id, tokenizer.pad_token_id)
    all_token_sent_id = []
    for token_ids in model_inputs['input_ids']:
        sid = -1
        token_sent_id = []
        for tid in token_ids:
            if tid == bosent_token_id:
                sid += 1
            if tid in non_sentence_ids:
                sid = -1
            token_sent_id.append(sid)
        all_token_sent_id.append(token_sent_id)

    all_token_info_dist = []
    all_sent_bos_idx = []
    for token_sent_id, sent_rouge_scores in zip(all_token_sent_id, all_sent_rouge_scores):
        n_sents = max(token_sent_id) + 1
        # Drop scores of sentences that were cut off by truncation ...
        sent_rouge_scores = sent_rouge_scores[:n_sents]
        # ... and default any missing score to 0.0. Position 0 always carries a marker,
        # so e.g. an empty source text yields one sentence id but no ROUGE score;
        # without this padding the lookup below raised IndexError.
        sent_rouge_scores = sent_rouge_scores + [0.0] * (n_sents - len(sent_rouge_scores))

        # Count the tokens of every sentence (and the eos/pad tokens) in a single pass.
        tokens_per_sent = [0] * n_sents
        n_outside = 0
        for sid in token_sent_id:
            if sid < 0:
                n_outside += 1
            else:
                tokens_per_sent[sid] += 1

        token_info_dist = []
        sent_bos_idx = []
        bos_idx = 0  # position of the current sentence's first token
        for sent_score, tnum in zip(sent_rouge_scores, tokens_per_sent):
            token_info_dist.extend([sent_score] * tnum)
            sent_bos_idx.extend([bos_idx] * tnum)
            bos_idx += tnum
        # eos/pad tokens are emitted last, after all sentence tokens.
        token_info_dist.extend([-1] * n_outside)
        sent_bos_idx.extend([0] * n_outside)
        all_token_info_dist.append(token_info_dist)
        all_sent_bos_idx.append(sent_bos_idx)

    model_inputs['info_distribution'] = all_token_info_dist
    model_inputs['sentence_bos_index'] = all_sent_bos_idx
    # Shift ids by one so that 0 means "no sentence" and real sentences start at 1.
    model_inputs['sent_id'] = [[sid + 1 for sid in token_sent_id] for token_sent_id in all_token_sent_id]

    # Tokenize the targets. `text_target=` replaces the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


# Tokenizer used to turn both dialogues and summaries into token ids.
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

# Bind every argument except the batch itself, leaving a one-argument callable for `map`.
feature_kwargs = dict(
    tokenizer=tokenizer,
    padding='max_length',
    max_source_length=max_source_length,
    max_target_length=max_target_length,
    src_text_column_name=src_text_column_name,
    tgt_text_column_name=tgt_text_column_name,
)
convert_to_features = partial(convert_to_features, **feature_kwargs)

# Featurize all splits in batches, spread over `n_proc` worker processes.
dataset = dataset.map(convert_to_features, batched=True, num_proc=n_proc)

# Only these columns are returned when iterating over the dataset below.
cols_to_keep = ["input_ids", "attention_mask", "labels", "info_distribution", "sentence_bos_index", "sent_id"]
dataset.set_format(columns=cols_to_keep)

# Write each split as JSON Lines: one example per line in data/<split>.json.
for split in ('train', 'validation', 'test'):
    with open(f'data/{split}.json', 'w') as outfile:
        outfile.writelines(json.dumps(example) + '\n' for example in dataset[split])




Downloading builder script:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/770 [00:00<?, ?B/s]

Downloading and preparing dataset samsum/samsum (download: 2.81 MiB, generated: 10.04 MiB, post-processed: Unknown size, total: 12.85 MiB) to /root/.cache/huggingface/datasets/samsum/samsum/0.0.0/3f7dba43be72ab10ca66a2e0f8547b3590e96c2bd9f2cbb1f6bb1ec1f1488ba6...


Downloading data: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/14732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

Dataset samsum downloaded and prepared to /root/.cache/huggingface/datasets/samsum/samsum/0.0.0/3f7dba43be72ab10ca66a2e0f8547b3590e96c2bd9f2cbb1f6bb1ec1f1488ba6. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.51k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

                                         

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#8:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#9:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#10:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#11:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#12:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#13:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#14:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#15:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#16:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#17:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#18:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#19:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#20:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#21:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#22:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#23:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#24:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#25:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#26:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#27:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#28:   0%|          | 0/1 [00:00<?, ?ba/s]

 



#29:   0%|          | 0/1 [00:00<?, ?ba/s]

 



#30:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#31:   0%|          | 0/1 [00:00<?, ?ba/s]

 



#32:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#33:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#34:   0%|          | 0/1 [00:00<?, ?ba/s]



 

#35:   0%|          | 0/1 [00:00<?, ?ba/s]



 

#36:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#37:   0%|          | 0/1 [00:00<?, ?ba/s]



 



#38:   0%|          | 0/1 [00:00<?, ?ba/s]



 

#39:   0%|          | 0/1 [00:00<?, ?ba/s]



                                          

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/1 [00:00<?, ?ba/s]



 



#5:   0%|          | 0/1 [00:00<?, ?ba/s]

 



#6:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#7:   0%|          | 0/1 [00:00<?, ?ba/s]



  



#9:   0%|          | 0/1 [00:00<?, ?ba/s]

#8:   0%|          | 0/1 [00:00<?, ?ba/s]

 



#10:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#11:   0%|          | 0/1 [00:00<?, ?ba/s]



  

#12:   0%|          | 0/1 [00:00<?, ?ba/s]

#13:   0%|          | 0/1 [00:00<?, ?ba/s]

 



#14:   0%|          | 0/1 [00:00<?, ?ba/s]



 

#15:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#16:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#17:   0%|          | 0/1 [00:00<?, ?ba/s]



#18:   0%|          | 0/1 [00:00<?, ?ba/s]



 

#19:   0%|          | 0/1 [00:00<?, ?ba/s]

 



#20:   0%|          | 0/1 [00:00<?, ?ba/s]



 



#21:   0%|          | 0/1 [00:00<?, ?ba/s]

 



#22:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#23:   0%|          | 0/1 [00:00<?, ?ba/s]



 

#24:   0%|          | 0/1 [00:00<?, ?ba/s]



  

#26:   0%|          | 0/1 [00:00<?, ?ba/s]

#25:   0%|          | 0/1 [00:00<?, ?ba/s]



 

#27:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#28:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#29:   0%|          | 0/1 [00:00<?, ?ba/s]

 



#30:   0%|          | 0/1 [00:00<?, ?ba/s]

 



#31:   0%|          | 0/1 [00:00<?, ?ba/s]



 

#32:   0%|          | 0/1 [00:00<?, ?ba/s]



 

#33:   0%|          | 0/1 [00:00<?, ?ba/s]

 



#34:   0%|          | 0/1 [00:00<?, ?ba/s]



 

#35:   0%|          | 0/1 [00:00<?, ?ba/s]

 



#36:   0%|          | 0/1 [00:00<?, ?ba/s]



 



#37:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#38:   0%|          | 0/1 [00:00<?, ?ba/s]

 



#39:   0%|          | 0/1 [00:00<?, ?ba/s]



                                         

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

 



#4:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

 



#6:   0%|          | 0/1 [00:00<?, ?ba/s]

 



#7:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#8:   0%|          | 0/1 [00:00<?, ?ba/s]



 

#9:   0%|          | 0/1 [00:00<?, ?ba/s]



 

#10:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#11:   0%|          | 0/1 [00:00<?, ?ba/s]

 



#12:   0%|          | 0/1 [00:00<?, ?ba/s]

 



#13:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#14:   0%|          | 0/1 [00:00<?, ?ba/s]



 

#15:   0%|          | 0/1 [00:00<?, ?ba/s]



 

#16:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#17:   0%|          | 0/1 [00:00<?, ?ba/s]

#18:   0%|          | 0/1 [00:00<?, ?ba/s]



 



#19:   0%|          | 0/1 [00:00<?, ?ba/s]



 

#20:   0%|          | 0/1 [00:00<?, ?ba/s]



 

#21:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#22:   0%|          | 0/1 [00:00<?, ?ba/s]



 

#23:   0%|          | 0/1 [00:00<?, ?ba/s]

 



 



#24:   0%|          | 0/1 [00:00<?, ?ba/s]



#25:   0%|          | 0/1 [00:00<?, ?ba/s]

 



#26:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#27:   0%|          | 0/1 [00:00<?, ?ba/s]

 



#28:   0%|          | 0/1 [00:00<?, ?ba/s]



 



#29:   0%|          | 0/1 [00:00<?, ?ba/s]



  

#31:   0%|          | 0/1 [00:00<?, ?ba/s]

#30:   0%|          | 0/1 [00:00<?, ?ba/s]



 



#32:   0%|          | 0/1 [00:00<?, ?ba/s]

 



#33:   0%|          | 0/1 [00:00<?, ?ba/s]



  

#35:   0%|          | 0/1 [00:00<?, ?ba/s]

#34:   0%|          | 0/1 [00:00<?, ?ba/s]

 



#36:   0%|          | 0/1 [00:00<?, ?ba/s]



 



#37:   0%|          | 0/1 [00:00<?, ?ba/s]



 

#38:   0%|          | 0/1 [00:00<?, ?ba/s]



 

#39:   0%|          | 0/1 [00:00<?, ?ba/s]



In [6]:
!python run.py \
    --model_name_or_path facebook/bart-large-cnn \
    --do_train \
    --do_predict \
    --train_file data/train.json \
    --validation_file data/validation.json \
    --test_file data/test.json \
    --output_dir outputs/train \
    --per_device_train_batch_size 4 \
    --gradient_accumulation_steps 2 \
    --per_device_eval_batch_size 4 \
    --num_train_epochs 10 \
    --learning_rate 1.3214238628872776e-05 \
    --warmup_steps 1500 \
    --weight_decay 0.01 \
    --max_grad_norm 0.1 \
    --metric_for_best_model rougeLsum \
    --evaluation_strategy epoch \
    --save_strategy epoch \
    --save_total_limit 2\
    --load_best_model_at_end True\
    --fp16 true \
    --bosent_token_id 50264 \
    --encoder_loss_ratio 1.0 \
    --encoder_label_smoothing 0.1 \
    --encoder_label_smoothing_type adjacent \
    --lower_saliency_threshold 0.125 \
    --higher_saliency_threshold 0.230 \
    --marginal_distribution true \
    --marginal_temperature 0.5 \
    --num_beams 5 \
    --max_length 128 \
    --min_length 20 \
    --length_penalty 1.5 \
    --no_repeat_ngram_size 3 \
    --overwrite_output_dir \
    --predict_with_generate


caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-70fba48d2b96861b/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...
Downloading data files: 100%|███████████████████| 3/3 [00:00<00:00, 9078.58it/s]
Extracting data files: 100%|████████████████████| 3/3 [00:00<00:00, 1441.18it/s]
Dataset json downloaded and prepared to /root/.cache/huggingface/d

In [1]:
%mkdir season_samsum

mkdir: cannot create directory ‘season_samsum’: File exists


In [3]:
# Copy the model weights and config from the training output directory
# (outputs/train, the --output_dir of the training run) into the export directory.
%cp -av /kaggle/working/outputs/train/pytorch_model.bin season_samsum
%cp -av /kaggle/working/outputs/train/config.json season_samsum

'/kaggle/working/outputs/train/pytorch_model.bin' -> 'season_samsum/pytorch_model.bin'
'/kaggle/working/outputs/train/config.json' -> 'season_samsum/config.json'


In [4]:
# Bundle the export directory (recursively) into a single archive for download.
!zip -r season_samsum.zip season_samsum

  adding: season_samsum/ (stored 0%)
  adding: season_samsum/config.json (deflated 61%)
  adding: season_samsum/pytorch_model.bin (deflated 7%)


In [5]:
# Display a link to the archive in the cell output so it can be downloaded.
from IPython.display import FileLink
FileLink(r'season_samsum.zip')
