In [1]:
# Install the notebook's third-party dependencies into the kernel environment.
# Only `datasets` is version-pinned; the protobuf pin below is disabled.
#%pip install protobuf==3.20.1
%pip install transformers[torch]
# -q keeps pip quiet for this package.
%pip install -q sentencepiece
%pip install datasets==2.13.1
# `evaluate` is used later to load metrics by name; `rouge_score` is presumably
# the backend needed by its 'rouge' metric (TODO confirm).
%pip install evaluate
%pip install rouge_score



In [2]:
# Project sub-path that is appended to the Google Drive root when the notebook
# changes its working directory on Colab (see the next cell).
QPATH = "Quantlet/code_description2project_description"

In [3]:
import os
import sys

# True when the `google.colab` module is already loaded in this kernel.
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    # On Colab, work from this project's folder on the mounted Drive so that the
    # relative paths used below resolve.
    os.chdir(f'/content/drive/MyDrive/ColabNotebooks/IRTG/Encode_the_Qode/Encode-the-Qode/{QPATH}')

# Make modules under ../src importable (preprocessing_utils is imported later).
sys.path.append('../src')

In [27]:
import pickle
import json
import re
import sys
from IPython.display import display

from tqdm import tqdm
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
import seaborn as sns

import torch
from torch.utils.data import  DataLoader
from transformers import AutoTokenizer, AutoModelWithLMHead, SummarizationPipeline
from transformers import AdamW
from datasets import load_dataset

from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)

import nltk
nltk.download('punkt')
import evaluate

import importlib
import preprocessing_utils
importlib.reload(preprocessing_utils)

from sklearn.model_selection import train_test_split

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [5]:
# Load the pre-parsed Quantlet DataFrame.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
with open('../../data/preprocessed/Quantlet/Parsed_Qs_with_code_25062023.pkl', 'rb') as pkl_handle:
    df = pickle.load(pkl_handle)

CLEAN_UP = True

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [6]:
# Drop the rows whose `metainfo_file` is the placeholder string 'empty'.
has_metainfo = df['metainfo_file'] != 'empty'
df = df.loc[has_metainfo]
print(df.shape)

(4856, 6)


In [7]:
def parse_meta(row):
    """Split a metainfo mapping into name, description, keywords and the rest.

    Keys are matched case-insensitively by substring: the first key containing
    'name', 'desc' and 'keyw' respectively supplies the corresponding field.

    Args:
        row: Record (DataFrame row or dict) whose 'metainfo_file' entry is either
            the placeholder string 'empty' or a mapping of metainfo keys to values.

    Returns:
        A four-element list ``[name, desc, key, other]``. A field with no matching
        key is ''. ``other`` is a dict holding every entry whose key was not
        consumed by one of the three fields. For an 'empty' metainfo the result
        is ``['', '', '', '']``.
    """
    meta = row['metainfo_file']
    if meta == 'empty':
        return ['', '', '', '']

    dict_keys = list(meta.keys())
    lowered_keys = [k.lower() for k in dict_keys]

    def first_key_containing(fragment):
        # Return the original-cased key of the first match, or None.
        for original, lowered in zip(dict_keys, lowered_keys):
            if fragment in lowered:
                return original
        return None

    fields = []
    # Track the matched KEYS (not their values) so that `other` really excludes
    # the entries already returned as name/description/keywords.
    dict_keys_used = []
    for fragment in ('name', 'desc', 'keyw'):
        matched_key = first_key_containing(fragment)
        if matched_key is None:
            fields.append('')
        else:
            fields.append(meta[matched_key])
            dict_keys_used.append(matched_key)

    other = {k: meta[k] for k in dict_keys if k not in dict_keys_used}
    return fields + [other]

In [8]:
# Parse metainfo file
# Expand each row's metainfo into text columns and append them to `df`.
# The 'Keywords' guard skips the work when the columns already exist.
if 'Keywords' not in df.columns:
    meta_columns = ['Quantlet', 'Description', 'Keywords', 'Other']

    meta_info = df.apply(parse_meta, axis='columns', result_type='expand')
    meta_info.columns = meta_columns
    meta_info = meta_info.astype(str)

    df = pd.concat([df, meta_info], axis=1)

    # The raw metainfo and the helper columns are no longer needed.
    df = df.drop(columns=['metainfo_file', 'Other', 'script_name', 'script_name_no_ext'])

In [9]:
# A row counts as "multiple scripts" when any element of `code_script` is a list.
df['multiple_scripts'] = df['code_script'].apply(
    lambda scripts: any(isinstance(part, list) for part in scripts)
)
df['code_script_joined'] = ''

multi_mask = df['multiple_scripts'] == True
single_mask = df['multiple_scripts'] != True

# Several scripts: join the pieces of each script into one string per script.
df.loc[multi_mask, 'code_script_joined'] = df.loc[multi_mask, 'code_script'].apply(
    lambda scripts: [''.join(script) for script in scripts]
)
# One script: join its pieces and wrap the result in a one-element list.
df.loc[single_mask, 'code_script_joined'] = df.loc[single_mask, 'code_script'].apply(
    lambda script: [''.join(script)]
)

# Number of joined scripts per row.
df['scr_n'] = df['code_script_joined'].apply(len)

In [10]:
# Long format: one row per individual script. `explode` repeats the remaining
# columns (description, keywords, ...) for every script of a row.

df_long = df.explode('code_script_joined')
# The wide frame `df` is kept as well: one row per metainfo file, all scripts together.


In [11]:
# Compare sizes: wide frame (one row per metainfo file) vs. long frame (one row per script).
print(df.shape)
print(df_long.shape)

(4856, 9)
(6743, 9)


In [35]:
# Identifier of the pretrained checkpoint passed to `from_pretrained` below.
model_name = "SEBIS/code_trans_t5_large_source_code_summarization_python_multitask"

In [12]:
# Load the pretrained checkpoint and its tokenizer.
# AutoModelForSeq2SeqLM replaces the deprecated AutoModelWithLMHead and is the
# same class this notebook uses for the untuned comparison model.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# `skip_special_tokens` is a decode-time argument, not a loading option, so it
# is not passed here.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model.to(device)
print(device)



cuda


In [13]:
# tokenization
# Token budgets used for padding/truncation: source code inputs use
# encoder_max_length, target descriptions use decoder_max_length.
encoder_max_length = 512
decoder_max_length = 26

In [14]:
# Split on unique Quantlet names rather than on rows, so every row of a given
# Quantlet ends up in the same partition.
quantlet_names = list(df['Quantlet'].unique())
train_qs, test_qs = train_test_split(quantlet_names, test_size=0.1, random_state=42)

train_mask = df['Quantlet'].isin(set(train_qs))
test_mask = df['Quantlet'].isin(set(test_qs))

train = df[train_mask]
test = df[test_mask]

In [15]:
# Long-format partitions reuse the Quantlet-level split; the training part is
# capped at its first 1000 rows.
train_long = df_long.loc[df_long['Quantlet'].isin(set(train_qs))].iloc[:1000]
test_long = df_long.loc[df_long['Quantlet'].isin(set(test_qs))]

In [16]:
# Build {input_sequence, output_sequence} records (code -> description) for each
# partition and write them as JSON files with a top-level 'data' field.
train_dataset_json = {
    'version': '0.1.0',
    'data': [
        {'input_sequence': code, 'output_sequence': description}
        for code, description in zip(train_long['code_script_joined'], train_long['Description'])
    ],
}

test_dataset_json = {
    'version': '0.1.0',
    'data': [
        {'input_sequence': code, 'output_sequence': description}
        for code, description in zip(test_long['code_script_joined'], test_long['Description'])
    ],
}

for out_path, payload in (
    ('labelled_dataset_descr.json', train_dataset_json),
    ('test_dataset_descr.json', test_dataset_json),
):
    with open(out_path, 'w') as f:
        json.dump(payload, f)

In [17]:
def batch_tokenize_preprocess(batch,
                              tokenizer,
                              max_source_length,
                              max_target_length):
    """Tokenize a batch of source/target texts into model features.

    Args:
        batch: Mapping with "input_sequence" (sources) and "output_sequence"
            (targets) entries.
        tokenizer: Callable tokenizer exposing ``pad_token_id``; it is called with
            ``padding="max_length"`` and ``truncation=True``.
        max_source_length: ``max_length`` used for the sources.
        max_target_length: ``max_length`` used for the targets.

    Returns:
        dict with every field of the tokenized sources plus "labels": the target
        token ids in which each pad token id is replaced by -100.
    """

    def encode(texts, limit):
        # Pad/truncate every text to exactly `limit` tokens.
        return tokenizer(
            texts,
            padding="max_length",
            truncation=True,
            max_length=limit,
        )

    encoded_sources = encode(batch["input_sequence"], max_source_length)
    encoded_targets = encode(batch["output_sequence"], max_target_length)

    features = dict(encoded_sources.items())

    # Mask padding positions with -100 so that they are ignored in the loss.
    pad_id = tokenizer.pad_token_id
    labels = []
    for target_ids in encoded_targets["input_ids"]:
        labels.append([-100 if token == pad_id else token for token in target_ids])
    features["labels"] = labels

    return features

In [18]:
# Load the two JSON files written above; `field="data"` reads the records from
# the top-level 'data' list of each file.
train_dataset = load_dataset("json", data_files="labelled_dataset_descr.json", field="data")
test_dataset = load_dataset("json", data_files="test_dataset_descr.json", field="data")

Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-3969a96a4492c4db/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-3969a96a4492c4db/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-2b9948755a70816a/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-2b9948755a70816a/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [19]:
# Each file was loaded on its own, so both datasets are read from their 'train'
# split; the test file serves as validation data.
train_data_txt = train_dataset['train']
validation_data_txt = test_dataset['train']

In [20]:
def _tokenize_batch(batch):
    """Tokenize one batch with the notebook-level tokenizer and length limits."""
    return batch_tokenize_preprocess(
        batch, tokenizer, encoder_max_length, decoder_max_length
    )


# Replace the raw text columns with tokenized features for both datasets.
train_data = train_data_txt.map(
    _tokenize_batch,
    batched=True,
    remove_columns=train_data_txt.column_names,
)

validation_data = validation_data_txt.map(
    _tokenize_batch,
    batched=True,
    remove_columns=validation_data_txt.column_names,
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/706 [00:00<?, ? examples/s]

In [21]:
def postprocess_text(preds, labels):
    """Strip each text and put every sentence on its own line.

    Args:
        preds: Iterable of predicted strings.
        labels: Iterable of reference strings.

    Returns:
        Tuple ``(preds, labels)`` of lists in which each text is stripped and its
        sentences (as split by ``nltk.sent_tokenize``) are joined with newlines.
    """

    def one_sentence_per_line(text):
        # rougeLSum expects newline after each sentence
        return "\n".join(nltk.sent_tokenize(text.strip()))

    formatted_preds = [one_sentence_per_line(pred) for pred in preds]
    formatted_labels = [one_sentence_per_line(label) for label in labels]
    return formatted_preds, formatted_labels

In [51]:
def compute_metrics(eval_preds, metrics_list=['rouge', 'bleu']):
    """Decode predictions and references and score them with the given metrics.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays. ``preds`` may be
            a tuple, in which case its first element is used. Positions equal to
            -100 are treated as padding.
        metrics_list: Metric names passed to ``evaluate.load``. 'bleu' is computed
            from predictions and references only; every other metric is also
            given ``use_stemmer=True``.

    Returns:
        dict of metric values rounded to 4 decimals. Float scores are scaled by
        100, integer values are kept unscaled, and list values are flattened to
        ``<key>_<position>`` entries (1-based). "gen_len" is the mean number of
        non-pad tokens per prediction.
    """
    preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id

    # -100 cannot be decoded; map it back to the pad token in both arrays.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # POST PROCESSING
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    def scaled(value):
        # Integer values are reported as-is; float scores become percentages.
        if isinstance(value, (bool, int, np.integer)):
            return int(value)
        return round(float(value) * 100, 4)

    results_dict = {}
    for metric_name in metrics_list:
        # Keep the name and the loaded metric object in separate variables: the
        # per-metric arguments are chosen by NAME.
        metric = evaluate.load(metric_name)
        extra_kwargs = {} if metric_name == 'bleu' else {'use_stemmer': True}
        raw_result = metric.compute(
            predictions=decoded_preds, references=decoded_labels, **extra_kwargs
        )

        for key, value in raw_result.items():
            if isinstance(value, (list, tuple)):
                # Flatten list-valued results into scalar entries.
                for position, item in enumerate(value, start=1):
                    results_dict[f"{key}_{position}"] = scaled(item)
            else:
                results_dict[key] = scaled(value)

    # Independent of the metric, so computed once outside the loop.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    results_dict["gen_len"] = round(float(np.mean(prediction_lens)), 4)
    return results_dict

In [54]:
# Inspection only: evaluating the bare name displays the currently bound function.
compute_metrics

<function __main__.compute_metrics(eval_preds, metrics_list=['rouge', 'bleu'])>

In [52]:
# Fine-tuning configuration. Values marked "demo" are kept small for a quick run.
training_args = Seq2SeqTrainingArguments(
    output_dir="results",
    num_train_epochs=5,  # demo
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=4,  # demo
    per_device_eval_batch_size=16,
    # learning_rate=3e-05,
    warmup_steps=500,
    weight_decay=0.1,
    label_smoothing_factor=0.1,
    # Evaluation generates text so that compute_metrics can score decoded output.
    predict_with_generate=True,
    logging_dir="logs",
    logging_steps=100,
    save_total_limit=3,
    # The string "none" disables all logging integrations; passing None does
    # not, because it falls back to the library default.
    report_to="none",
)

# Batches the already-tokenized features for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=validation_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Evaluate on validation_data; when cells run in order this happens before
# training, i.e. with the pretrained weights.
trainer.evaluate()

In [33]:
# Fine-tune the model on train_data with the settings from training_args.
trainer.train()

Step,Training Loss
100,6.4849
200,5.203
300,4.1687
400,3.5527
500,3.1171
600,2.6568
700,2.5512
800,2.2937
900,2.1285
1000,2.0614


TrainOutput(global_step=1250, training_loss=3.117785028076172, metrics={'train_runtime': 607.4897, 'train_samples_per_second': 8.231, 'train_steps_per_second': 2.058, 'total_flos': 1.082523648e+16, 'train_loss': 3.117785028076172, 'epoch': 5.0})

In [37]:
def generate_summary(test_samples, model):
    """Generate summaries for a batch of samples with ``model``.

    Args:
        test_samples: Mapping or dataset exposing an "input_sequence" column of
            source strings.
        model: Model providing ``.device`` and ``.generate``.

    Returns:
        Tuple ``(outputs, output_str)``: the generated token ids and their decoded
        strings with special tokens removed.
    """
    encoded = tokenizer(
        test_samples["input_sequence"],
        padding="max_length",
        truncation=True,
        max_length=encoder_max_length,
        return_tensors="pt",
    )

    # Inputs have to live on the same device as the model that consumes them.
    target_device = model.device
    outputs = model.generate(
        encoded.input_ids.to(target_device),
        attention_mask=encoded.attention_mask.to(target_device),
    )
    return outputs, tokenizer.batch_decode(outputs, skip_special_tokens=True)


# Fresh, untuned copy of the checkpoint to compare against. It is not moved to
# `device`; generate_summary sends the inputs to whatever device the model is on.
model_before_tuning = AutoModelForSeq2SeqLM.from_pretrained(model_name)

test_samples = validation_data_txt.select(range(20))

# Training leaves the fine-tuned model in train mode; switch to eval mode so
# that dropout is disabled while generating.
model.eval()

summaries_before_tuning = generate_summary(test_samples, model_before_tuning)[1]
summaries_after_tuning = generate_summary(test_samples, model)[1]



In [None]:
# Show each reference description next to the summaries generated before and
# after fine-tuning.
separator = '_' * 10
for i, description in enumerate(test_samples["output_sequence"]):
    print(separator)
    print(f'Original: {description}')
    print(f'Summary before Tuning: {summaries_before_tuning[i]}')
    print(f'Summary after Tuning: {summaries_after_tuning[i]}')
    print(separator)
    print('\n')