In [None]:
%pip install transformers==4.28.1
%pip install datasets
%pip install sentencepiece
%pip install rouge_score
%pip install tabulate

In [2]:
import numpy as np
import pandas as pd
import re
import os
import sys
import json 
import ast
import pickle

sys.path.append('../../Quantlet/Create_description/')

import importlib
import preprocessing_utils
importlib.reload(preprocessing_utils)

import torch
import numpy as np
import datasets

from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)

import nltk
from datetime import datetime

import evaluate
nltk.download("punkt", quiet=True)


from datasets import Dataset
from datasets import load_dataset


import evaluate
metric = evaluate.load("rouge")

from sklearn.model_selection import train_test_split, KFold

os.environ["WANDB_DISABLED"] = "true"
os.environ["CUDA_VISIBLE_DEVICES"]="0, 1"


RS = 42

In [3]:
# LOAD created descriptions and the parsed Quantlets
parsed_Qs_file = '../../data/preprocessed/Quantlet/Parsed_Qs_19032023.pkl'
dataset = preprocessing_utils.QuantletDataset(parsed_Qs_file)

with open('../../data/preprocessed/Quantlet/Descriptions_Qs_19032023.pkl', 'rb') as f:
    descriptions = pickle.load(f)

# Identify the most common language in each folder containing metainfo file
dataset = dataset.parsed_Qs_file
dataset.type_script = dataset.type_script.str.replace('ipynb', 'py')
dataset['most_commen_lang'] = dataset.folder_name.map(dataset.groupby(['folder_name'])['type_script'].agg(pd.Series.mode))
dataset['most_commen_lang'] = dataset['most_commen_lang'].astype(str)

# create the additional index to merge discription back to the dataset
dataset['desc_idx'] = dataset.index

In [5]:
unique_metainfo_files = dataset.metainfo_file.astype(str).unique()

tocheck_data = dataset.copy()
tocheck_data.metainfo_file = tocheck_data.metainfo_file.astype(str)
tocheck_data['empty'] = (tocheck_data.metainfo_file=='empty').astype(int)
tocheck_data['empty_ratio'] = tocheck_data.folder_name.map(tocheck_data.groupby('folder_name').empty.sum()) / tocheck_data.folder_name.map(tocheck_data.groupby('folder_name').empty.count())

# identify the repos that do not have metainfo files at all
q_no_meta = tocheck_data[tocheck_data.empty_ratio==1].folder_name.unique()

In [6]:
# remove duplicates
Q_lang = dataset[['folder_name', 'most_commen_lang']].drop_duplicates().reset_index(drop=True)
Q_lang = Q_lang[~Q_lang.folder_name.isin(q_no_meta)]

In [7]:
# create the train and test set
labelled, test = train_test_split(Q_lang, test_size=0.2, random_state=RS, stratify=Q_lang.most_commen_lang)
print(labelled.shape, test.shape)

(2376, 2) (595, 2)


In [8]:
def get_metainfo_field(row, field='keywords'):
    try :
        row = ast.literal_eval(row)
    except:
        return 'empty'
    row = {k.lower():v for (k,v) in row.items()}
    if field in row.keys():
        return row[field]
    else:
        return 'empty'
    
def set_keywords_to_list(row):
    if row=='empty':
        return []
    elif isinstance(row, list):
        return row
    else:
        return [x.strip() for x in row.split(',')]
    
def get_keywords_len(row):
    if row=='empty':
        return 0
    else:
        return len(row)

In [9]:
# extract keywords from metainfo file
dataset['keywords'] = dataset.metainfo_file.apply(get_metainfo_field)

# remove with empty keywords
dataset = dataset[~dataset.keywords.isna()]

# keywords to list
dataset['keywords'] = dataset.keywords.apply(set_keywords_to_list)

In [11]:
mapping = {'cryptocurrency' : ['crypto', 'crpytocurrencies', 'cryptocurrencies'], 
           'visualization' : ['data visualization', 'plot', 'plotting', 'graphical representation', 'data-visualization', 'visualisation', 'data visualisation'],
           'machine learning' : ['ml', 'statistical learning'],
           'deep learning' : ['dl', 'ai', 'artificial intelligence', 'neural network', 'neural networks', 'neural-network', 'neural-networks'],
           'nlp' : ['natural language processing', 'nlp', 'textual analysis', 'text'],
           'web scraping': ['scraping', 'crawler', 'crawling', 'web crawler', 'web crawling'],
           'hacking' : ['hack'],
           'principal component analysis' : ['principal component', 'pca'],
            'time series' : ['ts', 'time-series', 'timeseries'],
            'random forest' : ['rf', 'random forests'], }
mapping_back = {v:k for k, v_list in mapping.items() for v in v_list}

In [12]:
freq_keywords = pd.read_csv('../../data/preprocessed/Quantlet/keywords_higher30.csv')
freq_keywords = freq_keywords.keyword.values

In [13]:
dataset['keywords'] = dataset['keywords'].apply(lambda x: [mapping_back.get(keyword.lower(), keyword.lower()) for keyword in x])
dataset['keywords'] = dataset['keywords'].apply(lambda x: list(set(x)))
dataset['keywords'] = dataset['keywords'].apply(lambda x: [keyword for keyword in x if keyword in freq_keywords])
dataset['keywords'] = dataset['keywords'].apply(lambda x: sorted(x))

In [14]:
# preprocess keywords
dataset['keywords_n'] = dataset['keywords'].apply(get_keywords_len)

In [15]:
no_meta_ds = dataset.query('folder_name in @q_no_meta')
labelled_ds = dataset.query('folder_name in @labelled.folder_name')
test_ds = dataset.query('folder_name in @test.folder_name')

print(labelled_ds.shape, test_ds.shape)
full_shape = labelled_ds.shape[0] + test_ds.shape[0]
print(labelled_ds.shape[0] / full_shape, test_ds.shape[0]/full_shape)


no_meta_ds = no_meta_ds

(3863, 9) (968, 9)
0.7996274063340923 0.20037259366590768


In [16]:
labelled_ds = labelled_ds[labelled_ds.keywords.apply(len)>0]
test_ds = test_ds[test_ds.keywords.apply(len)>0]

labelled_ds.keywords = labelled_ds.keywords.apply(lambda x : '; '.join(x))
test_ds.keywords = test_ds.keywords.apply(lambda x : '; '.join(x))

#labelled_ds = labelled_ds[labelled_ds.description.apply(len)>0]
#test_ds = test_ds[test_ds.description.apply(len)>0]

#labelled_ds.description = labelled_ds.description.apply(lambda x : '; '.join(x))
#test_ds.description = test_ds.description.apply(lambda x : '; '.join(x))

In [17]:
# extract descriptions for train, test, no meta datasets
descriptions_labelled = [descriptions[i] for i in labelled_ds.desc_idx.values]
descriptions_test = [descriptions[i] for i in test_ds.desc_idx.values]
descriptions_no_meta = [descriptions[i] for i in no_meta_ds.desc_idx.values]

In [18]:
# Extract description lists
descriptions_labelled = [list(descr_dict.values()) for descr_dict in descriptions_labelled]
descriptions_test = [list(descr_dict.values()) for descr_dict in descriptions_test]
descriptions_no_meta = [list(descr_dict.values()) for descr_dict in descriptions_no_meta]

descriptions_labelled_list = [' \n '.join(descr[0]) for descr in descriptions_labelled]
descriptions_test_list = [' \n '.join(descr[0]) for descr in descriptions_test]

In [19]:
train_dataset_json = {'version' : '0.1.0',
                     'data' : [{'description': descriptions_labelled_list[i], 
                                'keywords' : labelled_ds.iloc[i]['keywords']} for i in range(len(descriptions_labelled_list))]}

test_dataset_json = {'version' : '0.1.0',
                     'data' : [{'description': descriptions_test_list[i], 
                                'keywords' : test_ds.iloc[i]['keywords']} for i in range(len(descriptions_test_list))]}


with open('../../data/preprocessed/Quantlet/labelled_dataset.json', 'w') as f:
    json.dump(train_dataset_json, f)

with open('../../data/preprocessed/Quantlet/test_dataset.json', 'w') as f:
    json.dump(test_dataset_json, f)

In [20]:
train_dataset = load_dataset("json", data_files="../../data/preprocessed/Quantlet/labelled_dataset.json", field="data")
test_dataset = load_dataset("json", data_files="../../data/preprocessed/Quantlet/test_dataset.json", field="data")

Downloading and preparing dataset json/default to /home/RDC/zinovyee.hub/.cache/huggingface/datasets/json/default-c68c24d2b856788f/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /home/RDC/zinovyee.hub/.cache/huggingface/datasets/json/default-c68c24d2b856788f/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Downloading and preparing dataset json/default to /home/RDC/zinovyee.hub/.cache/huggingface/datasets/json/default-5eff949ce8d8743d/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /home/RDC/zinovyee.hub/.cache/huggingface/datasets/json/default-5eff949ce8d8743d/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [21]:
model_name = "sshleifer/distilbart-xsum-12-3"

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# tokenization
encoder_max_length = 512  # demo
decoder_max_length = 26

train_data_txt, validation_data_txt = train_dataset['train'].train_test_split(test_size=0.1).values()

In [22]:
a = [len(desc) for desc in train_data_txt['description']]
empty_descr_idx = np.where(np.array(a) == 0)


b = [len(desc) for desc in validation_data_txt['description']]
empty_descr_val_idx = np.where(np.array(b) == 0)

In [23]:
# create new dataset exluding those idx
train_data_txt = train_data_txt.select(
    (
        i for i in range(len(train_data_txt)) 
        if i not in set(empty_descr_idx[0])
    )
)

# create new dataset exluding those idx
validation_data_txt = validation_data_txt.select(
    (
        i for i in range(len(validation_data_txt)) 
        if i not in set(empty_descr_val_idx[0])
    )
)



In [24]:
def batch_tokenize_preprocess(batch, tokenizer, max_source_length, max_target_length):
    source, target = batch["description"], batch["keywords"]
    source_tokenized = tokenizer(
        source, padding="max_length", truncation=True, max_length=max_source_length
    )
    target_tokenized = tokenizer(
        target, padding="max_length", truncation=True, max_length=max_target_length
    )

    batch = {k: v for k, v in source_tokenized.items()}
    # Ignore padding in the loss
    batch["labels"] = [
        [-100 if token == tokenizer.pad_token_id else token for token in l]
        for l in target_tokenized["input_ids"]
    ]
    return batch


train_data = train_data_txt.map(
    lambda batch: batch_tokenize_preprocess(
        batch, tokenizer, encoder_max_length, decoder_max_length
    ),
    batched=True,
    remove_columns=train_data_txt.column_names,
)

validation_data = validation_data_txt.map(
    lambda batch: batch_tokenize_preprocess(
        batch, tokenizer, encoder_max_length, decoder_max_length
    ),
    batched=True,
    remove_columns=validation_data_txt.column_names,
)

Map:   0%|          | 0/3009 [00:00<?, ? examples/s]

Map:   0%|          | 0/335 [00:00<?, ? examples/s]

In [25]:

def postprocess_text(preds, labels):

    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

    return preds, labels

In [26]:

def compute_metrics(eval_preds):
    
    preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    
    result = metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )

    # Extract a few results from ROUGE
    result = {key: value * 100 for key, value in result.items()}

    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds
    ]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [27]:
training_args = Seq2SeqTrainingArguments(
    output_dir="results",
    num_train_epochs=5,  # demo
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=16,  # demo
    per_device_eval_batch_size=16,
    # learning_rate=3e-05,
    warmup_steps=500,
    weight_decay=0.1,
    label_smoothing_factor=0.1,
    predict_with_generate=True,
    logging_dir="logs",
    logging_steps=100,
    save_total_limit=3,
    report_to=None
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=validation_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [28]:
trainer.evaluate()

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.




{'eval_loss': 9.648341178894043,
 'eval_rouge1': 0.2321,
 'eval_rouge2': 0.0,
 'eval_rougeL': 0.2052,
 'eval_rougeLsum': 0.2057,
 'eval_gen_len': 18.2866,
 'eval_runtime': 26.5313,
 'eval_samples_per_second': 12.627,
 'eval_steps_per_second': 0.415}

In [29]:
trainer.train()



Step,Training Loss
100,6.0411
200,3.1537
300,2.5482
400,2.2719


TrainOutput(global_step=475, training_loss=3.2890943989000823, metrics={'train_runtime': 378.7514, 'train_samples_per_second': 39.723, 'train_steps_per_second': 1.254, 'total_flos': 9315230810112000.0, 'train_loss': 3.2890943989000823, 'epoch': 5.0})

In [30]:
def generate_summary(test_samples, model):
    inputs = tokenizer(
        test_samples["description"],
        padding="max_length",
        truncation=True,
        max_length=encoder_max_length,
        return_tensors="pt",
    )
    input_ids = inputs.input_ids.to(model.device)
    attention_mask = inputs.attention_mask.to(model.device)
    outputs = model.generate(input_ids, attention_mask=attention_mask)
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return outputs, output_str


model_before_tuning = AutoModelForSeq2SeqLM.from_pretrained(model_name)

test_samples = validation_data_txt.select(range(20))

summaries_before_tuning = generate_summary(test_samples, model_before_tuning)[1]
summaries_after_tuning = generate_summary(test_samples, model)[1]



In [31]:
print(
    
        list(zip(
            range(len(summaries_after_tuning)),
            summaries_after_tuning,
            summaries_before_tuning,
        ))
    
)
print("\nTarget summaries:\n")
print(
    list(enumerate(test_samples["keywords"])))


[(0, 'loss function; pareto; pdf; visualization', " A look at some of the key questions in this week's English language language."), (1, 'machine learning; model; prediction; simulation; visualization', ' A look at some of the key issues faced by researchers in Python.'), (2, '3d; poisson process; price; visualization', ' Researchers at Abertawe Bro Morgan University in Cardiff have published a guide to a new type of computer programming language called NHPPALP.'), (3, 'boxplot; mean; parameter; scatterplot; visualization', ' Here is a guide to the key points on the follow-up to a paper on the USCRIME data set.'), (4, '3d; poisson process; price; visualization', ' The following is a guide to some of the key questions in this article.'), (5, 'crix; cryptocurrency; deep learning; lstm', ' The following is a guide to some of the key questions being asked by researchers at the University of Cambridge.'), (6, 'crix; cryptocurrency; deep learning; time series', ' Here is a guide to some of t

In [32]:
test_samples["keywords"]

['empirical; loss function; model; pareto; visualization',
 'pacf',
 '3d; poisson process; price',
 'empirical; histogram; parameter; visualization',
 'poisson process; price; visualization',
 'crix; cryptocurrency',
 'bitcoin; cryptocurrency',
 'copula; gumbel; visualization',
 'kernel; regression; visualization',
 'classification; machine learning; prediction; visualization',
 'cryptocurrency',
 'cdf; normal; random',
 'bandwidth; black-scholes; implied-volatility; option-price',
 'visualization',
 'density; gaussian; pdf; visualization',
 'poisson process; price; visualization',
 'python',
 'text mining; time series; visualization',
 'cdf; distribution; gumbel; random; visualization',
 'lasso; portfolio; quantile regression; var']