In [1]:
import torch
print(f'Torch Device Name: {torch.cuda.get_device_name()}')
!nvidia-smi

  from .autonotebook import tqdm as notebook_tqdm


Torch Device Name: NVIDIA GeForce RTX 3060
Sun Sep 11 18:32:37 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.60.02    Driver Version: 512.15       CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:04:00.0  On |                  N/A |
|  0%   48C    P8    13W / 170W |    646MiB / 12288MiB |      4%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+--------------------------------------------

# Import Packages

In [2]:
import os
import re
import nltk
import random
import pickle
import pathlib
import numpy as np
import pandas as pd
from pynvml import *
from string import punctuation
from datasets import Dataset, DatasetDict, load_metric
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/rahul/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

In [4]:
print_gpu_utilization()

GPU memory occupied: 773 MB.


In [5]:
current_dir = pathlib.Path().resolve()

In [6]:
dataset_path = os.path.join(current_dir, 'data/text_keyword_dataset.gzip')

# Save the data-indices of multiple batches as PKL file in Google Drive as you don't lose it later
data_indices_path = os.path.join(str(current_dir), 'utils/batch_indices.pkl')

In [7]:
dataset = pd.read_parquet(dataset_path)
print(f'Total Dataset size: {len(dataset)}')

Total Dataset size: 298311


In [8]:
dataset.head(n=3)

Unnamed: 0,text,keyword
0,Autonomous resource provisioning for multi-ser...,multi-service application\nresource provisioni...
1,Collaborative filtering for orkut communities:...,association rule mining\ncollaborative filteri...
2,A trust management framework for service-orien...,distributed systems\nreputation\nsecurity and ...


# Preprocess Dataset

In [9]:
def get_word_count(text:str): return len(text.split())
def remove_newlines(text:str): return re.sub(r'\n', ';', text) if isinstance(text, str) else ''
def remove_tabs(text: str): return re.sub(r'\t', ' ', text) if isinstance(text, str) else ''

In [10]:
# Some keywords have newline character and some don't. We rather remove newline character from all the keyword sentences

dataset['word_count'] = dataset['text'].apply(get_word_count)
dataset['text'] = dataset['text'].apply(remove_newlines)
dataset['text'] = dataset['text'].apply(remove_tabs)
dataset['keyword'] = dataset['keyword'].apply(remove_newlines)

In [11]:
# There are many sentences which are less than 25 words. This is not helpful to generate accuracy keywords as we don't have much context

indices_to_remove = list(dataset[dataset['word_count']<=25].index)
dataset.drop(indices_to_remove, inplace=True)
dataset.reset_index(drop=True, inplace=True)

In [12]:
def get_record(index:int, data:pd.DataFrame=dataset):
    record = data.iloc[index]
    return {'text': record['text'], 'keywords': record['keyword']}

In [13]:
get_record(1030)

{'text': "Verona Lastre: consolidation provides opening for a new plate vendor;Fewer companies than ever are manufacturing CTP plates. The market has become; globalized, with just four big firms dominating the picture. To the; Samor Group, however, globalization looked like an opportunity; it; reasoned that many a national and local distributor would welcome a; small, competitive, regional manufacturer. A couple of years ago it; formed a company, Verona Lastre, to exploit that opportunity. Now Vela,; as it's familiarly called, has launched its line of high-quality; thermal plates and is busily lining up dealers in Europe and the; Americas;",
 'keywords': 'Verona Lastre;Vela;CTP plates;computer controlled typesetting;printing industry;publishing;'}

In [14]:
dataset.head(n=3)

Unnamed: 0,text,keyword,word_count
0,Autonomous resource provisioning for multi-ser...,multi-service application;resource provisioning;,136
1,Collaborative filtering for orkut communities:...,association rule mining;collaborative filterin...,230
2,A trust management framework for service-orien...,distributed systems;reputation;security and pr...,156


We don't need complete dataset to train the model. Defragment the dataset into multiple batches of 2000 records. This helps in quickly training multiple variations of model.

# Batching & Transform Dataset

In [15]:
def generate_batch_indices(data_indices:list, batch_size:int=20_000, path:str=data_indices_path):
    random.shuffle(data_indices)
    batches = len(data_indices) // batch_size
    print(f'Number of batches: {batches}')
    x = []
    i, batch_len = 0, batch_size
    for index, _ in enumerate(range(batches)):
        x.append(data_indices[i: i+batch_len])
        i += batch_len
    with open(path, 'wb') as f:
        pickle.dump(x, f)

In [16]:
def load_pkl_data(pkl_obj_path):
    with open(pkl_obj_path, 'rb') as f:
        indices = pickle.load(f)
    return indices

In [17]:
# Danger: !Run this only once!
# generate_batch_indices(data_indices = list(dataset.index), batch_size=2000)

In [18]:
batch_indices = load_pkl_data(data_indices_path)

In [19]:
def get_dataset_processed(batch_num:int, batch_indices:list=batch_indices, data:pd.DataFrame=dataset):
    if batch_num > len(batch_indices):
        raise(f'No sufficient batches, Please reduce the batch number')
    record_indices = batch_indices[batch_num]
    sample_dataset = data.iloc[record_indices]
    
    train_set, test_set = train_test_split(sample_dataset, test_size=0.2, shuffle=True)
    train_set, valid_set = train_test_split(train_set, test_size=0.1, shuffle=True)

    train_dataset = Dataset.from_pandas(train_set)
    valid_dataset = Dataset.from_pandas(valid_set)
    test_dataset = Dataset.from_pandas(test_set)
    
    dataset = DatasetDict({
        'train': train_dataset,
        'valid': valid_dataset,
        'test': test_dataset
    })
    
    return dataset

# Model Definition

In [20]:
prefix = "keyword: "
max_input_length = 512
max_target_length = 20
batch_size = 8

# Finetune from total scratch
# model_checkpoint = "sshleifer/distilbart-cnn-12-6"
# Finetune from a check point
checkpoint_num = "1000"
model_checkpoint = os.path.join(current_dir, f'model_saves/checkpoint-{checkpoint_num}')

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to("cuda")
def model_init():
    return AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to("cuda")
data_collator = DataCollatorForSeq2Seq(tokenizer)
metric = load_metric("rouge")

# Tokenization

In [21]:
def clean_text(text):
    sentences = nltk.sent_tokenize(text.strip())
    sentences_cleaned = [s for sent in sentences for s in sent.split("\n")]
    sentences_cleaned_no_titles = [sent for sent in sentences_cleaned if len(sent) > 0 and sent[-1] in punctuation]
    text_cleaned = "\n".join(sentences_cleaned_no_titles)
    return text_cleaned

In [22]:
def preprocess_function(examples):
    texts_cleaned = [clean_text(text) for text in examples["text"]]
    inputs = [prefix + text for text in texts_cleaned]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    
    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["keyword"], max_length=max_target_length, truncation=True)
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [23]:
batch_number = 138
sample_dataset = get_dataset_processed(batch_num=batch_number)
sample_tokenized_datasets = sample_dataset.map(preprocess_function, batched=True)

100%|█████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.18ba/s]
100%|█████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.74ba/s]
100%|█████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.09ba/s]


# Training Args

In [24]:
args = Seq2SeqTrainingArguments(
    os.path.join(current_dir, f'model_saves/'),
    evaluation_strategy="steps",
    eval_steps=100,
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=200,
    learning_rate=4e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=30,
    predict_with_generate=True,
    fp16=True,
    gradient_accumulation_steps=5, 
    gradient_checkpointing=True,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    report_to="tensorboard"
)

# Score computation

In [25]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip()))
                      for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) 
                      for label in decoded_labels]
    
    # Compute ROUGE scores
    result = metric.compute(predictions=decoded_preds, references=decoded_labels,
                            use_stemmer=True)

    # Extract ROUGE f1 scores
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    
    # Add mean generated length to metrics
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id)
                      for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

# Training

In [26]:
trainer = Seq2SeqTrainer(
    model_init=model_init,
    args=args,
    train_dataset=sample_tokenized_datasets["train"],
    eval_dataset=sample_tokenized_datasets["valid"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

loading configuration file /home/rahul/rahul-work/keyword_gen_project/model_saves/checkpoint-1000/config.json
Model config BartConfig {
  "_name_or_path": "/home/rahul/rahul-work/keyword_gen_project/model_saves/checkpoint-1000",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "extra_pos_embeddings": 2,
  "force_bos_token_to_be_generated": true,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 

In [27]:
result = trainer.train()

loading configuration file /home/rahul/rahul-work/keyword_gen_project/model_saves/checkpoint-1000/config.json
Model config BartConfig {
  "_name_or_path": "/home/rahul/rahul-work/keyword_gen_project/model_saves/checkpoint-1000",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "extra_pos_embeddings": 2,
  "force_bos_token_to_be_generated": true,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
100,0.9781,1.573347,34.8025,18.6953,28.2013,28.1952,57.9125
200,0.2586,1.827665,32.9626,17.3425,26.8688,27.0851,57.3562
300,0.0931,2.171056,33.3063,18.0956,27.3779,27.614,57.3062
400,0.0513,2.215236,32.3738,17.8059,26.3773,26.7564,57.375
500,0.0311,2.332826,32.561,17.1926,26.7166,26.872,57.2562
600,0.0235,2.366807,32.2143,17.344,26.2693,26.4955,57.15
700,0.0167,2.50713,32.9637,17.4254,27.4673,27.7801,57.25
800,0.0124,2.401115,33.7962,17.8802,27.7467,27.9392,57.1062
900,0.0089,2.429439,33.1779,17.2073,27.5448,27.7694,57.0938
1000,0.007,2.38886,33.0282,17.4764,27.0293,27.2383,57.1125


The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: word_count, text, __index_level_0__, keyword. If word_count, text, __index_level_0__, keyword are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 160
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: word_count, text, __index_level_0__, keyword. If word_count, text, __index_level_0__, keyword are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 160
  Batch size = 8
Saving model checkpoint to /home/rahul/rahul-work/keyword_gen_project/model_saves/checkpoint-200
Configuration saved in /home/rahul/rahul-work/keyword_gen_project/model_saves/checkpoint-200/

In [22]:
print_summary(result)

Time: 3349.75
Samples/second: 12.90
GPU memory occupied: 7934 MB.


In [30]:
# %load_ext tensorboard
# %tensorboard --logdir f'{os.path.join(current_dir,"/model_saves/runs")}'
# !kill 21574

# Inference

In [6]:
checkpoint_num = "800"
model_checkpoint = os.path.join(current_dir, f'model_saves/checkpoint-{checkpoint_num}')

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [7]:
def model_infernce(text, model, tokenizer) -> str:
    
    def defrag_sents(text:str, word_len:int=100):
        words = nltk.word_tokenize(text)
        results = []
        c = 0
        while c <= len(words):
            results.append(" ".join(words[c:c+word_len]))
            c += word_len
        return results
    
    sentences = defrag_sents(text)
    keywords = []
    
    for sent in sentences:
        text = re.sub("\n", " ", sent)
        inputs = ["keyword: " + text]
        inputs = tokenizer(inputs, max_length=512, truncation=True, return_tensors="pt")
        output = model.generate(**inputs, num_beams=8, do_sample=True, min_length=10, max_length=20)
        decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
        predicted_keyword = nltk.sent_tokenize(decoded_output.strip())[0]

        keywords.append(predicted_keyword)
    return ";".join(keywords)

In [23]:
text = """ Former senior Congress leader, Ghulam Nabi Azad said on Sunday that he has not promised restoration of Article 370 in his new political agenda because he doesn`t believe in making false promises. Addressing a public meeting in north Kashmir`s Baramulla town, Azad said, "To restore Article 370 would need around 350 votes in the Lok Sabha and 175 votes in the Rajya Sabha.

"This is a number no political party has or is likely to ever get. The Congress has shrunken to less than 50 seats and if they speak of restoring Article 370, they are making false promises."

He said his political agenda includes restoration of statehood, land and jobs for the locals as these are achievable objectives.

"Some people have blamed me for voting in favour of the Article 370 abrogation motion brought in by the home minister.

"I have voted against the abrogation and these people who have no idea about the working of Parliament are saying that I voted against Article 370," he said.

He said when he was the chief minister of the state, he arrested 13 police personnel for staging a fake encounter in which three persons were killed.

"The arrested persons are in jail for the last 15 years," he said.

He spoke of the developmental works and creation of districts during his tenure as the chief minister.

"Four new districts were created in the Valley and three in the Jammu division during my tenure as the chief minister. I got new medical colleges during that period.

"Whether I get four votes or lakhs of votes during the elections, I will never deceive the people," he assured the public gathering.

This was Azad`s first public meeting in Kashmir after he resigned from the basic membership of the Congress party."""

In [24]:
keywords = model_infernce(text, model, tokenizer)
keywords = list(set(keywords.split(';')))

input_words = nltk.word_tokenize(text)
for item in keywords:
    print(f'{item}: Present: {True if item in input_words else False}\n')

Politics and Government: Present: False

Azad Ghulam Nabi: Present: False

 Article 370: Present: False

: Present: False

Azad (India): Present: False

Kashmir and Jammu: Present: False

Land Use Act: Present: False

Congress (India: Present: False

Article 370 (Currency): Present: False

India: Present: False

Jammu and Jammu: Present: False



In [26]:
keywords = model_infernce(text, model, tokenizer)
keywords = list(set(keywords.split(';')))

input_words = nltk.word_tokenize(text)
for item in keywords:
    print(f'{item}: Present: {True if item in input_words else False}\n')

: Present: False

DC Comics: Present: False

Politics and Government: Present: False

Decisions and Verdicts: Present: False

News media,journalism: Present: False

Captain Marvel: Present: False

Wawcett Comics: Present: False

NYC: Present: False

 copyright: Present: False

Superhero: Present: False

Superheroes: Present: False

Copyrights: Present: False

computers: Present: False

New York State: Present: False

Computers and the Internet: Present: False

Waw: Present: False

Newspapers: Present: False



In [9]:
keywords = model_infernce(text, model, tokenizer)
keywords = list(set(keywords.split(';')))

input_words = nltk.word_tokenize(text)
for item in keywords:
    print(f'{item}: Present: {True if item in input_words else False}\n')

: Present: False

Copyrights: Present: False

Tech Industry: Present: False

Fawcett: Present: True

Computers and the Internet: Present: False

Superhero: Present: False

Superman: Present: True

DC Comics: Present: False

Captain Marvel: Present: False

volution: Present: False

pionage: Present: False

wedcett Comics: Present: False

Copyrights and Copyright Violations: Present: False

copyrights: Present: True

Fawcett Comics: Present: False

Wawcett Comics: Present: False



GPU memory occupied: 747 MB.
