# Fine-Tuning the [Nepali Causal LM](https://www.kaggle.com/datasets/reganmaharjan/nepali-trained-gpt2-casual-language-model) Model for a COVID-19 Related Headline Generator
### The dataset used here is a mixture of [NepCov19News](https://www.kaggle.com/datasets/reganmaharjan/nepcov19news) and [NepCov19Tweets](https://www.kaggle.com/datasets/mathew11111/nepcov19tweets)

### NepCov19News is a collection of COVID-related news headlines from 3 different news portals.
**Note: NepCov19Tweets is used here as well because tweets often read like news headlines and summaries: they make a statement and are informative in as few words as possible.**

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc
import tensorflow as tf
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory, then print
# each full path on its own line.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



/kaggle/input/nepcov19news/NepCovNews.csv
/kaggle/input/googletrans-augment-data/googletrans_augmented_data.csv
/kaggle/input/nepali-trained-gpt2-casual-language-model/gpt2NepaliCasualLM/config.json
/kaggle/input/nepali-trained-gpt2-casual-language-model/gpt2NepaliCasualLM/tf_model.h5
/kaggle/input/nepali-trained-gpt2-casual-language-model/gpt2NepaliCasualLM/generation_config.json
/kaggle/input/nepali-tokenizers/Nepali_Wordpiece.tokenizer
/kaggle/input/nepali-tokenizers/Nepali_BPE.tokenizer
/kaggle/input/preprocess-nepcov19tweets/__results__.html
/kaggle/input/preprocess-nepcov19tweets/__resultx__.html
/kaggle/input/preprocess-nepcov19tweets/__notebook__.ipynb
/kaggle/input/preprocess-nepcov19tweets/__output__.json
/kaggle/input/preprocess-nepcov19tweets/custom.css
/kaggle/input/preprocess-nepcov19tweets/preprocess-nepcov19tweets/state.json
/kaggle/input/preprocess-nepcov19tweets/preprocess-nepcov19tweets/dataset_info.json
/kaggle/input/preprocess-nepcov19tweets/preprocess-nepcov19twee

In [2]:
%%time
import datasets #huggingface datasets

# Reduce each source to a single-column frame named 'text' so that the three
# sources can be stacked on top of each other.
gt_aug_data = pd.read_csv(
    "/kaggle/input/googletrans-augment-data/googletrans_augmented_data.csv"
)[['ne']].rename(columns={'ne': 'text'})

nep_cov_news_data = pd.read_csv("/kaggle/input/nepcov19news/NepCovNews.csv")[['text']]

tweet_sentences = datasets.Dataset.load_from_disk(
    "/kaggle/input/preprocess-nepcov19tweets/preprocess-nepcov19tweets"
)['Sentences']
nep_cov_tweets_data = pd.DataFrame({'text': tweet_sentences})

# Stack the sources and drop exact duplicate rows; the row labels coming from
# each source are kept as they are (the index is not reset).
data = pd.concat(
    [gt_aug_data, nep_cov_news_data, nep_cov_tweets_data]
).drop_duplicates()
data

CPU times: user 543 ms, sys: 68 ms, total: 611 ms
Wall time: 615 ms


Unnamed: 0,text
0,कोभिड भ्याक्सिन पनि लगाइयो
1,रामेछापमा कोभिड–१९ सङ्क्रमितको सङ्ख्या ४८ पुगे...
2,स्वास्थ्य मन्त्रालयले कोभिड–१९ को रोकथाम तथा न...
3,"कोभिड छ, अहिले पनि सामाजिक दूरी कायम गरेका छौं..."
4,संयुक्त राज्य अमेरिकाले कोभिड-१९ को मृत्युमा न...
...,...
33445,विश्व स्वास्थ्य संगठनले कोरोना भाइरसबाट हुने र...
33449,कोरोना महामारी मंगलबारमात्रै जनाको मृत्यु औपचा...
33458,कोभिड वारि सिन्धुपाल्चोक पुगेको जस्तो छ सरजी ए...
33463,कोरोनाको न्वारानको नाम कोभिड विश्व स्वास्थ्य स...


In [3]:
# Convert the pandas frame into a Hugging Face Dataset. preserve_index=False
# keeps the pandas index out of the dataset, so there is no
# '__index_level_0__' column to strip afterwards and the cell does not depend
# on what kind of index the frame happens to carry.
data = datasets.Dataset.from_pandas(data, preserve_index=False)

# train_test_split shuffles on its own; seeding the split itself is what makes
# the train/test membership identical across runs (a separate, seeded
# shuffle() beforehand does not, because the split would reshuffle unseeded).
data = data.train_test_split(test_size=0.002, shuffle=True, seed=999)
gc.collect()
print(data)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 50235
    })
    test: Dataset({
        features: ['text'],
        num_rows: 101
    })
})


In [4]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from transformers import PreTrainedTokenizerFast

print("Initializing tokenizer as PreTrainedTokenizerFast")
# Tokenizer.from_file is a static constructor: call it on the class directly
# instead of first building a throwaway Tokenizer(BPE()) instance.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=Tokenizer.from_file("/kaggle/input/nepali-tokenizers/Nepali_BPE.tokenizer")
)
# Register the padding / end-of-sequence / beginning-of-sequence tokens.
# The call's return value is left as the cell output.
tokenizer.add_special_tokens({'pad_token': '[PAD]',"eos_token": "[SEP]", "bos_token":"[CLS]"})

Initializing tokenizer as PreTrainedTokenizerFast


0

In [5]:
def preprocess_function(rows):
    """Tokenize the 'text' field of a batch of rows with the notebook's `tokenizer`.

    Args:
        rows: mapping that exposes the raw strings under the key 'text'.

    Returns:
        Whatever `tokenizer(...)` returns for those strings.
    """
    texts = rows['text']
    encoded = tokenizer(texts)
    return encoded

In [6]:
%%time
print("Tokenizing the data")
# Tokenize every split in batches on 4 worker processes, replacing the raw
# columns with the tokenizer output, then discard 'token_type_ids'.
tokenized_inputs = (
    data
    .map(
        preprocess_function,
        batched=True,
        num_proc=4,
        remove_columns=data["train"].column_names,
    )
    .remove_columns(['token_type_ids'])
)
tokenized_inputs

Tokenizing the data
     

#0:   0%|          | 0/13 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/13 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/13 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/13 [00:00<?, ?ba/s]

     

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

CPU times: user 865 ms, sys: 703 ms, total: 1.57 s
Wall time: 7.81 s


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 50235
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 101
    })
})

In [7]:
block_size = 128

def group_texts(rows):
    """Concatenate a batch of tokenized rows and re-cut it into fixed-size blocks.

    Every column in `rows` (a mapping of column name -> list of token lists) is
    flattened into one long sequence and split into consecutive chunks of
    `block_size` tokens. If tokens are left over after the last full chunk, one
    extra chunk holding the final `block_size` tokens of the sequence is
    appended, so nothing is dropped; that tail chunk overlaps the previous one
    whenever the batch is longer than `block_size`. A batch shorter than
    `block_size` yields exactly one (short) chunk.

    Args:
        rows: batch mapping; must contain an 'input_ids' column and all columns
            are expected to have matching lengths.

    Returns:
        dict with the same columns re-chunked, plus a 'labels' column that is a
        copy of the chunked 'input_ids'.
    """
    # Flatten each column in linear time (sum(lists, []) is quadratic).
    flattened = {
        key: [token for sequence in rows[key] for token in sequence]
        for key in rows.keys()
    }
    total_length = len(next(iter(flattened.values())))

    # Number of tokens covered by full blocks; 0 when the batch is short.
    full_length = (total_length // block_size) * block_size
    leftover = total_length - full_length

    # Split by chunks of block_size.
    result = {
        key: [tokens[start : start + block_size] for start in range(0, full_length, block_size)]
        for key, tokens in flattened.items()
    }

    # Keep the trailing tokens as one final block of (up to) block_size tokens.
    # Uses block_size rather than a literal so the two can never disagree.
    if leftover:
        for key, tokens in flattened.items():
            result[key].append(tokens[-block_size:])

    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
%%time
print("Grouping Tokens to Model Input Size")
# Re-chunk every split with group_texts, batch by batch, on 4 worker processes.
lm_data = tokenized_inputs.map(
    function=group_texts,
    num_proc=4,
    batched=True,
)
lm_data

Grouping Tokens to Model Input Size
     

#0:   0%|          | 0/13 [00:00<?, ?ba/s]

  

#1:   0%|          | 0/13 [00:00<?, ?ba/s]

#2:   0%|          | 0/13 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/13 [00:00<?, ?ba/s]

     

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

   

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
from transformers import DataCollatorForLanguageModeling

print("Initializing Data Collator")
# mlm=False: no masked-language-modelling objective; batches are returned as
# TensorFlow tensors.
collator_options = dict(tokenizer=tokenizer, mlm=False, return_tensors="tf")
data_collator = DataCollatorForLanguageModeling(**collator_options)

In [None]:
from transformers import TFAutoModelForCausalLM, AutoConfig

## To change the size of embedding - N_EMBED must be properly divisible by the size N_HEAD value

pretrained_dir = '/kaggle/input/nepali-trained-gpt2-casual-language-model/gpt2NepaliCasualLM'

# Config values passed alongside the checkpoint: head count, the tokenizer's
# special-token ids, and label maps.
# NOTE(review): id2label/label2id look like leftovers from a sentiment
# classifier — confirm they are actually wanted on a causal LM config.
config_overrides = dict(
    n_head=12,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    id2label={0:"NEUTRAL",1:"POSITIVE",2:"NEGATIVE"},
    label2id={"NEUTRAL":0,"POSITIVE":1,"NEGATIVE":2},
)
model = TFAutoModelForCausalLM.from_pretrained(pretrained_dir, **config_overrides)

# Make the embedding matrix match the tokenizer's vocabulary size, then show
# the resulting configuration and layer summary.
model.resize_token_embeddings(len(tokenizer))
print(model.config)
model.summary()

In [None]:
from transformers import create_optimizer, AdamWeightDecay

# Fixed learning rate with weight decay; compile() is given the optimizer only.
optimizer_settings = {"learning_rate": 2e-5, "weight_decay_rate": 0.001}
optimizer = AdamWeightDecay(**optimizer_settings)
model.compile(optimizer=optimizer)

In [None]:
print("Preparing Training and Testing sets to TRAIN the MODEL")


def _as_tf_dataset(split, shuffle):
    """Pass one split of `lm_data` through `model.prepare_tf_dataset`.

    Both splits share the batch size (16) and the notebook's `data_collator`;
    only the split name and the shuffle flag differ.
    """
    return model.prepare_tf_dataset(
        lm_data[split],
        shuffle=shuffle,
        batch_size=16,
        collate_fn=data_collator,
    )


tf_train_set = _as_tf_dataset("train", shuffle=True)
tf_test_set = _as_tf_dataset("test", shuffle=False)

## Training the Model

In [None]:
%%time

print("Training the model")
# Fit for 10 epochs on the training set, with the test set as validation data.
history = model.fit(
    epochs=10,
    x=tf_train_set,
    validation_data=tf_test_set,
)

# Write the fine-tuned model to the Kaggle working directory, then print the
# recorded training history.
output_dir = "/kaggle/working/CovidNewsHeadlineGenerator"
model.save_pretrained(output_dir)
print(history.history)

In [None]:
from seaborn import lineplot
from matplotlib import pyplot as plt

# Plot training vs. validation loss per epoch on one labelled axis.
# Plain matplotlib is used so the figure does not depend on how a given
# seaborn version interprets a single positional list, and plt.show() replaces
# the argument-less plt.plot(), which drew nothing.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='training loss')
ax.plot(history.history['val_loss'], label='validation loss')
ax.set(title='Fine-tuning loss', xlabel='epoch (0-based)', ylabel='loss')
ax.legend()
plt.show()

In [None]:
from transformers import pipeline

# Nepali prompt prefixes that the fine-tuned model is asked to continue.
prompts = [
    "कोरोना संक्रमित भेटिएपछि",
    "भारतमा एकै दिनमा कोभिड-१९ सङक्रमित",
    "के मास्कले भाइरस",
    "थपिए ४५ जना",
    "स्वास्थ्य तथा जनसंख्या मन्त्रालयले",
    "कोरोना नियन्त्रणमा स्थानीय",
    "डेंगु संक्रमणबाट सिकिस्त",
]

# Text-generation pipeline around the in-memory model and tokenizer (TF backend).
model_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    framework='tf',
)

model_pipeline(prompts)

In [None]:
from huggingface_hub import login

# Never hard-code the access token in the notebook: read it from the HF_TOKEN
# environment variable (e.g. populated from a Kaggle secret). When it is unset
# or empty, token=None lets login() fall back to its own interactive prompt
# instead of submitting an empty string.
login(token=os.environ.get("HF_TOKEN") or None)
model.push_to_hub('raygx/Covid-News-Headline-Generator',commit_message="Finetuned raygx/GPT2-Nepali-Casual-LM model for generating Covid-News; 10 Epochs")

In [None]:
# tokenizer.push_to_hub('raygx/Covid-News-Headline-Generator',commit_message='Uploading Tokenizer')