In [1]:
import pandas as pd
import json

# Path to the JSON Lines file
file_path = ["/kaggle/input/marathi-summarization-dataset/marathi-marathi_test.jsonl", "/kaggle/input/marathi-summarization-dataset/marathi-marathi_train.jsonl","/kaggle/input/marathi-summarization-dataset/marathi-marathi_val.jsonl"]


# Initialize an empty list to hold the JSON objects
data = []

# Open the file and read line by line
for i in range(3):
    data.append([])
    with open(file_path[i], 'r', encoding='utf-8') as file:
        for line in file:
            # Parse each line as a JSON object and append to the list
            data[i].append(json.loads(line))
    

# Convert the list of JSON objects into a DataFrame
df_mr = pd.DataFrame(data[1])
df_test_cs_mar = pd.DataFrame(data[0])
df_val_cs_mar = pd.DataFrame(data[2])

# Display the first few rows of the DataFrame to check if it's loaded correctly
print(len(df_mr))
print(len(df_test_cs_mar))
print(len(df_val_cs_mar))

10558
1188
1254


In [2]:
df_mr_train = pd.concat([df_mr, df_test_cs_mar],ignore_index=True)

In [3]:
len(df_mr_train)

11746

In [4]:
df_mr_train['summary'][1]

'रोजीरोटीसाठी महाराष्ट्रात काम करणाऱ्या तामिळनाडूतील थिरुवरुर आणि नागापट्टणम इथल्या सात तरुणांनी तब्बल हजार किलोमीटरचं अंतर पायी कापत गाव गाठलं आहे.'

In [5]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
from transformers import AutoTokenizer

model_checkpoint = "ai4bharat/IndicBART"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint,do_lower_case=False, use_fast=False, keep_accents=True)

tokenizer_config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/832 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.90M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/221 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/398 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
inputs = tokenizer("I loved reading the Hunger Games!")
inputs

{'input_ids': [2, 466, 50171, 30053, 22, 2371, 10777, 536, 30305, 194, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [8]:
tokenizer.convert_ids_to_tokens(inputs.input_ids)

['[CLS]',
 '▁I',
 '▁loved',
 '▁reading',
 '▁the',
 '▁H',
 'ung',
 'er',
 '▁Games',
 '!',
 '[SEP]']

In [9]:
tokenizer.as_target_tokenizer()

<contextlib._GeneratorContextManager at 0x7fe73cd81750>

In [10]:
max_input_length = 512
max_target_length = 128

def preprocess_function(examples):
    inputs = [ex for ex in examples["text"]]
    targets = [ex for ex in examples["summary"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    # Setup the tokenizer for targets
    # no need this line 
    # with tokenizer.as_target_tokenizer():
    labels = tokenizer(targets, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [11]:
!pip install datasets==2.15

Collecting datasets==2.15
  Downloading datasets-2.15.0-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow-hotfix (from datasets==2.15)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets==2.15)
  Downloading dill-0.3.7-py3-none-any.whl.metadata (9.9 kB)
Collecting fsspec<=2023.10.0,>=2023.1.0 (from fsspec[http]<=2023.10.0,>=2023.1.0->datasets==2.15)
  Downloading fsspec-2023.10.0-py3-none-any.whl.metadata (6.8 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
Collecting multiprocess (from datasets==2.15)
  Downloading multiprocess-0.70.15-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K   [90m

In [12]:
from datasets import Dataset
dataset = Dataset.from_pandas(df_mr_train)
len(dataset)

11746

In [13]:
print(dataset[0])

{'source_url': 'https://www.bbc.com/marathi/international-46944888', 'target_url': 'https://www.bbc.com/marathi/international-46944888', 'text': 'गेन्सर टोरोंटोमध्य राहतात आणि त्या 1991पासून शिल्पकला करत आहेत. तब्बल 15 वर्षं अहोरात्र खपून गेन्सर यांनी समुद्रातल्या शिंपल्यांचा चुरा करून अॅडमचं शिल्प तयार केलं. अब्राहम धर्मानुसार अॅडम हा देवानं बनवलेला पहिला मानव मानला जातो. त्यादरम्यानच त्यांना मेंदुच्या Degenerative Autoimmune Disease या विकाराचा सामना करावा लागला. पण हा आजार त्यांना त्यांच्या पेशामुळे झाला हे जेव्हा कळलं तेव्हा फारच उशीर झाला होता. गेन्सर या टोरोंटोमध्य राहतात आणि त्या 1991पासून शिल्पकला करत आहेत. त्या शिल्प बनवण्यासाठी शिंपले, प्रवाळ, सुकलेली पानं आणि कायदेशीर मार्गाने मिळलेली प्राण्याची हाडं यांचा वापर करतात. 1998मध्ये त्यांनी लिलिथ यांचं शिल्प बनवलं. ज्यू लोकांच्या लोककथेप्रमाणे लिलिथ ही शिंपल्यातल्या अंड्यापासून बनलेली पहिली महिला होती. अॅडम यांचं शिल्प निळ्या मझल शिंपल्याच्या घटकांपासून बनवण्याची ही त्यांची स्वत:ची कल्पना होती. कॅनाडाच्या अटलांटिक किनाऱ्याला त्या

In [14]:
dataset_2 = Dataset.from_pandas(df_val_cs_mar)

In [15]:
tokenized_datasets = dataset.map(preprocess_function, batched=True)
tokenized_test_set = dataset_2.map(preprocess_function, batched=True)

Map:   0%|          | 0/11746 [00:00<?, ? examples/s]

Map:   0%|          | 0/1254 [00:00<?, ? examples/s]

In [16]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

pytorch_model.bin:   0%|          | 0.00/976M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [17]:
dataset = {'train': dataset, 'val':dataset_2}

In [18]:
columns = ["input_ids", "labels", "attention_mask"]
tokenized_datasets.set_format(type="torch", columns=columns)

In [19]:
len(dataset["train"])

11746

In [20]:
tokenized_test_set.set_format(type="torch", columns=columns)

In [21]:
model.gradient_checkpointing_enable()

In [24]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [25]:
from transformers import Seq2SeqTrainingArguments

batch_size = 32
num_train_epochs = 8
# Show the training loss with every epoch
logging_steps = len(dataset["train"]) // batch_size
model_name = model_checkpoint.split("/")[-1]

args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}_new_3",
    evaluation_strategy="epoch",
    learning_rate=0.001,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.00001,
    fp16=False,
    save_total_limit=1,
    save_strategy="epoch",
    num_train_epochs=num_train_epochs,
    load_best_model_at_end=True,     # Load the best model at the end of training
    metric_for_best_model='loss',    # Use loss to identify the best model
    predict_with_generate=True,
    push_to_hub=True,
    gradient_accumulation_steps=32,
)

In [26]:
tokenized_datasets = tokenized_datasets.remove_columns(
    dataset["train"].column_names
)

In [27]:
tokenized_test_set = tokenized_test_set.remove_columns(
    dataset["val"].column_names
)

In [28]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_test_set,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [29]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
0,No log,3.448747
2,No log,2.993308
4,No log,2.831832
6,No log,2.804729
7,No log,2.805658


Non-default generation parameters: {'forced_eos_token_id': 2}
Non-default generation parameters: {'forced_eos_token_id': 2}
Non-default generation parameters: {'forced_eos_token_id': 2}
Non-default generation parameters: {'forced_eos_token_id': 2}
Non-default generation parameters: {'forced_eos_token_id': 2}
Non-default generation parameters: {'forced_eos_token_id': 2}
Non-default generation parameters: {'forced_eos_token_id': 2}
Non-default generation parameters: {'forced_eos_token_id': 2}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=88, training_loss=3.242799932306463, metrics={'train_runtime': 7560.4869, 'train_samples_per_second': 12.429, 'train_steps_per_second': 0.012, 'total_flos': 4.870845632820019e+16, 'train_loss': 3.242799932306463, 'epoch': 7.65})

In [30]:
trainer.push_to_hub(commit_message="Training complete", tags="summarization")

Non-default generation parameters: {'forced_eos_token_id': 2}


events.out.tfevents.1712717124.1530285e0282.34.0:   0%|          | 0.00/7.46k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/october-sd/IndicBART_new_3/commit/8fb4d475e9b367dbc0d3479fef4778d4b5c4b3f7', commit_message='Training complete', commit_description='', oid='8fb4d475e9b367dbc0d3479fef4778d4b5c4b3f7', pr_url=None, pr_revision=None, pr_num=None)