In [None]:
import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset, DatasetDict
from collections import defaultdict

In [None]:
# Step 1: Segment Text by Emotion
def segment_text_by_emotion(df):
    emotion_segments = defaultdict(list)
    current_emotion = None
    current_segment = []

    for _, row in df.iterrows():
        utterance = row['Utterance']
        emotion = row['Emotion']

        if current_emotion is None:
            current_emotion = emotion

        if emotion != current_emotion:
            emotion_segments[current_emotion].append(' '.join(current_segment))
            current_emotion = emotion
            current_segment = []

        current_segment.append(utterance)

    if current_segment:
        emotion_segments[current_emotion].append(' '.join(current_segment))

    return emotion_segments

In [None]:
# Step 2: Prepare the Data for Fine-Tuning
def prepare_dataset(emotion_segments):
    data = {
        "input_text": [],
        "summary_text": [],
    }

    for emotion, segments in emotion_segments.items():
        for segment in segments:
            summary = f"This segment expresses {emotion}."
            data["input_text"].append(segment)
            data["summary_text"].append(summary)

    dataset = Dataset.from_dict(data)
    return DatasetDict({"train": dataset, "validation": dataset})

In [2]:
# Step 3: Fine-Tune BART Model
def fine_tune_bart(train_dataset, val_dataset, output_dir="./bart-emotion-summarization"):
    tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
    model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

    def tokenize_function(examples):
        model_inputs = tokenizer(examples["input_text"], max_length=1024, padding="max_length", truncation=True)
        labels = tokenizer(examples["summary_text"], max_length=150, padding="max_length", truncation=True)
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    tokenized_train = train_dataset.map(tokenize_function, batched=True)
    tokenized_val = val_dataset.map(tokenize_function, batched=True)

    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

    training_args = TrainingArguments(
        output_dir=output_dir,
        eval_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=3,
        weight_decay=0.01,
        save_total_limit=2,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        data_collator=data_collator,
    )

    trainer.train()
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

# Main Execution Flow
if __name__ == "__main__":
    df = pd.read_csv("/kaggle/input/train-sent-emo-csv/train_sent_emo.csv")
    emotion_segments = segment_text_by_emotion(df)
    dataset = prepare_dataset(emotion_segments)
    fine_tune_bart(dataset["train"], dataset["validation"])


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Map:   0%|          | 0/6096 [00:00<?, ? examples/s]

Map:   0%|          | 0/6096 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,0.0096,0.057192
2,0.0065,0.047271
3,0.0041,0.032401


Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_

In [5]:
!ls ./bart-emotion-summarization

  pid, fd = os.forkpty()


checkpoint-4500  generation_config.json  runs			  vocab.json
checkpoint-4572  merges.txt		 special_tokens_map.json
config.json	 model.safetensors	 tokenizer_config.json


In [4]:
import shutil

# Zip the directory containing the saved model
shutil.make_archive("bart-emotion-summarization", 'zip', 'out')


'/kaggle/working/bart-emotion-summarization.zip'

In [6]:
import os
import subprocess
from IPython.display import FileLink, display

def download_file(path, download_file_name):
    os.chdir('/kaggle/working/bart-emotion-summarization')
    zip_name = f"/kaggle/working/bart-emotion-summarization{download_file_name}.zip"
    command = f"zip {zip_name} {path} -r"
    result = subprocess.run(command, shell=True, capture_output=True, text=True)
    if result.returncode != 0:
        print("Unable to run zip command!")
        print(result.stderr)
        return
    display(FileLink(f'{download_file_name}.zip'))


download_file('/kaggle/working/bart-emotion-summarization', 'out')

Unable to run zip command!

