In [29]:
    from datasets import Dataset
    from transformers import AutoTokenizer
    import os
    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
    # # Set CUDA_VISIBLE_DEVICES to -1 to use CPU only
    # os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [2]:
csv_file_path = "part_1.csv"

In [3]:
dataset = Dataset.from_csv(csv_file_path)

Generating train split: 344277 examples [00:05, 63754.42 examples/s]


In [4]:
print(dataset["Consumer complaint narrative"][:5])
print(dataset["Issue"][:5])

["Ive noticed that several deviations from mandatory reporting standards therefore the data in my report can not be accurate and complete! With that said, I deny your allegations and I challenge you to provide proof that you have the right to report this incomplete negative information on my credit report. \n\nPer 15 U.S.C 1681e Whenever a consumer reporting agency prepares a consumer report it shall follow reasonable procedures to assure maximum possible accuracy of the information concerning the individual about whom the report relates. \n\nAccording to 12 CFR Part 1022 ( Regulation V ) requires furnishers to : Furnish information concerning accounts or other relationships with a consumer that has integrity. \nFurnish information about accounts or other relationships with a consumer that is accurate. \nC - Conduct reasonable investigations of consumer disputes According to XXXX, in order to make a lawful report, you must follow the Metro 2 compliance standards. \n\nAlso as stated in 

In [5]:
dataset = dataset.train_test_split(test_size=0.2)

In [6]:
tokenizer = AutoTokenizer.from_pretrained("t5-small")

In [7]:
example = dataset["train"][0]

In [8]:
tokenized_text = tokenizer(example['Consumer complaint narrative'])
for key in tokenized_text:
    print(key)
    print(tokenized_text[key])

input_ids
[86, 4408, 663, 28, 8, 4506, 6529, 3750, 53, 1810, 3, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 6288, 1713, 3, 4, 4, 4, 4, 6, 3, 4, 4, 4, 4, 3, 4, 4, 4, 4, 3, 4, 4, 4, 4, 3, 4, 4, 4, 4, 6288, 1713, 3, 4, 4, 4, 4, 6, 3, 4, 4, 4, 4, 3, 4, 4, 4, 4, 3, 4, 4, 4, 4, 3, 4, 4, 4, 4, 6288, 1713, 3, 226, 19230, 19230, 3, 4, 4, 4, 4, 6, 3, 4, 4, 4, 4, 3, 4, 4, 4, 4, 3, 4, 4, 4, 4, 6288, 1713, 3, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 6, 3, 4, 4, 4, 4, 3, 4, 4, 4, 4, 6, 11, 3, 4, 4, 4, 4, 3, 4, 4, 4, 4, 3, 4, 4, 4, 4, 6288, 1713, 3, 19230, 19230, 19230, 3, 4, 4, 4, 4, 6, 65, 15108, 26, 82, 2166, 5, 627, 412, 5, 134, 5, 254, 898, 4959, 1375, 1640, 357, 71, 5, 1323, 27, 43, 8, 269, 12, 4570, 5, 627, 412, 5, 134, 5, 254, 898, 4959, 5568, 1640, 591, 71, 5568, 204, 3, 10, 94, 92, 2315, 3, 9, 3733, 5099, 3193, 54, 59, 26808, 3, 9, 905, 406, 82, 1545, 3909, 100, 998, 127, 65, 118, 16, 1223, 12374, 13, 82, 2166, 57, 5099, 27801, 251, 30, 82, 998, 934, 5, 1]
attention_mask
[1, 1, 1, 1, 1, 1, 1, 1, 1

In [9]:
def preprocess_function(examples):
    # Prepends the string "summarize: " to each document in the 'text' field of the input examples.
    # This is done to instruct the T5 model on the task it needs to perform, which in this case is summarization.
    inputs = ["summarize: " + doc for doc in examples["Consumer complaint narrative"]]

    # Tokenizes the prepended input texts to convert them into a format that can be fed into the T5 model.
    # Sets a maximum token length of 1024, and truncates any text longer than this limit.
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    # Tokenizes the 'summary' field of the input examples to prepare the target labels for the summarization task.
    # Sets a maximum token length of 128, and truncates any text longer than this limit.
    labels = tokenizer(text_target=examples["Issue"], max_length=128, truncation=True)

    # Assigns the tokenized labels to the 'labels' field of model_inputs.
    # The 'labels' field is used during training to calculate the loss and guide model learning.
    model_inputs["labels"] = labels["input_ids"]

    # Returns the prepared inputs and labels as a single dictionary, ready for training.
    return model_inputs

In [10]:
tokenized_billsum = dataset.map(preprocess_function, batched=True)

Map: 100%|██████████| 275421/275421 [01:32<00:00, 2966.80 examples/s]
Map: 100%|██████████| 68856/68856 [00:23<00:00, 2962.99 examples/s]


In [11]:
tokenized_billsum['test'][0]['Consumer complaint narrative']

'Requested information to pay off homeXXXX Received first incorrect payoff statement-XXXX. This was incorrect because of the amount of interest due, an item called " Funds owed by borrower \'\', and because the amount of taxes paid by the lender. The lender is also the home builder and paid taxes on the property but paid them late which resulted in late fees. \n\nSince XX/XX/XXXX we have received multiple payoff statements. United Built Homes ( UBH ) has not responded to requests for an explanation of Funds owed by Borrower, has not stated if they will pay for late fees due to their late payment on property taxes, and on the most recent payoff statement, UBH has included insurance and property taxes with these disbursement dates : property taxes XX/XX/XXXX and insurance XX/XX/XXXX. We were told in email to disregard the insurance, but the most recent payoff statement does not reflect this. I have asked for documentation from UBH that the current year \'s taxes have been paid and have n

In [12]:
tokenized_billsum['test'][0]['Issue']

'Trouble during payment process'

In [13]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model="t5-small")

Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.
Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.


In [14]:
import evaluate

rouge = evaluate.load("rouge")

In [15]:
import numpy as np

def compute_metrics(eval_pred):
    # Unpacks the evaluation predictions tuple into predictions and labels.
    predictions, labels = eval_pred

    # Decodes the tokenized predictions back to text, skipping any special tokens (e.g., padding tokens).
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replaces any -100 values in labels with the tokenizer's pad_token_id.
    # This is done because -100 is often used to ignore certain tokens when calculating the loss during training.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Decodes the tokenized labels back to text, skipping any special tokens (e.g., padding tokens).
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Computes the ROUGE metric between the decoded predictions and decoded labels.
    # The use_stemmer parameter enables stemming, which reduces words to their root form before comparison.
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Calculates the length of each prediction by counting the non-padding tokens.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]

    # Computes the mean length of the predictions and adds it to the result dictionary under the key "gen_len".
    result["gen_len"] = np.mean(prediction_lens)

    # Rounds each value in the result dictionary to 4 decimal places for cleaner output, and returns the result.
    return {k: round(v, 4) for k, v in result.items()}

In [16]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [17]:
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

In [31]:
training_args = Seq2SeqTrainingArguments(
    output_dir="my_fine_tuned_t5_small_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=True,
)

In [32]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [33]:
import torch
torch.cuda.empty_cache()

In [34]:
trainer.train()

  0%|          | 0/110172 [02:12<?, ?it/s]
  0%|          | 3/110172 [00:26<250:45:03,  8.19s/it]

OutOfMemoryError: CUDA out of memory. Tried to allocate 320.00 MiB. GPU 0 has a total capacity of 4.00 GiB of which 0 bytes is free. Of the allocated memory 6.01 GiB is allocated by PyTorch, and 745.59 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [22]:
import torch

# Check the name of the current CUDA device
device_name = torch.cuda.get_device_name()
print("Current CUDA device name:", device_name)

# Alternatively, you can specify the device index
device_index = 0  # Example: Check the name of device at index 0
device_name_at_index = torch.cuda.get_device_name(device_index)
print(f"CUDA device name at index {device_index}:", device_name_at_index)


Current CUDA device name: NVIDIA GeForce GTX 1650
CUDA device name at index 0: NVIDIA GeForce GTX 1650
