Imports and data preparation


In [69]:
import json
from datasets import Dataset
import pandas as pd
import peft
import os
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TrainingArguments, Trainer

In [50]:
# Paths of the JSON inputs and of the JSONL files derived from them
input_file_train  = 'data/billsum_train.json'  # Replace with your JSON file path
output_file_train = 'data/output_train.jsonl'  # Path to save the JSONL file

input_file_val  = 'data/billsum_val.json'  # Replace with your JSON file path
output_file_val = 'data/output_val.jsonl'  # Path to save the JSONL file


def convert_json_to_jsonl(input_path, output_path):
    """Rewrite a JSON file holding a list of records as JSON Lines.

    Args:
        input_path: Path of a UTF-8 JSON file whose top-level value is a list.
        output_path: Path of the JSONL file to create (overwritten if present).

    Returns:
        The number of records written.

    Raises:
        ValueError: If the top-level JSON value is not a list. Iterating over a
            dict would silently write only its keys, one per line.
    """
    with open(input_path, 'r', encoding='utf-8') as source:
        records = json.load(source)

    if not isinstance(records, list):
        raise ValueError(
            f"Expected a JSON list in {input_path}, got {type(records).__name__}"
        )

    with open(output_path, 'w', encoding='utf-8') as target:
        for record in records:
            # ensure_ascii=False keeps non-ASCII characters readable in the output
            json.dump(record, target, ensure_ascii=False)
            target.write('\n')  # One JSON object per line
    return len(records)


# Convert the training split first, then the validation split
for source_path, target_path in (
    (input_file_train, output_file_train),
    (input_file_val, output_file_val),
):
    convert_json_to_jsonl(source_path, target_path)


Loading the data

In [51]:
def load_jsonl(file_path):
    """Read a JSON Lines file into a list of parsed records.

    The file is decoded as UTF-8, matching how the JSONL files in this notebook
    are written. Lines that are empty or contain only whitespace are skipped so
    that a trailing newline or blank separator does not raise a decode error.

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return [json.loads(line) for line in file if line.strip()]

# Read the JSONL files produced by the conversion cell into DataFrames.
# The paths are taken from the variables defined there instead of being
# repeated as string literals, so the two cells cannot drift apart.
file_path = output_file_train
data = pd.read_json(file_path, lines=True)
dev_df = pd.read_json(output_file_val, lines=True)

Checking the loaded data

In [52]:
# Preview the first three rows of the training DataFrame
data[0:3]

Unnamed: 0,text,summary
0,The people of the State of California do enact...,(1) Existing law regulates pawnbrokers and req...
1,The people of the State of California do enact...,Existing property tax law establishes a vetera...
2,The people of the State of California do enact...,"Existing law, the Federal Surplus Property Acq..."


Selecting the model 

In [53]:
# NOTE: per the original author's note, the model originally used is not
# available through the transformers library, so an available checkpoint
# (chosen with the help of ChatGPT) is used as a stand-in. The optimisation
# process stays the same.
# TODO: check the LoRA target modules against this architecture.
model_name = "google/flan-t5-large"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

Tokenize the data

In [54]:
def preprocess_data(df, tokenizer, max_input_length=128, max_target_length=128):
    """Tokenize a text/summary DataFrame into a dataset ready for seq2seq training.

    Args:
        df: DataFrame with a 'text' column (model input) and a 'summary'
            column (target).
        tokenizer: Callable tokenizer returning 'input_ids' and
            'attention_mask' for a list of strings.
        max_input_length: Inputs are truncated/padded to this many tokens.
        max_target_length: Targets are truncated/padded to this many tokens.

    Returns:
        The mapped dataset holding the tokenizer's input fields plus a 'labels'
        column; the raw 'text' and 'summary' columns are removed.
    """
    # Label value skipped by the cross-entropy loss used in Hugging Face models
    ignore_index = -100

    def tokenize_function(examples):
        # Plain Python lists are enough here: the batched map stores lists
        # anyway, so no tensor conversion is requested.
        inputs = tokenizer(
            examples['text'],
            truncation=True,
            padding='max_length',
            max_length=max_input_length,
        )
        targets = tokenizer(
            examples['summary'],
            truncation=True,
            padding='max_length',
            max_length=max_target_length,
        )

        # Mask padded target positions so they do not contribute to the loss.
        # The attention mask is used (rather than comparing with the pad id)
        # so that genuine tokens sharing the pad id are never masked.
        inputs['labels'] = [
            [token if keep else ignore_index for token, keep in zip(ids, mask)]
            for ids, mask in zip(targets['input_ids'], targets['attention_mask'])
        ]

        return inputs

    # Create a Dataset from the two columns the model needs
    dataset = Dataset.from_pandas(df[['text', 'summary']])
    return dataset.map(tokenize_function, batched=True, remove_columns=['text', 'summary'])

Padding

In [55]:
# T5 tokenizers ship with their own dedicated pad token, so the padding setup
# is left untouched here. Re-pointing pad_token_id at eos_token_id (a habit
# from decoder-only models that lack a pad token) would pad inputs and labels
# with end-of-sequence ids and would also make the fallback below unreachable.

# Fallback for tokenizers that define no pad token at all: register one and
# grow the embedding matrix to cover the new id.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))


# Tokenize the training and validation splits
train_dataset = preprocess_data(data, tokenizer)
dev_dataset = preprocess_data(dev_df, tokenizer)

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

- rank of the low rank adaptation(r) = 8
- Scaling factor lora_alpha = 16 
- lora_dropout = 0.1
- target_modules = ["q", "k", "v", "o"]


In [56]:

lora_config = LoraConfig(
    task_type=peft.TaskType.SEQ_2_SEQ_LM,  # Encoder-decoder LM, so PEFT wraps it as a seq2seq model
    r=8,                      # Rank of the low-rank adaptation (higher = more capacity)
    lora_alpha=16,            # Scaling factor for updates (higher = larger updates)
    lora_dropout=0.1,         # Dropout rate for LoRA layers (higher = more regularization)
    target_modules=["q", "k", "v", "o"],  # Layers to apply LoRA (more layers = more complexity)
)

# Attach the LoRA adapters to the base model. Without this step the config is
# never used and the Trainer would fine-tune every weight of the base model.
# The isinstance guard keeps the cell safe to re-run: an already wrapped model
# is not wrapped a second time.
if not isinstance(model, peft.PeftModel):
    model = get_peft_model(model, lora_config)

# Report how many parameters are trainable after wrapping
model.print_trainable_parameters()

In [71]:

import pandas as pd

from transformers import TrainerCallback


class LossLogger(TrainerCallback):
    """Trainer callback that collects every reported ``loss`` value in memory."""

    def __init__(self):
        # Loss values in the order they were logged
        self.losses = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Record the ``loss`` entry of a logging event; other events are ignored."""
        if not logs or "loss" not in logs:
            return
        self.losses.append(logs["loss"])

    def save_losses(self, output_dir):
        """Write the collected losses, one per line, to ``losses.txt`` in ``output_dir``.

        The directory is created when it does not exist yet.
        """
        os.makedirs(output_dir, exist_ok=True)
        target_path = os.path.join(output_dir, "losses.txt")
        with open(target_path, "w") as handle:
            handle.writelines(f"{value}\n" for value in self.losses)
        print(f"Losses saved to {target_path}")


In [72]:
# Callback instance that will collect the training losses; it is passed to the Trainer below
loss_logger = LossLogger()

In [73]:
# Writable locations (relative to the current working directory) for the
# trained model and for the logs
output_dir = './t5large-finetuned-lora'
logging_dir = './logs'

# Make sure both directories exist before training starts
for directory in (output_dir, logging_dir):
    os.makedirs(directory, exist_ok=True)

# Training settings
training_args = TrainingArguments(
    output_dir=output_dir,            # Where checkpoints and the trained model are saved
    logging_dir=logging_dir,          # Where logs are written
    num_train_epochs=1,               # Number of passes over the training data
    per_device_train_batch_size=8,    # Training batch size per device
    per_device_eval_batch_size=8,     # Evaluation batch size per device
    learning_rate=1e-4,               # Learning rate
    weight_decay=0.01,                # Weight decay
    logging_steps=25,                 # Log every 25 steps
    evaluation_strategy="epoch",      # Evaluate at the end of each epoch
    eval_steps=25,                    # NOTE(review): presumably unused with the "epoch" strategy; confirm
    save_strategy="epoch",            # Save a checkpoint after each epoch
    remove_unused_columns=False       # Keep all dataset columns for debugging
)

# Bring the model, the settings, the datasets and the loss logger together
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    tokenizer=tokenizer,
    callbacks=[loss_logger]
)



  trainer = Trainer(


Train 


In [78]:
# Train the model and save the result
trainer.train()
trainer.save_model()

# Evaluate the model on the evaluation dataset given to the Trainer
eval_results = trainer.evaluate()

# Print the evaluation results
print("Evaluation Results:", eval_results)

# Save the logged training losses as a text file (losses.txt) inside the
# notebook's log directory. The previous target '/logs' pointed at the
# filesystem root, which is normally not writable.
loss_logger.save_losses(logging_dir)

Saving the model to a local directory

In [76]:
# Show the current working directory, against which relative paths are resolved
os.getcwd()

'/Users/pavankumar/Documents/Winter_Semester24/Applications/AI_startup/Actual_work/Getting_the_summaries_of_each_response_from_model'

In [None]:
# Save the model and tokenizer next to the notebook. The path is relative to
# the current working directory; the previous '/finetuned model' pointed at
# the filesystem root, which is normally not writable.
save_path = './finetuned model'
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

Try the model 

In [None]:
prompt = "Train a model for predictive maintenance"

# Put the inputs on whatever device the model lives on (CPU, CUDA or MPS).
# Hard-coding "cuda" fails on machines without CUDA and can leave the inputs
# and the model on different devices.
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

outputs = model.generate(input_ids=input_ids, max_new_tokens=50)

# Decode the generated tokens
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

Results

In [None]:
# NOTE(review): train_predictive_maintenance_model is not defined or imported
# anywhere in the visible notebook, so running this cell would raise NameError.
# Presumably this line is text generated by the model for the prompt above,
# pasted into a code cell. Confirm, and move it to a markdown cell if so.
train_predictive_maintenance_model(X='maintenance_data', y='predictive_labels')