## Libraries


In [1]:
import torch
import configparser
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import T5ForConditionalGeneration, T5TokenizerFast, get_scheduler
from torch.optim import AdamW
from tqdm import tqdm
from torch.utils.data import DataLoader


  from .autonotebook import tqdm as notebook_tqdm


### Paths loading

In [2]:
config = configparser.ConfigParser()
config.read('config.ini')

prepared_dataset_path = config['PATHS']['prepared_dataset_path']
base_model_path = config['PATHS']['prepared_dataset_path']
trained_model_path = config['PATHS']['prepared_dataset_path']
model_dir = config['PATHS']['model_dir']

train_tokens_path = config['PATHS']['train_tokens_path']
val_tokens_path = config['PATHS']['val_tokens_path']


## Data loading


In [3]:
df = pd.read_csv(prepared_dataset_path)
df.dropna(how='any', inplace=True)
df.isna().sum()
df.head()


Unnamed: 0,input_text,target_text
0,"1 1/2 lbs cube steaks, 1/4 cup self rising flo...",dredge steak pieces in flour. in a large skill...
1,"1 medium leek, (white portion only), halved an...","in a large saucepan, saute leek in butter unti..."
2,"1 whole chicken, 2 c. cream of chicken soup, s...","boil and bone chicken. mix chicken with soup, ..."
3,"1 crab (about 1 1/2 - 2 pounds), 2 inches ging...",mix the sauce and set aside. clean the crab a...
4,"2 1/2 cups flour, all-purpose, 1 1/2 teaspoons...",preheat oven to 375f (190c) (190c). grease bak...


In [4]:
df_train, df_val = train_test_split(df, test_size=0.1, random_state=0)

In [5]:
del df
df_train.shape, df_val.shape

((9000, 2), (1000, 2))

In [7]:
tokenizer = T5TokenizerFast.from_pretrained(model_dir)

batch_size = 100000

def preprocess_in_batches(df, batch_size=512):
    input_ids, attention_masks, labels = [], [], []

    for i in tqdm(range(0, len(df), batch_size)):
        batch_df = df.iloc[i:i+batch_size]

        # Tokenize inputs
        inputs = tokenizer(
            batch_df["input_text"].tolist(),
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # Tokenize targets
        targets = tokenizer(
            batch_df["target_text"].tolist(),
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        input_ids.append(inputs["input_ids"])
        attention_masks.append(inputs["attention_mask"])
        labels.append(targets["input_ids"])

        # Optionally free memory
        del inputs, targets

    # Concatenate all batches into single tensors
    return {
        "input_ids": torch.cat(input_ids, dim=0),
        "attention_mask": torch.cat(attention_masks, dim=0),
        "labels": torch.cat(labels, dim=0)
    }

# Use smaller batches if memory is tight
train_data = preprocess_in_batches(df_train, batch_size=batch_size)
del df_train

val_data = preprocess_in_batches(df_val, batch_size=batch_size)
del df_val

100%|█████████████████████████████████████████████| 1/1 [00:05<00:00,  5.75s/it]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.40it/s]


In [8]:
########### Saving token because it is costly to run multiple time

torch.save(train_data, train_tokens_path)
torch.save(val_data, val_tokens_path)


In [9]:
########### Saving token because it is costly to run multiple time

train_data = torch.load(train_tokens_path)
val_data = torch.load(val_tokens_path)

In [10]:
batch_size = 4
num_epochs = 3
lr = 5e-5

train_loader = DataLoader(
    list(zip(train_data["input_ids"], train_data["attention_mask"], train_data["labels"])),
    batch_size=batch_size,
    shuffle=True
)
val_loader = DataLoader(
    list(zip(val_data["input_ids"], val_data["attention_mask"], val_data["labels"])),
    batch_size=batch_size
)

### Model loading

In [11]:
# Load the model safely
model = T5ForConditionalGeneration.from_pretrained(
    model_dir,
    dtype='auto',
    device_map="auto"            # automatically put on GPU if available
)

The module name  (originally ) is not a valid Python identifier. Please rename the original module to avoid import issues.


In [12]:
device = next(model.parameters()).device
print("Model loaded on device:", device)


Model loaded on device: cuda:0


In [13]:
optimizer = AdamW(model.parameters(), lr=lr)
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)


In [None]:
model.device

### Evaluating model performance before finetunning

In [15]:
model.eval()  # no gradient updates
val_loss = 0
all_preds = []
all_labels = []

with torch.no_grad():
    total_loss = 0
    for batch in tqdm(val_loader, desc="Evaluating initial loss"):
        input_ids, attention_mask, labels = batch  # unpack the tuple
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )        
        total_loss += outputs.loss.item()

        # generated_ids = model.generate(
        #     input_ids=input_ids,
        #     attention_mask=attention_mask,
        #     max_length=labels.shape[1],
        #     num_beams=4
        # )

        # preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        # refs = tokenizer.batch_decode(labels, skip_special_tokens=True)

        # all_preds.extend(preds)
        # all_labels.extend(refs)

initial_loss = total_loss / len(val_loader)
print(f"Average loss before fine-tuning: {initial_loss:.4f}")


Evaluating initial loss: 100%|████████████████| 250/250 [04:25<00:00,  1.06s/it]

Average loss before fine-tuning: 23.5920



