In [1]:
import subprocess
import sys

# Make sure a pip distribution is importable, installing it on demand.
def install_and_import(package, import_name=None):
    """Install ``package`` with pip when its module cannot be imported.

    Args:
        package: Distribution name handed to ``pip install``.
        import_name: Module name used for the import check. Defaults to
            ``package`` with hyphens replaced by underscores, because a name
            such as ``sentence-transformers`` is not a valid module name and
            would otherwise fail the check (and reinstall) on every run.

    Raises:
        subprocess.CalledProcessError: If the pip command exits non-zero.
    """
    import importlib  # local import keeps this helper self-contained

    module_name = import_name or package.replace("-", "_")
    try:
        importlib.import_module(module_name)
    except ImportError:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        # Let import statements that follow see the freshly installed package.
        importlib.invalidate_caches()

# pip distribution names this notebook depends on
packages = ["torchmetrics", "sentence-transformers", "transformers",
            "gspread", "oauth2client", "dask"]

# Make sure each dependency is present before the imports below run
for package in packages:
    install_and_import(package)

# Now you can import all necessary modules
import os
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
import torch
from torch import nn, optim
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel, AutoConfig
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses
from torchmetrics.classification import BinaryF1Score, BinaryAccuracy, BinaryPrecision, BinaryRecall
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import pandas as pd
import ast





[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
from model_def import *
from custom_data_set import *
from train_script import *
from misc import *
from save_load import *

Batch-aware KDE plots have been generated and displayed.


In [19]:
def get_hyperparameters():
    """Return a new dict holding the run's hyperparameters.

    Every value is stored as a string; the cells that consume them convert
    each one to the type they need.
    """
    settings = (
        ('Encoder', 'bert-base-uncased'),
        ('FFNN', 'True'),
        ('Same Eno', 'True'),
        ('Encoder Req Grad', 'False'),
        ('Freeze Epochs', '0'),
        ('Bandwith', '0.5'),
        ('PDF', 'gaussian'),
        ('Batch Size', '32'),
        ('Epochs', '10'),
        ('loss weight', '1.0'),
        ('lr', '0.001'),
        ('L2', '0.0'),
    )
    return dict(settings)


hyperparameters = get_hyperparameters()
# Independent shallow copy used as the checkpoint record
cp = dict(hyperparameters)
# No checkpoint file selected: the `if cp_path:` branch further down is skipped
cp_path = None

Will be pulling from checkpoint model


Path             CP2_1
Complete         FALSE
Epochs               1
Train Loss            
Test Loss             
Train F1              
Test F1               
Cost per hour         
Time (Hours)          
Date                  
Notes                 
Performance           
Name: 1, dtype: object

In [20]:
# Resolve the encoder checkpoint name and load its config and tokenizer.
model_path = hyperparameters['Encoder']
model_config = AutoConfig.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
embedding_dim = model_config.hidden_size


def _as_bool(raw):
    """Interpret a flag such as 'True', 'FALSE' or a real bool as a bool."""
    return str(raw).strip().lower() == "true"


# The flags are stored as strings. Parse all three the same way instead of
# eval()-ing one of them: eval raises NameError on spellings like 'TRUE' and
# would execute whatever text the field happens to contain.
end_pred_sequential = _as_bool(hyperparameters['FFNN'])
same_encoder = _as_bool(hyperparameters['Same Eno'])
encoder_grad = _as_bool(hyperparameters['Encoder Req Grad'])
freeze_epochs_value = hyperparameters['Freeze Epochs']
# NOTE(review): 'apply_to' is filled from 'Freeze Epochs' — confirm this is the
# intended source for the KDE setting and not a copy-paste slip.
vect_settings = {'apply_to': hyperparameters['Freeze Epochs'], 'bandwidth': str(hyperparameters['Bandwith']), 'pdf_type': hyperparameters['PDF']}

try:
    freeze_epochs = int(freeze_epochs_value)
except (ValueError, TypeError):
    # Non-integer entries (e.g. 'None' or a list literal) are parsed as Python
    # literals only; arbitrary expressions are rejected instead of executed.
    freeze_epochs = ast.literal_eval(freeze_epochs_value) if isinstance(freeze_epochs_value, str) else freeze_epochs_value

In [5]:
# Data-loading configuration
random_seed = 42
# NOTE(review): the path has a space right after "../" — confirm the directory
# really is named " data" and this is not a typo.
data_path = ["../ data/2.5m.csv"]
batch_size = int(hyperparameters['Batch Size'])
test_size = .025  # presumably the held-out fraction; forwarded to Load_Py_CSV
collate = Collate_Fn()

# `tokenizer` is the one created in the encoder-setup cell from the same
# `model_path`; reloading the config and tokenizer here was redundant.
train_dataset = Load_Py_CSV(data_path, tokenizer=tokenizer, test_size=test_size)
test_dataset  = Load_Py_CSV(data_path, tokenizer=tokenizer, test_size=test_size, train=False)

# Both samplers share the batch size and seed.
train_sampler = BatchLenSampler(data_source=train_dataset, batch_size=batch_size, seed=random_seed)
test_sampler = BatchLenSampler(data_source=test_dataset, batch_size=batch_size, seed=random_seed)

# NOTE(review): the samplers receive `batch_size`, but they are passed as
# `sampler=` (not `batch_sampler=`) and DataLoader keeps its default batch
# size — confirm this matches what BatchLenSampler yields.
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, collate_fn=collate)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, collate_fn=collate)

Cleaning done, now splitting


Calculating lengths: 100%|██████████| 47/47 [00:01<00:00, 39.68it/s]


Loaded 47916 samples, train=True
Cleaning done, now splitting


Calculating lengths: 100%|██████████| 2/2 [00:00<00:00, 67.66it/s]

Loaded 1229 samples, train=False





In [21]:
# Pick the compute backend: CUDA first, then Apple MPS, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'

# Build the model from the settings parsed earlier, then move it to `device`.
model = TSN_model(
    encoder_model=model_path,
    device=device,
    same_encoder=same_encoder,
    encoder_tune=encoder_grad,
    end_pred_sequential=end_pred_sequential,
    kde_config=vect_settings,
)
model = model.to(device)

Encoder(s) trainable status set to: True


In [22]:
# Number of epochs to train in this run. `cp_path` is the same switch that
# decides below whether a checkpoint is loaded (the previously referenced
# `checkpoint_mode` is not defined anywhere in this notebook). The stored value
# may be a string, so cast once here; later cells do arithmetic with it.
Epochs = int(cp['Epochs'] if cp_path else hyperparameters['Epochs'])

# Binary classification metrics at a 0.5 threshold, moved to the run device.
metrics_measure = [
    BinaryF1Score(threshold=.5).to(device),
    BinaryPrecision(threshold=.5).to(device),
    BinaryRecall(threshold=.5).to(device),
]

# Positive-class weight for the BCE-with-logits loss.
bi_ratio_weight = torch.Tensor([float(hyperparameters["loss weight"])])

loss_fn = nn.BCEWithLogitsLoss(pos_weight=bi_ratio_weight).to(device)

lr = float(hyperparameters['lr'])

# Weight decay (the 'L2' hyperparameter) is applied to the prediction layer
# only; both encoders are optimised without it.
param_list = [{"params": model.prompt_encoder.parameters(), "weight_decay":0},
               {"params": model.caption_encoder.parameters(), "weight_decay":0},
               {"params": model.pred_layer.parameters(), "weight_decay":float(hyperparameters['L2'])},]

if cp_path:
    print("Loading model from checkpoint")
    model, optim_fn, start_epoch, loaded_metrics = load_model(filepath=cp_path,
                                                 model=model,
                                                 device=device)
else:
    optim_fn = None
    start_epoch = 0
    # Bind the name on this path too so later cells never hit a NameError.
    loaded_metrics = None

# Use the optimizer restored from the checkpoint, or create a fresh Adam.
optim_fn = optim_fn or optim.Adam(param_list, lr=lr)

Loading model from checkpoint
Model loaded from ../../saved_models/checkpoints/CP1_16.pt
Resuming from epoch 1


In [23]:
# Seed torch's RNG right before training starts.
torch.manual_seed(42)

# Run training and keep the metrics this run produces.
new_metrics = train_model(
    model=model,
    tokenizer=tokenizer,
    optim=optim_fn,
    loss_fn=loss_fn,
    train_data=train_dataloader,
    test_data=test_dataloader,
    metrics=metrics_measure,
    epochs=Epochs,
    device=device,
    freeze_epochs=freeze_epochs,
    start_epochs=start_epoch,
)

                                                           

Epoch: 2 | Train Loss: 0.858 |BinaryF1Score: 0.285, BinaryPrecision: 0.220, BinaryRecall: 0.406 | Test Loss: 0.830 | BinaryF1Score: 0.339, BinaryPrecision: 0.265, BinaryRecall: 0.480


In [28]:
# Display the metrics returned by load_model; the name is only bound when a checkpoint path (cp_path) was set.
loaded_metrics

{'train_loss_ar': [0.887244701385498],
 'train_metrics_ar': [tensor(0.2598, device='mps:0'),
  tensor(0.2069, device='mps:0'),
  tensor(0.3792, device='mps:0')],
 'test_loss_ar': [0.8447503447532654],
 'test_metrics_ar': [tensor(0.3331, device='mps:0'),
  tensor(0.2718, device='mps:0'),
  tensor(0.4339, device='mps:0')],
 'time_elapsed': 105.9577488899231}

In [29]:
if cp_path and loaded_metrics:
    # Resumed run: append this run's losses and elapsed time to the restored ones.
    combined_metrics = {
        key: loaded_metrics[key] + new_metrics[key]
        for key in ("train_loss_ar", "test_loss_ar", "time_elapsed")
    }

    def safe_cat(t1, t2):
        """Concatenate two tensors, first promoting any 0-dim tensor to 1-D."""
        promoted = [t.unsqueeze(0) if t.dim() == 0 else t for t in (t1, t2)]
        return torch.cat(promoted)

    # Join each restored metric tensor with its counterpart from this run.
    combined_metrics["train_metrics_ar"] = [
        safe_cat(previous, new_metrics["train_metrics_ar"][position])
        for position, previous in enumerate(loaded_metrics["train_metrics_ar"])
    ]
    combined_metrics["test_metrics_ar"] = [
        safe_cat(previous, new_metrics["test_metrics_ar"][position])
        for position, previous in enumerate(loaded_metrics["test_metrics_ar"])
    ]
else:
    # Nothing restored: this run's metrics are the whole history.
    combined_metrics = new_metrics

# Pull the loss histories and the elapsed time out of the merged record.
train_loss_ar = combined_metrics["train_loss_ar"]
test_loss_ar = combined_metrics["test_loss_ar"]
time_elapsed = combined_metrics['time_elapsed']

# Convert every metric tensor to a NumPy array on the CPU.
metric_names = ['F1 Score', 'Precision', 'Recall']
train_metrics = [metric.cpu().numpy() for metric in combined_metrics["train_metrics_ar"]]
test_metrics = [metric.cpu().numpy() for metric in combined_metrics["test_metrics_ar"]]

# Pair the train and test values of each metric in one array.
metrics_array = np.array([
    [train_values, test_metrics[position]]
    for position, train_values in enumerate(train_metrics)
])

# One plotting record per panel: the loss first, then each named metric.
metrics_ar = [{"name": 'Loss', "test_data": test_loss_ar, "train_data": train_loss_ar}]
metrics_ar.extend(
    {"name": name, "test_data": test, "train_data": train}
    for name, test, train in zip(metric_names, test_metrics, train_metrics)
)

In [None]:
# 2x2 grid of panels, flattened so it can be zipped with the metric records.
fig, axs = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Model Metrics')
axs = axs.ravel()

for ax, diction in zip(axs, metrics_ar):
    ax.set(
        title=f"{diction['name']} over time",
        ylabel=diction["name"],
        xlabel='Epochs',
    )
    # Test curve is drawn first, then train, so line colours stay the same.
    ax.plot(diction["test_data"], label='test')
    ax.plot(diction["train_data"], label='train')
    ax.legend()

fig.tight_layout()
plt.show()

In [34]:
def safe_round(value, decimals=3):
    """
    Round a scalar, or the last element of a list, tuple, array or tensor.

    Args:
    value: A plain number, a list/tuple, a NumPy array or scalar, or any
        tensor-like object exposing ``reshape`` and ``item``. For containers
        the last element (in flattened order) is the one that gets rounded.
    decimals: The number of decimal places to round to. Default is 3.

    Returns:
    The rounded value. Containers, arrays and tensors yield a Python float;
    a plain number is passed through the built-in ``round``.

    Raises:
    IndexError: If ``value`` is an empty list, tuple, array or tensor.
    """
    if isinstance(value, (list, tuple)):
        # Recurse so lists whose items are arrays or tensors work as well.
        return float(safe_round(value[-1], decimals))
    if hasattr(value, "reshape") and hasattr(value, "item"):
        # Flatten first: a 0-d array cannot be indexed with [-1], and a
        # multi-element tensor refuses a direct .item() call.
        return round(float(value.reshape(-1)[-1].item()), decimals)
    try:
        # Objects that only offer .item()
        return round(value.item(), decimals)
    except AttributeError:
        # No .item() available: treat it as a regular number
        return round(value, decimals)
    
    
# Work on a copy so re-running this cell never writes result keys into the
# `hyperparameters` dict that earlier cells read from.
params = dict(hyperparameters)

params['Train Loss'] = safe_round(train_loss_ar[-1], 3)
params['Test Loss'] = safe_round(test_loss_ar[-1], 3)
# metrics_ar[0] is the loss entry; metrics_ar[1] is the 'F1 Score' entry.
params['Train F1'] = safe_round(metrics_ar[1]["train_data"], 3)
params['Test F1'] = safe_round(metrics_ar[1]["test_data"], 3)
# Convert to hours before rounding; rounding first and dividing afterwards
# left the stored value unrounded.
params['Time (Hours)'] = safe_round(time_elapsed / 3600)
params['Cost'] = 0
params['Date'] = current_date_with_ordinal()
params['Complete'] = True

In [36]:
save_path = "../checkpoints/"

# Fail early with a clear message when no checkpoint name has been set,
# before any directory is created.
checkpoint_name = cp.get('Path')
if not checkpoint_name:
    raise KeyError("cp has no 'Path' entry; set cp['Path'] to the checkpoint name before saving")

create_path_if_not_exist(save_path)
full_path = os.path.join(save_path, f"{checkpoint_name}.pt")
save_model(model=model,
           optimizer=optim_fn,
           # Epochs may still be a string taken from the settings; cast it so
           # the sum is numeric.
           epoch=start_epoch + int(Epochs),
           metrics=combined_metrics,
           filepath=full_path)

Directory ../../saved_models/checkpoints already exists.
Model saved to ../../saved_models/checkpoints/CP2_1.pt
