In [2]:
import torch
from torch import nn, optim
import torchvision as vis
from torch.utils import data
from torch.nn.utils.rnn import *
import numpy as np
from tqdm import tqdm
import math

import os, sys

# Platform / accelerator detection shared by the rest of the notebook.
is_windows = sys.platform == "win32"
has_cuda = torch.cuda.is_available()
# has_cuda = False  # uncomment to force CPU execution

# Report the hardware in use, then build the matching torch device.
print(torch.cuda.get_device_name(0) if has_cuda else "CPU")
device = torch.device("cuda:0") if has_cuda else torch.device("cpu")

Tesla T4


In [5]:
from model_helper import ModelSaver, load_model, model_store, StoredModel
from dataset.triplet_dataset import TextTripletDataset

In [11]:
# Triplet dataset built from the training split and the transcription folder.
# NOTE(review): the meaning of the third positional argument (300) is not
# visible here -- presumably a length/size limit; confirm against
# TextTripletDataset.
train_split_csv = "subset_splits/train_set.csv"
transcript_dir = "/home/ubuntu/data/transcriptions"
train_dataset = TextTripletDataset(train_split_csv, transcript_dir, 300)

# The validation split is currently disabled.
# validation_dataset = TextTripletDataset(
#   "subset_splits/val_set.csv",
#   "/home/ubuntu/data/transcriptions")

In [12]:
# DataLoader settings: large batches with worker processes on GPU, a small
# single-process batch on CPU.
if has_cuda:
  # No worker processes on Windows (presumably to avoid multiprocessing
  # problems there -- confirm); otherwise two workers per torch thread.
  worker_count = 0 if is_windows else torch.get_num_threads() * 2
  train_dataloader_args = {"batch_size": 256, "num_workers": worker_count}
else:
  train_dataloader_args = {"batch_size": 64}
train_dataloader_args.update(shuffle=True,
                             collate_fn=TextTripletDataset.collate_fn)

# validation_dataloader_args = train_dataloader_args.copy()
# validation_dataloader_args["shuffle"] = False

train_dataloader = data.DataLoader(train_dataset, **train_dataloader_args)
# validation_dataloader = data.DataLoader(validation_dataset, **validation_dataloader_args)

In [13]:
from model.language_model import LSTMLanguageModel, get_last_element

In [6]:
# Resume an earlier run: load_model hands back the starting epoch together
# with the model, optimizer, scheduler and criterion.
# NOTE(review): the third argument (18) appears to select the checkpoint
# epoch -- the recorded output reports "epoch_18"; confirm against load_model.
# NOTE(review): a later cell rebinds model_id / model / epoch_start for a
# fresh "lang_model_02" run, so only one of the two cells should be executed.
model_id = "lang_model_01"
restored = load_model(model_id, device, 18)
epoch_start, model, optimizer, scheduler, criterion = restored

resuming from last checkpoint epoch_18


In [14]:
# Start a brand-new run instead of resuming from a checkpoint.
model_id = "lang_model_02"
epoch_start = 1  # the training-setup cell treats 1 as "new model"

# NOTE(review): 49408 is presumably the vocabulary size of the tokenizer used
# by the dataset -- confirm before changing it.
model = LSTMLanguageModel(vocab_size=49408)
model.to(device)

# Show the architecture for the record.
print(model)

LSTMLanguageModel(
  (embedding): Embedding(49408, 400)
  (rnn): Sequential(
    (0): LSTM(400, 1500)
    (1): LockedDropout(p=0.2)
    (2): LSTM(1500, 1500)
    (3): LockedDropout(p=0.2)
    (4): LSTM(1500, 400)
  )
  (word_prob): Identity()
)


In [7]:
import os

# Per-model output directory inside the shared model store.
model_dir = f"{model_store}/{model_id}"
try:
  # makedirs (rather than mkdir) also creates a missing model_store parent.
  os.makedirs(model_dir)
except FileExistsError:
  # Only "already exists" is benign. Any other failure (permissions, invalid
  # path, ...) now propagates instead of being reported as "Directory exists".
  print("WARN: Directory exists")

# save model summary to a txt file
with open(f"{model_dir}/model_spec.txt", "w") as file:
  file.write(str(model) + "\n")

In [15]:
def run_model(model, dataloader):
  """Run one full pass over ``dataloader`` and return the mean triplet loss.

  The mode is read from ``model.training``. In training mode every batch
  also does a backward pass and an optimizer step through the gradient
  scaler; otherwise only the loss is computed. Switching the model between
  train/eval mode is left to the caller.

  Uses these notebook-level names: ``device``, ``criterion``, ``optimizer``,
  ``scaler``, ``get_last_element`` and ``tqdm``.

  Args:
    model: network applied to each token batch; item ``[0]`` of its output
      is passed to ``get_last_element``.
    dataloader: yields ``((anchor, anchor_len), (positive, positive_len),
      (negative, negative_len))`` per batch.

  Returns:
    dict with a single key ``"loss"``: the sum of the per-batch losses
    divided by ``len(dataloader)``.
  """
  def embed(tokens, lengths):
    # slice off the special sos token; "- 2" discounts the sos and eos tokens
    return get_last_element(model(tokens[:, 1:])[0], lengths - 2)

  running_loss = 0.0
  progress = tqdm(dataloader, desc="Train" if model.training else "Eval ")

  for step, triplet in enumerate(progress, start=1):
    (anchor, anchor_len), (positive, positive_len), (negative, negative_len) = triplet
    anchor, positive, negative = (
      tokens.to(device) for tokens in (anchor, positive, negative)
    )

    if model.training:
      optimizer.zero_grad()  # clear gradients left over from the previous step

    # Forward pass under mixed precision; anchor, positive and negative are
    # embedded in that order.
    with torch.cuda.amp.autocast():
      loss = criterion(embed(anchor, anchor_len),
                       embed(positive, positive_len),
                       embed(negative, negative_len))

    if model.training:
      # scaled backprop + optimizer step
      scaler.scale(loss).backward()
      scaler.step(optimizer)
      scaler.update()

    # accumulate and show the running mean over the batches seen so far
    running_loss += loss.item()
    progress.set_postfix(loss=f"{running_loss / step:.6f}")

  # average across all batches of the dataloader
  return {"loss": running_loss / len(dataloader)}

In [16]:
from torch import optim
from itertools import chain

num_epochs = 50

if epoch_start != 1:
  # Resumed run: criterion / optimizer / scheduler are expected to exist
  # already (they are returned by load_model when a checkpoint is loaded).
  print("Existing Model")
else:
  # Fresh run: the training objects are defined only here, at the start.
  print("New Model")

  regularization = 3e-5
  learning_rate = 1e-3
  criterion = nn.TripletMarginLoss(margin=0.2)
  # criterion = nn.TripletMarginWithDistanceLoss(distance_function=nn.CosineSimilarity(), margin=0.1)
  trainable_params = chain(model.parameters(), criterion.parameters())
  optimizer = optim.Adam(trainable_params,
                         lr=learning_rate,
                         weight_decay=regularization)
  # Halve the learning rate every 6 scheduler steps.
  scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=6, gamma=0.5)

scaler = torch.cuda.amp.GradScaler()  # mix-precision training
model_saver = ModelSaver(model_id, mode="min", regular_save_interval=5)

# Record the training configuration next to the checkpoints.
param_lines = [
  f"num_epochs = {num_epochs}\n",
  f"criterion = {criterion}\n",
  f"optimizer = {optimizer}\n",
  f"scheduler = {type(scheduler).__name__}({scheduler.state_dict()})\n",
]
with open(f"{model_store}/{model_id}/training_params.txt", "w") as file:
  file.writelines(param_lines)

New Model


In [None]:
from tqdm import tqdm
import sys
import json

print(f"Model: {model_id}. Training for {num_epochs} epochs", file=sys.stderr)

for epoch in range(epoch_start, num_epochs + 1):
  print(f"Epoch {epoch}", file=sys.stderr)

  # training pass
  model.train()
  train_stats = run_model(model, train_dataloader)
  epoch_loss = train_stats["loss"]

  # evaluation pass (currently disabled)
  # model.eval()
  # with torch.no_grad():
  #   eval_stats = run_model(model, validation_dataloader)

  # let scheduler know it's the next epoch
  # scheduler.step(eval_stats["accuracy"])

  # Read the learning rate used for this epoch before the scheduler
  # advances it.
  current_lr = optimizer.param_groups[0]["lr"]
  stats = {
    "epoch": epoch,
    "train_stats": train_stats,
    # "eval_stats": eval_stats,
    "learning rate": current_lr
  }

  scheduler.step()

  if not math.isnan(epoch_loss):
    # The epoch loss is the metric handed to the saver (created with
    # mode="min").
    model_saver.save(StoredModel(model, optimizer, scheduler, criterion), stats, epoch_loss)
  else:
    # A NaN loss ends training; nothing is saved for this epoch.
    print("NaN loss detected! Stop Training")
    break

## Extract CLIP text embeddings

In [3]:
import clip
# pip install git+https://github.com/openai/CLIP.git
# clip.load returns a (model, preprocess) pair; only the model is used here.
# The second item is bound to a named throwaway rather than `_`: assigning to
# `_` at notebook top level shadows IPython's "last output" variable, which
# then stops being updated for the rest of the session.
clip_model, _clip_preprocess = clip.load("ViT-B/32", device=device)

In [7]:
import pandas as pd
def read_texts(split, transcript_base):
  """Collect the first line of every transcript listed in a split CSV.

  Args:
    split: path of a CSV file whose rows unpack into exactly two values,
      used as ``(cat, vid)``.
    transcript_base: root folder; each transcript is read from
      ``{transcript_base}/{cat}/{vid}.txt``.

  Returns:
    list of str, one entry per CSV row in file order, each being the
    whitespace-stripped first line of the corresponding transcript.
  """
  def first_line(cat, vid):
    # Only the first line of the file is used.
    with open(f"{transcript_base}/{cat}/{vid}.txt") as handle:
      return handle.readline().strip()

  rows = pd.read_csv(split)
  return [first_line(cat, vid) for _, (cat, vid) in rows.iterrows()]

In [8]:
# Transcript text for every split; all three share one transcription folder.
transcript_root = "/home/ubuntu/data/transcriptions"
train_text = read_texts("subset_splits/train_set.csv", transcript_root)
val_text = read_texts("subset_splits/val_set.csv", transcript_root)
test_text = read_texts("subset_splits/test_set.csv", transcript_root)

In [20]:
from dataset.triplet_dataset import tokenize
def get_embeddings(text_list, batch_size=512):
  """Encode texts with the CLIP text encoder, one batch at a time.

  Uses the notebook-level ``tokenize``, ``clip_model`` and ``device``.

  Args:
    text_list: sequence of strings to embed.
    batch_size: number of texts tokenized and encoded per step.

  Returns:
    A single tensor: the per-batch ``encode_text`` outputs concatenated with
    ``torch.cat`` in input order.
  """
  # ceil(len(text_list) / batch_size), computed with integer arithmetic
  batch_count = -(-len(text_list) // batch_size)

  def encode(batch_index):
    start = batch_index * batch_size
    # tokenize returns a pair; only the token tensor is needed here
    tokens, _ = tokenize(text_list[start:start + batch_size])
    return clip_model.encode_text(tokens.to(device))

  # Inference only: no gradients are tracked.
  with torch.no_grad():
    chunks = [encode(batch_index) for batch_index in range(batch_count)]

  return torch.cat(chunks)

In [31]:
# CLIP text embeddings for the whole training split (default batch size).
t = get_embeddings(text_list=train_text)

In [32]:
# Create the output folder first: torch.save fails when the target directory
# is missing, which would throw away the embedding pass that produced `t`.
import os
os.makedirs("clip_embeddings", exist_ok=True)
# NOTE(review): `t` is saved as returned by get_embeddings, presumably still
# on `device`; loading it on a CPU-only machine would then need map_location.
torch.save(t, "clip_embeddings/train_text.pth")