#Initialize Pre-trained AdaVAE Model

In [None]:
!pip install transformers
!pip install tensorboardX ipdb
!nvidia-smi
!pip install sentence-transformers

# change trained information here
# Name of the checkpoint directory to evaluate; it is forwarded to the argument
# parser as --experiment and joined under ./out when the weights are loaded.
experiment = 'patent_claim_iter26272_as128_scalar1.0_cycle-auto_prenc-start_wsTrue_lg-latent_attn_add_attn_beta1.0_reg-kld_attn_mode-none_ffn_option-parallel_ffn_enc_layer-8_dec_layer-12_zdim-512_optFalse_ftFalse_zrate-0.5_fb-1sd-42_5.24'
# The next three values are forwarded to the argument parser as
# --latent_size, --max_length and --batch_size respectively.
latent_size = 512
max_length = 400
batch_size = 40
# NOTE(review): top_k / top_p are not referenced in the visible part of this
# notebook - presumably decoding settings used elsewhere; confirm.
top_k = 0
top_p = 0

Tue Aug 15 21:16:29 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   30C    P0    49W / 400W |   2251MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [1]:
import os
from google.colab import drive
# Mount Google Drive (forcing a remount) so the project files are reachable from Colab.
drive.mount('/content/drive', force_remount= True)
# Work from the project's src directory; the project-local imports that follow
# (adapters, adaVAE, data, utils) presumably rely on this working directory - confirm.
os.chdir('/content/drive/MyDrive/innovae-revision/innovae-adavae/adavae/src')

import numpy as np
import pandas as pd
import seaborn as sns

import collections
from collections import defaultdict
# Import from the public namespace: `scipy.stats.stats` is a deprecated private
# module path that emits a DeprecationWarning and is scheduled for removal.
from scipy.stats import pearsonr
from matplotlib import pyplot as plt

import torch, math, argparse, copy, re
import torch.nn as nn
import torch.nn.functional as F

from tqdm import tqdm
from torch.utils.data import DataLoader
from argparse import ArgumentParser
from transformers import GPT2Tokenizer, GPT2LMHeadModel

from adapters.configuration_gpt2 import GPT2Config
from adapters.vae import GPT2Adapter, AdaVAEModel
from adapters.common import AdapterConfig
from adaVAE import compute_loss
from data import GenerationDataset, DataFrameDataset
from utils import init_para_frompretrained, tokenize, sample_sequence

# Registry of every model, evaluation and runtime option together with its default.
parser = ArgumentParser()

# Default parameters are set based on single GPU training
parser.add_argument("--seed", default=42, type=int)

## model options
parser.add_argument(
    '--adapter_size', default=128, type=int,
    help="Hidden size of GPT2 encoder/decoder adapter",
)
parser.add_argument(
    '--latent_size', default=32, type=int,
    help="Hidden size of latent code",
)
parser.add_argument(
    '--encoder_n_layer', default=8, type=int,
    help="attention layer number of GPT-2 encoder",
)
parser.add_argument(
    '--decoder_n_layer', default=12, type=int,
    help="attention layer number of GPT-2 decoder",
)
parser.add_argument(
    '--class_num', default=2, type=int,
    help="class number for controllable generation",
)
parser.add_argument(
    '--adapter_scalar', default="1.0", type=str,
    help="adapter scalar",
)
parser.add_argument(
    '--ffn_option', default="parallel_ffn", type=str,
    choices=['sequential', 'parallel_attn', 'parallel_ffn', 'pfeiffer'],
    help="adapter type option",
)
parser.add_argument(
    '--latent_gen', default="latent_attn", type=str,
    choices=['latent_attn', 'averaged_attn', 'linear', 'mean_max_linear'],
    help="method for encoder to latent space, averaged_attn for average attention from "
         "TransformerCVAE, linear for taken the first encoder token to a linear like Optimus",
)
parser.add_argument(
    '--attn_mode', default="none", type=str,
    choices=['prefix', 'adapter', 'lora', 'none'],
    help="attention transfer type",
)
parser.add_argument(
    '--reg_loss', default="kld", type=str,
    choices=['kld', 'adversarial', 'symlog'],
    help="regularization loss for latent space",
)

## testing parameters
parser.add_argument(
    '--batch_size', default=128, type=int,
    help='batch size per GPU. Lists the schedule.',
)
parser.add_argument(
    '--max_length', default=30, type=int,
    help='max length of every input sentence',
)
parser.add_argument('--data-dir', default='data/optimus_dataset', type=str)
parser.add_argument('--out-dir', default='out', type=str)
parser.add_argument('--experiment', default='out', type=str, help="ckpt dirctory")
parser.add_argument(
    '--adapter_init', default='bert', type=str,
    choices=['lora', 'bert', 'lisa', 'other'],
    help="parameter initialization method for adapter layers.",
)
parser.add_argument(
    '--workers', default=2, type=int, metavar='N',
    help='number of data loading workers',
)
parser.add_argument(
    "--total_sents", default=10, type=int,
    help="Total sentences to test recontruction/generation.",
)
parser.add_argument(
    "--max_test_batch", default=10, type=int,
    help="Total sentence pairs to test interpolation/analogy.",
)
parser.add_argument("--num_interpolation_step", default=10, type=int)
parser.add_argument("--degree_to_target", default=1.0, type=float)
parser.add_argument(
    "--max_val_batches", default=30, type=int,
    help="Max batch size number to test recontruction.",
)
parser.add_argument(
    "--latest_date", default="", type=str,
    help="Latest date for model testing.",
)

## metrics
parser.add_argument(
    '--au_delta', default=0.01, type=float,
    help="threshold for activated unit calculation.",
)

# use GPU
parser.add_argument('--gpu', default=0, type=int)
parser.add_argument('--no_gpu', action="store_true")

# KL cost annealing, increase beta from beta_0 to 1 in beta_warmup steps
parser.add_argument('--beta_0', default=1.00, type=float)
parser.add_argument('--beta_warmup', default=1000, type=int)
parser.add_argument('--kl_rate', default=0.0, type=float)

# cyc_vae parameters
parser.add_argument('--cycle', default=2000, type=int)

## trigger
parser.add_argument('--load', action="store_true")
parser.add_argument('--save_all', action="store_true", help="save full parameters of the model")
# Plain on/off switches without help text, registered in their original order.
for _switch in ('add_input', 'add_attn', 'add_softmax', 'add_mem', 'attn_proj_vary',
                'finetune_enc', 'finetune_dec', 'weighted_sample', 'add_z2adapters',
                'learn_prior', 'test_model'):
    parser.add_argument('--' + _switch, action="store_true")
del _switch  # do not leave the loop variable behind in the notebook namespace
parser.add_argument('--do_sample', action="store_true", help="sample for reconstruction")

# Build the argument list explicitly instead of splitting one formatted string on
# whitespace, so a value that contains spaces (e.g. an experiment directory name)
# cannot be broken into several tokens.
args = parser.parse_args([
    '--add_attn',
    '--weighted_sample',
    '--latent_size', str(latent_size),
    '--max_length', str(max_length),
    '--batch_size', str(batch_size),
    '--experiment', str(experiment),
])

# Set random seed
gpu = torch.cuda.is_available()
np.random.seed(args.seed)
# Seed the private generator too; an unseeded RandomState is initialised from
# OS entropy and would make any draw from `prng` irreproducible.
prng = np.random.RandomState(args.seed)
torch.random.manual_seed(args.seed)

if gpu:
    print("There are ", torch.cuda.device_count(), " available GPUs!")
    torch.cuda.set_device(args.gpu)
    torch.cuda.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    print('Current single GPU: {}'.format(torch.cuda.current_device()))
# Selected GPU index when CUDA is available, CPU otherwise.
device = torch.device(args.gpu if gpu else "cpu")

# Load model and trained weights
# The stock GPT-2 checkpoint provides the tokenizer and the weights copied into AdaVAE below.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # padding reuses the EOS token

config = GPT2Config()
ada_config = AdapterConfig(
    # sizes and depth
    hidden_size=768,
    adapter_size=args.adapter_size,
    latent_size=args.latent_size,
    class_num=args.class_num,
    encoder_n_layer=args.encoder_n_layer,
    decoder_n_layer=args.decoder_n_layer,
    dis_emb=128,
    mid_dim=30,
    attn_bn=25,
    # adapter behaviour
    adapter_act='relu',
    adapter_initializer_range=1e-2,
    adapter_scalar=args.adapter_scalar,
    init='other',
    ffn_option=args.ffn_option,
    attn_mode=args.attn_mode,
    attn_option='none',
    latent_gen=args.latent_gen,
    prefix_dropout=0.1,
    add_z2adapters=args.add_z2adapters,
    # tuning switches are fixed to False here regardless of the command-line flags
    tune_enc=False,
    tune_dec=False,
)

AdaVAE = AdaVAEModel(
    config,
    ada_config,
    add_input=args.add_input,
    add_attn=args.add_attn,
    add_softmax=args.add_softmax,
    add_mem=args.add_mem,
    attn_proj_vary=args.attn_proj_vary,
    learn_prior=args.learn_prior,
    reg_loss=args.reg_loss,
)

## load pre-trained weights
# Both the decoder-side transformer and the encoder start from GPT-2's transformer;
# the LM head shares GPT-2's output weight tensor.
for target_module in (AdaVAE.transformer, AdaVAE.encoder):
    init_para_frompretrained(target_module, gpt2_model.transformer, share_para=False)
AdaVAE.lm_head.weight = gpt2_model.lm_head.weight
AdaVAE.eval()

## load trained parameters
print('Loading model weights...')
# Load onto the CPU first so a checkpoint written from a GPU process can be read on
# any machine; the model is moved to `device` once the weights have been copied in.
state = torch.load(os.path.join("./out", args.experiment, 'model_best_val.pt'),
                   map_location='cpu')

# A checkpoint saved from a data-parallel wrapper prefixes every key with 'module.'.
# Strip only that leading prefix: str.replace would also rewrite keys that merely
# contain the substring (e.g. 'encoder.submodule.weight'), and checking only the
# first key fails on an empty state dict.
dp_prefix = 'module.'
if any(k.startswith(dp_prefix) for k in state):
    for k in list(state):  # snapshot of the keys: the dict is mutated in the loop
        if k.startswith(dp_prefix):
            state[k[len(dp_prefix):]] = state.pop(k)

if not args.save_all:
    # Overlay the checkpoint entries that the model knows about on top of the current
    # weights (presumably the checkpoint holds only the trained subset - confirm).
    model_dict = AdaVAE.state_dict()
    additional_dict = {k: v for k, v in state.items() if k in model_dict}
    model_dict.update(additional_dict)
    AdaVAE.load_state_dict(model_dict)
else:
    AdaVAE.load_state_dict(state)

AdaVAE = AdaVAE.to(device)

Mounted at /content/drive


  from scipy.stats.stats import pearsonr


ModuleNotFoundError: ignored

#Vectorized Database Utils

In [None]:
#training utils
from torch.utils.data import DataLoader
from sentence_transformers import losses, util,models
from sentence_transformers import LoggingHandler, SentenceTransformer, evaluation
from sentence_transformers.readers import InputExample
import logging
from datetime import datetime
import csv
import os
import random
import pandas as pd
from sentence_transformers import util

import torch
from torch import nn, Tensor
from typing import Iterable, Dict
from sentence_transformers import util
from sentence_transformers.util import batch_to_device

#model utils
import json
import logging
import os
import shutil
import stat
from collections import OrderedDict
from typing import List, Dict, Tuple, Iterable, Type, Union, Callable, Optional
import requests
import numpy as np
from numpy import ndarray
import transformers
from huggingface_hub import HfApi, HfFolder, Repository, hf_hub_url, cached_download
import torch
from torch import nn, Tensor, device
from torch.optim import Optimizer
from torch.utils.data import DataLoader
import torch.multiprocessing as mp
from tqdm.autonotebook import trange
import math
import queue
import tempfile
from distutils.dir_util import copy_tree

##fine-tuning utils adapted from sentence-transformers

In [None]:
def smart_batching_collate(batch):
    """
    Collate a list of examples into tokenized inputs plus a label tensor.

    Every element of ``batch`` must expose ``texts`` (a sequence of strings, read
    with the length of the first example's ``texts``) and ``label``. Texts are
    regrouped by position (all first texts together, all second texts together, ...)
    and each group is tokenized with the module-level ``tokenizer``, padded and
    truncated to ``args.max_length``.

    :param batch: non-empty list of examples exposing ``texts`` and ``label``
    :return: ``(sentence_features, labels)`` - one tokenizer output per text
             position, and a tensor holding exactly one label per example
    """
    num_texts = len(batch[0].texts)
    texts = [[] for _ in range(num_texts)]
    labels = []
    for example in batch:
        for idx, text in enumerate(example.texts):
            texts[idx].append(text)
        # One label per example: appending inside the loop above would repeat
        # the label once for every text of the example.
        labels.append(example.label)
    labels = torch.tensor(labels)
    sentence_features = [
        tokenizer(column, return_tensors='pt', padding=True, truncation=True,
                  max_length=args.max_length)
        for column in texts
    ]
    return sentence_features, labels

class MultipleNegativesRankingLoss(nn.Module):
    """
    Ranking loss with in-batch negatives computed on the wrapped model's embeddings.

    The first group of texts acts as anchors and all remaining groups as candidates;
    anchor ``i`` is trained to score highest against candidate ``i``.
    """

    def __init__(self, model, scale = 20.0, similarity_fct = util.cos_sim):
        """
        :param model: module called with the tokenizer output plus ``from_mean``, ``doc_ids``,
                      ``get_z_only`` and ``concat_z_var``; it must return one embedding per input
        :param scale: factor applied to the similarity scores before the cross-entropy
        :param similarity_fct: pairwise similarity between two embedding batches. By default, cos_sim.
                               Can also be set to dot product (and then set scale to 1)
        """
        super().__init__()
        self.model = model
        self.scale = scale
        self.similarity_fct = similarity_fct
        self.cross_entropy_loss = nn.CrossEntropyLoss()

    def forward(self, sentence_features, labels):
        """
        :param sentence_features: list of tokenizer outputs, one per text position
        :param labels: ignored; the targets are rebuilt from the batch positions
        :return: cross-entropy of the scaled similarity matrix against the diagonal targets
        """
        reps = []
        for feature in sentence_features:
            embedding = self.model(**feature, from_mean=True, doc_ids = None,
                                   get_z_only = True, concat_z_var = True)
            reps.append(embedding)

        embeddings_a, embeddings_b = reps[0], torch.cat(reps[1:])
        scores = self.similarity_fct(embeddings_a, embeddings_b) * self.scale

        # Example a[i] should match with b[i]
        targets = torch.tensor(range(scores.size(0)), dtype=torch.long, device=scores.device)
        return self.cross_entropy_loss(scores, targets)

def fit(train_objectives,
        epochs,
        steps_per_epoch = None,
        warmup_steps = 10000,
        optimizer_class = torch.optim.AdamW,
        optimizer_params = None,
        weight_decay = 0.01,
        evaluation_steps = 0,
        max_grad_norm = 1,
        show_progress_bar = True,
        device = 'cuda'):
    """
    Train one or more (dataloader, loss model) objectives in round-robin fashion.

    Each step takes one batch per objective, back-propagates its loss, clips the
    gradient norm and applies that objective's own optimizer.

    :param train_objectives: list of ``(DataLoader, loss_model)`` tuples; every
        dataloader gets ``smart_batching_collate`` installed as its collate function
    :param epochs: number of passes of ``steps_per_epoch`` steps
    :param steps_per_epoch: steps per epoch; ``None`` or 0 means the length of the
        shortest dataloader. Exhausted dataloaders are restarted.
    :param warmup_steps: accepted for signature compatibility; no learning-rate
        scheduler is created, so it currently has no effect
    :param optimizer_class: optimizer constructor
    :param optimizer_params: keyword arguments for the optimizer; defaults to
        ``{'lr': 5e-5}``
    :param weight_decay: weight decay for all parameters except biases and
        LayerNorm parameters, which are never decayed
    :param evaluation_steps: accepted for signature compatibility; no evaluator is
        run, so it currently has no effect
    :param max_grad_norm: gradient-norm clipping threshold
    :param show_progress_bar: show tqdm progress bars when True
    :param device: device the loss models and batches are moved to
    """
    if optimizer_params is None:  # avoid sharing one mutable default dict across calls
        optimizer_params = {'lr': 5e-5}

    dataloaders = [dataloader for dataloader, _ in train_objectives]
    loss_models = [loss for _, loss in train_objectives]

    # Use smart batching
    for dataloader in dataloaders:
        dataloader.collate_fn = smart_batching_collate

    for loss_model in loss_models:
        loss_model.to(device)

    if not steps_per_epoch:
        steps_per_epoch = min(len(dataloader) for dataloader in dataloaders)

    # Prepare optimizers: one per objective, with two parameter groups.
    no_decay = ('bias', 'LayerNorm.bias', 'LayerNorm.weight')
    optimizers = []
    for loss_model in loss_models:
        decay_params, no_decay_params = [], []
        for name, param in loss_model.named_parameters():
            if any(nd in name for nd in no_decay):
                no_decay_params.append(param)
            else:
                decay_params.append(param)

        optimizer_grouped_parameters = [
            {'params': decay_params, 'weight_decay': weight_decay},
            # Biases / LayerNorm parameters are exempt from decay; giving this group
            # a non-zero value would make the split pointless.
            {'params': no_decay_params, 'weight_decay': 0.0},
        ]
        optimizers.append(optimizer_class(optimizer_grouped_parameters, **optimizer_params))

    data_iterators = [iter(dataloader) for dataloader in dataloaders]
    num_train_objectives = len(train_objectives)

    for epoch in trange(epochs, desc="Epoch", disable=not show_progress_bar):
        for loss_model in loss_models:
            loss_model.zero_grad()
            loss_model.train()

        for _ in trange(steps_per_epoch, desc="Iteration", smoothing=0.05, disable=not show_progress_bar):
            for train_idx in range(num_train_objectives):
                loss_model = loss_models[train_idx]
                optimizer = optimizers[train_idx]

                try:
                    data = next(data_iterators[train_idx])
                except StopIteration:
                    # Dataloader exhausted: restart it and keep going.
                    data_iterators[train_idx] = iter(dataloaders[train_idx])
                    data = next(data_iterators[train_idx])

                features, labels = data
                labels = labels.to(device)
                features = [batch_to_device(batch, device) for batch in features]

                loss_value = loss_model(features, labels)
                loss_value.backward()
                print(f'current loss is {loss_value}')
                torch.nn.utils.clip_grad_norm_(loss_model.parameters(), max_grad_norm)
                optimizer.step()
                optimizer.zero_grad()

##finetune the innovae encoder for vectorized documents

In [None]:
# Each source is shuffled with a fixed seed, the two are merged, and the merged
# frame is shuffled again before the positional train / test split.
data_grant = (
    pd.read_csv('/content/drive/MyDrive/innovae-revision/innovae-adavae/data/patent_grant_cleanded_new.csv')
    .sample(frac=1, random_state=10)
    .reset_index(drop=True)
)
data_pregrant = (
    pd.read_csv('/content/drive/MyDrive/innovae-revision/innovae-adavae/data/patent_pregrant_cleanded_new.csv')
    .sample(frac=1, random_state=10)
    .reset_index(drop=True)
)
total_data = (
    pd.concat([data_grant, data_pregrant])
    .sample(frac=1, random_state=10)
    .reset_index(drop=True)
)
train_data = total_data.iloc[:145000]
test_data = total_data.iloc[145000:]

######### Read train data  ##########
# Every row contributes the pair in both directions, each labelled 1.
train_samples = []
for prior_text, new_text in zip(train_data['prior_text'], train_data['new_text']):
    train_samples.append(InputExample(texts=[prior_text, new_text], label=1))
    train_samples.append(InputExample(texts=[new_text, prior_text], label=1))

# Step 1: freeze the whole model.
for param in AdaVAE.parameters():
    param.requires_grad = False

# Step 2: re-enable gradients for every encoder parameter whose name contains at
# least one of these substrings.
# NOTE(review): the single digits match anywhere in a name (a name such as 'ln_1'
# or a two-digit index would match too), so the selection may be broader than
# "layers 0-7" - confirm this is intended.
trained = ['wte','wpe','0','1','2','3','4','5','6','7','ln_f','LatentAttention','mean','logvar']
for name, param in AdaVAE.encoder.named_parameters():
    if not any(element in name for element in trained):
        continue
    param.requires_grad = True

# One epoch over the shuffled pairs, 40 examples per batch; incomplete final batch dropped.
num_epochs = 1
train_batch_size = 40
train_dataloader = DataLoader(
    train_samples,
    batch_size=train_batch_size,
    shuffle=True,
    drop_last = True,
    collate_fn=smart_batching_collate,
)
train_loss = MultipleNegativesRankingLoss(AdaVAE)

# Run the fine-tuning loop with its default optimizer settings.
fit(epochs=num_epochs, train_objectives=[(train_dataloader, train_loss)])

# Snapshot of every model parameter keyed by its name; the requires_grad filter is
# disabled, so frozen parameters are included as well.
save_orderdict = collections.OrderedDict(AdaVAE.named_parameters())

# Writing the snapshot to Drive is currently switched off:
#torch.save(save_orderdict, os.path.join('/content/drive/MyDrive/AdaVAE_for_Articles/adavae/all_layereencoder_epoch_aligned.pt'))

#Vectorized Database for Patents

##vectorize all patents and store them in the database

In [None]:
!pip install chromadb

from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions
import pickle
import re

# Create the Chroma client, configured with the duckdb+parquet implementation and a
# persist directory on Google Drive.
# NOTE(review): `chroma_db_impl="duckdb+parquet"` looks like the legacy Chroma
# configuration style while the install in this cell is unpinned - confirm that the
# installed chromadb version still accepts these settings.
client = chromadb.Client(Settings(
    chroma_db_impl="duckdb+parquet",
    persist_directory="/content/drive/MyDrive/innovae-revision/innovae-adavae/results/patent_vectorized_database/to/persist/directory" # Optional, defaults to .chromadb/ in the current directory
))

 def emb_fn(text):
  x_ids, input_ids, attention_mask = tokenize(text, tokenizer, 'cuda', args)
  outputs = AdaVAE(input_ids=input_ids, attention_mask=attention_mask, from_mean=True,doc_ids = None,get_z_only = True,concat_z_var = True)
  return outputs.cpu().detach().numpy().tolist()

# Start from a clean slate: drop the previous "patent_collection" and create it again.
# NOTE(review): delete_collection presumably raises when the collection does not
# exist yet (e.g. on the very first run) - confirm and guard the call if so.
client.delete_collection(name="patent_collection")
# emb_fn is registered as the collection's embedding function; the metadata entry
# requests the cosine space for the hnsw index.
collection = client.create_collection(name="patent_collection",embedding_function=emb_fn, metadata={"hnsw:space": "cosine"})

# Prior-art records to index: rows from position 50000 onwards with all four fields
# present, one row per prior_art_no. The explicit copy makes the frame independent
# of total_data, so the column assignment below cannot trigger a chained-assignment
# warning.
test_patent = (
    total_data.iloc[50000:][['prior_art_no','prior_art_priority_date','prior_art_subclass_id','prior_text']]
    .dropna()
    .drop_duplicates(subset = ['prior_art_no'])
    .copy()
)
# Dates are stored as strings in the metadata; converting once is sufficient.
test_patent['prior_art_priority_date'] = test_patent['prior_art_priority_date'].astype(str)

docs = test_patent['prior_text'].tolist()
# Split each subclass id on '/' into its first and second component.
category = pd.DataFrame([i.split('/') for i in test_patent['prior_art_subclass_id']])
category1 = category[0].tolist()
category2 = category[1].tolist()
date = test_patent['prior_art_priority_date'].tolist()
metas = [{'date':d,'cat1':cat1,'cat2':cat2} for d,cat1,cat2 in zip(date, category1, category2)]
ids = [str(i) for i in test_patent['prior_art_no'].tolist()]

# Batches of raw document strings, kept in their original order so that the
# embeddings line up with docs / metas / ids.
val_dataloader = DataLoader(docs,
                            batch_size=args.batch_size,
                            pin_memory=True,
                            num_workers=args.workers,
                            shuffle = False)

# The fine-tuning loop leaves the model in train mode; switch to eval mode so that
# train-only layers such as dropout do not add noise to the stored embeddings.
AdaVAE.eval()

results = []
with torch.no_grad():
    for batch in tqdm(val_dataloader, desc="Reconstructing Documents:"):
        x_ids, input_ids, attention_mask = tokenize(batch, tokenizer, 'cuda', args)
        outputs = AdaVAE(input_ids= input_ids,
                         attention_mask= attention_mask,
                         from_mean=True,
                         doc_ids = None,
                         get_z_only = True,
                         concat_z_var = True)
        # One embedding (as a plain list) per document of the batch.
        results.extend(outputs.cpu().numpy().tolist())

##test information retrieval accuracy

In [None]:
# Store every document together with its precomputed embedding, metadata and id.
collection.add(
    documents=docs,
    embeddings = results,
    metadatas=metas,
    ids=ids
)

# Retrieval check: query with each row's new_text, restricted to entries whose
# 'cat1' metadata equals the first component of the row's subclass id, and count a
# hit when the row's prior_art_no is among the 30 returned ids.
count = 0  # queries whose expected prior art was returned
total = 0  # queries issued so far
# NOTE(review): `new` is not defined anywhere in the visible notebook - presumably a
# DataFrame of held-out rows with 'prior_art_subclass_id', 'new_text' and
# 'prior_art_no' columns; confirm where it comes from.
for idx,row in new.iterrows():
  cat = row['prior_art_subclass_id'].split('/')[0]
  result = collection.query(
    query_texts=[row['new_text']],
    n_results=30,
    where={"cat1": cat})
  total += 1
  # result['ids'][0] holds the ids returned for the single query text.
  if str(row['prior_art_no']) in result['ids'][0]:
    count += 1
  # Running totals, printed after every query.
  print(total,count)