In [1]:
# --- Run configuration: edit these values to evaluate a different trained model ---

# Name of the trained experiment. The literal is broken across lines only for
# readability: adjacent string literals are concatenated, so the value is one
# unbroken name.
experiment = (
    'patent_claim_iter26272_as128_scalar1.0_cycle-auto_prenc-start_wsTrue'
    '_lg-latent_attn_add_attn_beta1.0_reg-kld_attn_mode-none'
    '_ffn_option-parallel_ffn_enc_layer-8_dec_layer-12_zdim-512'
    '_optFalse_ftFalse_zrate-0.5_fb-1sd-42_5.24'
)

# Model and batching settings.
latent_size = 512   # size of the latent code
max_length = 400    # maximum sequence length
batch_size = 40     # documents per batch

# Sampling settings used when decoding text.
top_k = 100
top_p = 0.95

In [4]:
import os
from google.colab import drive
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/innovae-revision/innovae-adavae/adavae/src')

!pip install transformers
!pip install tensorboardX ipdb
!nvidia-smi

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m57.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m32.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m115.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86

In [5]:
import numpy as np
import pandas as pd
import seaborn as sns

from collections import defaultdict
# Import from the public scipy.stats namespace; scipy.stats.stats is a
# deprecated private module and emits a DeprecationWarning on import.
from scipy.stats import pearsonr
from matplotlib import pyplot as plt

import torch, math, argparse, copy, re
import torch.nn as nn
import torch.nn.functional as F

from tqdm import tqdm
from torch.utils.data import DataLoader
from argparse import ArgumentParser
from transformers import GPT2Tokenizer, GPT2LMHeadModel

from adapters.configuration_gpt2 import GPT2Config
from adapters.vae import GPT2Adapter, AdaVAEModel
from adapters.common import AdapterConfig
from adaVAE import compute_loss
from data import GenerationDataset, DataFrameDataset
from utils import init_para_frompretrained, tokenize, sample_sequence

  from scipy.stats.stats import pearsonr


In [6]:
# Option parser for the evaluation run. Defaults are set for single-GPU use.
parser = ArgumentParser()
_add = parser.add_argument  # shorthand; every option below is registered on `parser`

_add("--seed", type=int, default=42)

# ---- model options ----
_add('--adapter_size', type=int, default=128,
     help="Hidden size of GPT2 encoder/decoder adapter")
_add('--latent_size', type=int, default=32,
     help="Hidden size of latent code")
_add('--encoder_n_layer', type=int, default=8,
     help="attention layer number of GPT-2 encoder")
_add('--decoder_n_layer', type=int, default=12,
     help="attention layer number of GPT-2 decoder")
_add('--class_num', type=int, default=2,
     help="class number for controllable generation")
_add('--adapter_scalar', type=str, default="1.0",
     help="adapter scalar")
_add('--ffn_option', type=str, default="parallel_ffn",
     choices=['sequential', 'parallel_attn', 'parallel_ffn', 'pfeiffer'],
     help="adapter type option")
_add('--latent_gen', type=str, default="latent_attn",
     choices=['latent_attn', 'averaged_attn', 'linear', 'mean_max_linear'],
     help="method for encoder to latent space, averaged_attn for average attention from "
          "TransformerCVAE, linear for taken the first encoder token to a linear like Optimus")
_add('--attn_mode', type=str, default="none",
     choices=['prefix', 'adapter', 'lora', 'none'],
     help="attention transfer type")
_add('--reg_loss', type=str, default="kld",
     choices=['kld', 'adversarial', 'symlog'],
     help="regularization loss for latent space")

# ---- testing parameters ----
_add('--batch_size', type=int, default=128,
     help='batch size per GPU. Lists the schedule.')
_add('--max_length', type=int, default=30,
     help='max length of every input sentence')
_add('--data-dir', type=str, default='data/optimus_dataset')
_add('--out-dir', type=str, default='out')
_add('--experiment', type=str, default='out', help="ckpt dirctory")
_add('--adapter_init', type=str, default='bert',
     choices=['lora', 'bert', 'lisa', 'other'],
     help="parameter initialization method for adapter layers.")
_add('--workers', type=int, default=2, metavar='N',
     help='number of data loading workers')
_add("--total_sents", type=int, default=10,
     help="Total sentences to test recontruction/generation.")
_add("--max_test_batch", type=int, default=10,
     help="Total sentence pairs to test interpolation/analogy.")
_add("--num_interpolation_step", type=int, default=10)
_add("--degree_to_target", type=float, default=1.0)
_add("--max_val_batches", type=int, default=30,
     help="Max batch size number to test recontruction.")
_add("--latest_date", type=str, default="",
     help="Latest date for model testing.")

# ---- metrics ----
_add('--au_delta', type=float, default=0.01,
     help="threshold for activated unit calculation.")

# ---- device selection ----
_add('--gpu', type=int, default=0)
_add('--no_gpu', action="store_true")

# ---- KL cost annealing: increase beta from beta_0 to 1 in beta_warmup steps ----
_add('--beta_0', type=float, default=1.00)
_add('--beta_warmup', type=int, default=1000)
_add('--kl_rate', type=float, default=0.0)

# ---- cyclical VAE schedule ----
_add('--cycle', type=int, default=2000)

# ---- boolean triggers (all default to False) ----
_add('--load', action="store_true")
_add('--save_all', action="store_true", help="save full parameters of the model")
# Flags without help text, registered in their original order.
for _flag in ('add_input', 'add_attn', 'add_softmax', 'add_mem', 'attn_proj_vary',
              'finetune_enc', 'finetune_dec', 'weighted_sample', 'add_z2adapters',
              'learn_prior', 'test_model'):
    _add('--' + _flag, action="store_true")
_add('--do_sample', action="store_true", help="sample for reconstruction")

# Build argv as an explicit list instead of splitting a formatted string, so a
# value that contains whitespace (for example an experiment name with a space)
# stays a single argument rather than being broken into several tokens.
args = parser.parse_args([
    '--add_attn',
    '--weighted_sample',
    '--latent_size', str(latent_size),
    '--max_length', str(max_length),
    '--batch_size', str(batch_size),
    '--experiment', str(experiment),
])

In [7]:
# Seed the NumPy and PyTorch global RNGs from --seed and pick the compute device.
gpu = torch.cuda.is_available()

for _seed_fn in (np.random.seed, torch.random.manual_seed):
    _seed_fn(args.seed)

# NOTE: this RandomState is constructed without a seed, so it is not tied to args.seed.
prng = np.random.RandomState()

if gpu:
    print("There are ", torch.cuda.device_count(), " available GPUs!")
    torch.cuda.set_device(args.gpu)
    # Seed the current CUDA device, then every CUDA device.
    for _seed_fn in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        _seed_fn(args.seed)
    print('Current single GPU: {}'.format(torch.cuda.current_device()))
    device = torch.device(args.gpu)
else:
    device = torch.device("cpu")

# Pretrained GPT-2 tokenizer and language model. GPT-2 ships without a pad
# token, so the end-of-sequence token is reused for padding.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2')

config = GPT2Config()

# Adapter / latent-space configuration. Values that come from `args` follow the
# parsed options; the rest are fixed here.
# NOTE: `init`, `tune_enc` and `tune_dec` are hard-coded and do not read
# args.adapter_init / args.finetune_enc / args.finetune_dec.
_adapter_kwargs = dict(
    hidden_size=768,
    adapter_size=args.adapter_size,
    adapter_act='relu',
    adapter_initializer_range=1e-2,
    latent_size=args.latent_size,
    class_num=args.class_num,
    encoder_n_layer=args.encoder_n_layer,
    decoder_n_layer=args.decoder_n_layer,
    dis_emb=128,
    init='other',
    adapter_scalar=args.adapter_scalar,
    ffn_option=args.ffn_option,
    attn_mode=args.attn_mode,
    latent_gen=args.latent_gen,
    attn_option='none',
    mid_dim=30,
    attn_bn=25,
    prefix_dropout=0.1,
    tune_enc=False,
    tune_dec=False,
    add_z2adapters=args.add_z2adapters,
)
ada_config = AdapterConfig(**_adapter_kwargs)

AdaVAE = AdaVAEModel(
    config,
    ada_config,
    add_input=args.add_input,
    add_attn=args.add_attn,
    add_softmax=args.add_softmax,
    add_mem=args.add_mem,
    attn_proj_vary=args.attn_proj_vary,
    learn_prior=args.learn_prior,
    reg_loss=args.reg_loss,
)

# Initialise both the decoder-side transformer and the encoder from the
# pretrained GPT-2 transformer, without parameter sharing.
for _submodule in (AdaVAE.transformer, AdaVAE.encoder):
    init_para_frompretrained(_submodule, gpt2_model.transformer, share_para=False)

# The LM head reuses the pretrained GPT-2 head's weight parameter object.
AdaVAE.lm_head.weight = gpt2_model.lm_head.weight
AdaVAE.eval()

## load trained parameters
print('Loading model weights...')
ckpt_path = os.path.join("./out", args.experiment, 'model_best_val.pt')  # alternative file: model_latest.pt
# map_location places the stored tensors on `device`, so a checkpoint that was
# saved from a GPU also loads when this notebook falls back to the CPU.
state = torch.load(ckpt_path, map_location=device)

if 'module' in list(state.keys())[0]:  # model_path is data parallel model with attr 'module'
    # Strip the DataParallel 'module.' prefix in place. Iterating over a
    # snapshot of the keys keeps the loop safe while entries are popped and
    # re-inserted.
    for k in list(state.keys()):
        state[k.replace('module.', '')] = state.pop(k)

if not args.save_all:
    # The checkpoint may hold only part of the model's parameters: overlay the
    # entries it provides onto the model's current state dict. Checkpoint keys
    # the model does not have are skipped.
    model_dict = AdaVAE.state_dict()
    model_dict.update({k: v for k, v in state.items() if k in model_dict})
    AdaVAE.load_state_dict(model_dict)
else:
    # Full checkpoint: load it as-is.
    AdaVAE.load_state_dict(state)
AdaVAE = AdaVAE.to(device)

There are  1  available GPUs!
Current single GPU: 0


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Loading model weights...


In [8]:
# CSV with the patent claims and their metadata (absolute Google Drive path).
doc_path = '/content/drive/MyDrive/innovae-revision/innovae-adavae/adavae/data/optimus_dataset/Patent_claim/ai-patents-date-cpc-text.csv'

df_sample = pd.read_csv(doc_path)

# Sequential (unshuffled) loader over the whole CSV.
# NOTE(review): drop_last=True discards a final batch smaller than batch_size,
# so up to batch_size - 1 trailing rows are never yielded. Confirm this is
# intended when every document is meant to be encoded.
_eval_loader_kwargs = dict(
    batch_size=args.batch_size,
    shuffle=False,
    num_workers=args.workers,
    pin_memory=True,
    drop_last=True,
)
eval_dataloader = DataLoader(DataFrameDataset(df_sample), **_eval_loader_kwargs)

In [None]:
# Encode every batch from eval_dataloader and keep one DataFrame per batch with
# the document metadata plus the latent mean code and latent variance.
# NOTE: `all` shadows the built-in all(); the name is kept because later cells
# read this list under that name.
all = []
for batch in tqdm(eval_dataloader, desc="Reconstructing Documents:"):
    with torch.no_grad():
        x_ids, input_ids, attention_mask = tokenize(batch['text'], tokenizer, device, args)

        # Metadata columns are handed to the model call and read back from
        # outputs[-1] (presumably returned unchanged -- confirm in AdaVAEModel).
        doc_ids = tuple(
            batch[column]
            for column in ('patent_id', 'primary_claim', 'title',
                           'cpc_subclass', 'grant_date', 'priority_date')
        )
        outputs = AdaVAE(input_ids=input_ids, attention_mask=attention_mask,
                         from_mean=True, doc_ids=doc_ids, get_z_only=True)
        latent_z, latent_var, meta = outputs[-3], outputs[-2], outputs[-1]

        # Numeric metadata (ids, dates) is converted to plain Python lists;
        # the other fields are used as returned.
        all.append(pd.DataFrame({
            'patent_id': meta[0].cpu().numpy().tolist(),
            'primary_claim': meta[1],
            'title': meta[2],
            'cpc_class': meta[3],
            'grant_date': meta[4].cpu().numpy().tolist(),
            'priority_date': meta[5].cpu().numpy().tolist(),
            'latent_z': latent_z.tolist(),
            'latent_var': latent_var.tolist(),
        }))

In [10]:
# Stack the per-batch frames. ignore_index=True gives every row a unique label;
# without it each batch contributes its own 0..k-1 labels and the combined
# frame carries duplicate index values.
data = pd.concat(all, ignore_index=True)

# Expand the list-valued columns into one column per latent dimension. Building
# the frame straight from the nested lists avoids creating a Series per row
# (which is what .apply(pd.Series) does) and yields the same columns.
df_latent_z = pd.DataFrame(data['latent_z'].tolist(), index=data.index).add_prefix('latent_z_')
df_latent_var = pd.DataFrame(data['latent_var'].tolist(), index=data.index).add_prefix('latent_var_')

# Metadata columns followed by latent_z_* and latent_var_* columns.
dat = data[['patent_id', 'primary_claim', 'title', 'cpc_class', 'grant_date', 'priority_date']]
final = pd.concat([dat, df_latent_z, df_latent_var], axis=1)
#final.to_parquet('/content/drive/MyDrive/innovae-revision/processed_512_1epoch.parquet',index = False)

In [12]:
df_sample = pd.read_csv(doc_path)

# Positional split by row order: first 90% -> train pool, next 5% -> validation
# pool, last 5% -> test pool.
n_rows = len(df_sample)
val_idx = int(n_rows * 0.9)
test_idx = int(n_rows * 0.95)

# Fixed-size random subsamples of each pool (random_state=0 makes them repeatable).
df_train = df_sample.iloc[:val_idx].sample(20000, random_state=0)
df_val = df_sample.iloc[val_idx:test_idx].sample(10000, random_state=0)
df_test = df_sample.iloc[test_idx:].sample(10000, random_state=0)


def _make_loader(frame):
    """Wrap a DataFrame in an unshuffled DataLoader using the notebook's batch settings."""
    return DataLoader(
        DataFrameDataset(frame),
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True,
        drop_last=True,
    )


train_dataloader = _make_loader(df_train)
val_dataloader = _make_loader(df_val)
test_dataloader = _make_loader(df_test)

In [14]:
# Token id of the "<|endoftext|>" marker, looked up once from the tokenizer.
endoftext = tokenizer.convert_tokens_to_ids("<|endoftext|>")

def decode_sent(sent):
    """Decode the text between the first and second end-of-text markers.

    Args:
        sent: sequence of token ids supporting ``.index`` and slicing
            (callers pass a list). It must contain the end-of-text id at
            least once.

    Returns:
        The decoded string, stripped of surrounding whitespace. Everything up
        to and including the first end-of-text id is dropped; if another
        end-of-text id follows, the result is cut just before it.

    Raises:
        ValueError: if ``sent`` contains no end-of-text id.
    """
    start = sent.index(endoftext) + 1
    body = sent[start:]
    try:
        # Truncate at the next marker when there is one.
        body = body[:body.index(endoftext)]
    except ValueError:
        # No closing marker: keep the whole remainder.
        pass
    return tokenizer.decode(body).strip()

# Encode each training batch to a latent code, sample a decoded text from that
# code, and keep one DataFrame per batch.
# NOTE: `all` shadows the built-in all(); the name is kept because the export
# cell reads this list under that name.
all = []
for batch in tqdm(train_dataloader, desc="Encoding and Reconstructing all Documents:"):
    with torch.no_grad():
        x_ids, input_ids, attention_mask = tokenize(batch['text'], tokenizer, device, args)
        doc_ids = tuple(
            batch[column]
            for column in ('patent_id', 'primary_claim', 'title',
                           'cpc_subclass', 'grant_date', 'priority_date')
        )
        outputs = AdaVAE(input_ids=input_ids, attention_mask=attention_mask,
                         from_mean=True, doc_ids=doc_ids, get_z_only=True)

        # NOTE(review): z is read from outputs[-2] here, while the full-corpus
        # encoding loop in this notebook reads z from outputs[-3] and treats
        # outputs[-2] as the latent variance. Confirm which position holds z
        # when get_z_only=True; if it is -3, this loop decodes from the variance.
        latent_z = outputs[-2]
        meta = outputs[-1]

        # Sample token sequences conditioned on the latent code, then decode
        # each sequence to text.
        sents, _ = sample_sequence(AdaVAE, args.max_length, z=latent_z,
                                   batch_size=args.batch_size, top_k=top_k, top_p=top_p,
                                   device=device, sample=True, eos_token=endoftext)
        decoded = [decode_sent(token_ids) for token_ids in sents.tolist()]

        all.append(pd.DataFrame({
            'patent_id': meta[0].cpu().numpy().tolist(),
            'primary_claim': meta[1],
            'title': meta[2],
            'cpc_class': meta[3],
            'grant_date': meta[4].cpu().numpy().tolist(),
            'priority_date': meta[5].cpu().numpy().tolist(),
            'latent_z': latent_z.tolist(),
            'decoded': decoded,
        }))

Encoding and Reconstructing all Documents::   0%|          | 0/500 [00:09<?, ?it/s]


KeyboardInterrupt: ignored

In [None]:
# Stack the per-batch frames collected in `all` and write them to Drive as CSV
# (row index omitted).
decoded_frames = pd.concat(all)
decoded_frames.to_csv(
    '/content/drive/MyDrive/innovae-revision/Processed_512_2epoch.csv',
    index=False,
)