In [1]:
import pandas as pd
# Show full cell contents when displaying DataFrames (no column-width truncation).
# A single setting is enough: a preceding `max_colwidth = 100` assignment would be
# overridden immediately by this call.
pd.set_option('display.max_colwidth', None)

In [2]:
from tqdm import tqdm

In [3]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import BertModel, BertTokenizer

In [4]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

Using cuda device


# Natural language - load the data


In [3]:

# Read the train / validation / test splits, one JSON file per split.
df_train_final, df_val_final, df_test_final = map(
    pd.read_json,
    ("train_final.json", "val_final.json", "test_final.json"),
)

In [11]:
# create a dataset with the dataframe

class CustomDataset(Dataset):
    """Map-style dataset that serves (context, query, label) triples from a DataFrame.

    Columns are picked by position: column 0 is the query, column 1 the context
    and column 2 the label.
    """

    def __init__(self, dataframe):
        # Positional lookup; the resulting Series keep the DataFrame's index,
        # which is why items are fetched with .iloc in __getitem__.
        self.query, self.context, self.labels = (
            dataframe.iloc[:, position] for position in range(3)
        )

    def __len__(self):
        """Number of rows, taken from the query column."""
        return self.query.shape[0]

    def __getitem__(self, idx):
        """Return the (context, query, label) tuple stored at positional index ``idx``."""
        columns = (self.context, self.query, self.labels)
        return tuple(column.iloc[idx] for column in columns)

In [27]:
# Wrap each split in a CustomDataset ...
data_set_train, data_set_val, data_set_test = (
    CustomDataset(frame)
    for frame in (df_train_final, df_val_final, df_test_final)
)

# ... and batch it in a fixed (unshuffled) order, 128 rows at a time.
train_loader, val_loader, test_loader = (
    DataLoader(split, batch_size=128, shuffle=False)
    for split in (data_set_train, data_set_val, data_set_test)
)

# extract features

In [8]:
# load model 

# bert-base-cased tokenizer and encoder; the encoder is moved to `device` and
# gradient tracking is switched off for all of its parameters.
tokenizer_bert = BertTokenizer.from_pretrained("bert-base-cased")
model_bert = (
    BertModel.from_pretrained("bert-base-cased")
    .to(device)
    .requires_grad_(False)
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [20]:
def get_rep(r, is_context):
    """Encode a batch of text with BERT and return one hidden vector per input.

    Args:
        r: Text input accepted by ``tokenizer_bert`` (callers pass the batch of
            strings yielded by the DataLoader).
        is_context: If True, use the hidden state at position 0 (the [CLS] token);
            otherwise use position 1, i.e. the first token after [CLS].

    Returns:
        A 2-D tensor (batch, hidden) on ``device``, sliced from ``last_hidden_state``.
    """
    r_inputs = tokenizer_bert(r, return_tensors="pt", truncation=True, padding=True).to(device)

    # Pure feature extraction: run the forward pass without autograd so that no
    # graph is retained, independent of whether the model parameters were frozen.
    with torch.no_grad():
        r_op = model_bert(**r_inputs)

    token_id = 0 if is_context else 1
    return r_op.last_hidden_state[:, token_id, :]

In [21]:
def get_all_reps(data_loader):
    """Run every batch of ``data_loader`` through ``get_rep`` and stack the results.

    Each batch must unpack into (context, query, labels); the labels are ignored.

    Returns:
        (context_reps, query_reps): CPU tensors built by vertically stacking the
        per-batch representations in iteration order.
    """
    per_batch_context, per_batch_query = [], []

    for contexts, queries, _ in tqdm(data_loader):
        # Move each batch's result to the CPU before storing it.
        per_batch_context.append(get_rep(contexts, is_context=True).cpu())
        per_batch_query.append(get_rep(queries, is_context=False).cpu())

    return torch.vstack(per_batch_context), torch.vstack(per_batch_query)


In [28]:
# Context and query representations for the training split.
train_context_reps, train_query_reps = get_all_reps(data_loader=train_loader)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 553/553 [08:03<00:00,  1.14it/s]


In [29]:
# Context and query representations for the validation split.
val_context_reps, val_query_reps = get_all_reps(data_loader=val_loader)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 143/143 [02:03<00:00,  1.16it/s]


In [30]:
# Context and query representations for the test split.
test_context_reps, test_query_reps = get_all_reps(data_loader=test_loader)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 301/301 [04:12<00:00,  1.19it/s]


In [34]:
# Persist the context representations, one file per split.
for _reps, _path in (
    (train_context_reps, 'train_context_bert_reps.pt'),
    (val_context_reps, 'val_context_bert_reps.pt'),
    (test_context_reps, 'test_context_bert_reps.pt'),
):
    torch.save(_reps, _path)

In [36]:
# Persist the query representations, one file per split.
for _reps, _path in (
    (train_query_reps, 'train_query_bert_reps.pt'),
    (val_query_reps, 'val_query_bert_reps.pt'),
    (test_query_reps, 'test_query_bert_reps.pt'),
):
    torch.save(_reps, _path)

In [38]:
# Persist the labels (in DataFrame row order) as tensors, one file per split.
for _frame, _path in (
    (df_train_final, 'train_label.pt'),
    (df_val_final, 'val_label.pt'),
    (df_test_final, 'test_label.pt'),
):
    torch.save(torch.tensor(_frame['label'].values), _path)

In [37]:
# Sanity check: (number of rows, hidden size) of the stacked training query representations.
train_query_reps.shape

torch.Size([70749, 768])

## Synthetic dataset 

In [5]:
# Read the synthetic train / validation / test splits, one JSON file per split.
# NOTE: this rebinds the df_*_final names used for the natural-language data.
df_train_final, df_val_final, df_test_final = map(
    pd.read_json,
    ("no_overlap_syn_train.json", "no_overlap_syn_val.json", "no_overlap_syn_test.json"),
)

In [39]:
# Preview the first rows of the synthetic training split to inspect its columns.
df_train_final.head()

Unnamed: 0,sequence,query,sequence_len,first_rep_pos,label,token_repeated,clean_seq
0,[6],6,1,0,1,6,6
1,[96],52,1,-1,0,-1,96
2,[39],39,1,0,1,39,39
3,[12],18,1,-1,0,-1,12
4,[22],22,1,0,1,22,22


In [46]:
# Convert every query value to its string form.
df_train_final['query'] = df_train_final['query'].apply(str)

In [47]:
# Same string conversion of the query column for the validation and test splits.
df_val_final['query'] = df_val_final['query'].apply(str)
df_test_final['query'] = df_test_final['query'].apply(str)

In [48]:
# create a dataset with the dataframe

class CustomDataset(Dataset):
    """Map-style dataset that serves (context, query, label) triples from a DataFrame.

    Columns are picked by position: 6 for the context, 1 for the query and 4 for
    the label. (Going by the DataFrame preview these look like `clean_seq`,
    `query` and `label` -- TODO confirm against the JSON schema.)
    """

    def __init__(self, dataframe):
        context_col, query_col, label_col = 6, 1, 4
        self.context = dataframe.iloc[:, context_col]
        self.query = dataframe.iloc[:, query_col]
        self.labels = dataframe.iloc[:, label_col]

    def __len__(self):
        """Number of rows, taken from the query column."""
        return self.query.size

    def __getitem__(self, idx):
        """Return the (context, query, label) tuple stored at positional index ``idx``."""
        row = [series.iloc[idx] for series in (self.context, self.query, self.labels)]
        return tuple(row)

In [49]:
data_set_train = CustomDataset(df_train_final)
data_set_val = CustomDataset(df_val_final)
data_set_test = CustomDataset(df_test_final)

# shuffle must stay False: the extracted representations are stored in loader
# iteration order, whereas the labels are saved directly from the DataFrame's
# 'label' column. Shuffling the loaders would break the row-for-row correspondence
# between the saved representations and the saved labels.
train_loader = DataLoader(data_set_train, batch_size=128, shuffle=False)
val_loader = DataLoader(data_set_val, batch_size=128, shuffle=False)
test_loader = DataLoader(data_set_test, batch_size=128, shuffle=False)

In [23]:
# bert-base-cased tokenizer and encoder; the encoder is moved to `device` and
# gradient tracking is switched off for all of its parameters.
tokenizer_bert = BertTokenizer.from_pretrained("bert-base-cased")
model_bert = (
    BertModel.from_pretrained("bert-base-cased")
    .to(device)
    .requires_grad_(False)
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [50]:
def get_rep(r, is_context):
    """Encode a batch of text with BERT and return one hidden vector per input.

    Inputs are truncated to at most 99 tokens and padded to the longest item
    in the batch.

    Args:
        r: Text input accepted by ``tokenizer_bert`` (callers pass the batch of
            strings yielded by the DataLoader).
        is_context: If True, use the hidden state at position 0 (the [CLS] token);
            otherwise use position 1, i.e. the first token after [CLS].

    Returns:
        A 2-D tensor (batch, hidden) on ``device``, sliced from ``last_hidden_state``.
    """
    r_inputs = tokenizer_bert(
        r, return_tensors="pt", truncation=True, padding=True, max_length=99
    ).to(device)

    # Pure feature extraction: run the forward pass without autograd so that no
    # graph is retained, independent of whether the model parameters were frozen.
    with torch.no_grad():
        r_op = model_bert(**r_inputs)

    token_id = 0 if is_context else 1
    return r_op.last_hidden_state[:, token_id, :]

In [51]:
def get_all_reps(data_loader):
    """Run every batch of ``data_loader`` through ``get_rep`` and stack the results.

    Each batch must unpack into (context, query, labels); the labels are ignored,
    so the returned rows follow the loader's iteration order.

    Returns:
        (context_reps, query_reps): CPU tensors built by vertically stacking the
        per-batch representations.
    """
    stacked = {"context": [], "query": []}

    for context_batch, query_batch, _ in tqdm(data_loader):
        # Move each batch's result to the CPU before storing it.
        stacked["context"].append(get_rep(context_batch, is_context=True).cpu())
        stacked["query"].append(get_rep(query_batch, is_context=False).cpu())

    return torch.vstack(stacked["context"]), torch.vstack(stacked["query"])

In [52]:
# Context and query representations for the synthetic training split.
train_context_reps, train_query_reps = get_all_reps(data_loader=train_loader)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1086/1086 [02:35<00:00,  6.98it/s]


In [53]:
# Context and query representations for the synthetic validation split.
val_context_reps, val_query_reps = get_all_reps(data_loader=val_loader)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 123/123 [00:17<00:00,  6.96it/s]


In [54]:
# Context and query representations for the synthetic test split.
test_context_reps, test_query_reps = get_all_reps(data_loader=test_loader)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 305/305 [00:43<00:00,  6.99it/s]


In [55]:
# Persist the synthetic-data context representations, one file per split.
for _reps, _path in (
    (train_context_reps, 'syn_no_olp_train_context_bert_reps.pt'),
    (val_context_reps, 'syn_no_olp_val_context_bert_reps.pt'),
    (test_context_reps, 'syn_no_olp_test_context_bert_reps.pt'),
):
    torch.save(_reps, _path)

In [56]:
# Persist the synthetic-data query representations, one file per split.
for _reps, _path in (
    (train_query_reps, 'syn_no_olp_train_query_bert_reps.pt'),
    (val_query_reps, 'syn_no_olp_val_query_bert_reps.pt'),
    (test_query_reps, 'syn_no_olp_test_query_bert_reps.pt'),
):
    torch.save(_reps, _path)

In [57]:
# Persist the synthetic-data labels as tensors, one file per split.
# NOTE: labels are written in DataFrame row order; they line up with the saved
# representations only when the loaders iterate without shuffling.
for _frame, _path in (
    (df_train_final, 'syn_no_olp_train_label.pt'),
    (df_val_final, 'syn_no_olp_val_label.pt'),
    (df_test_final, 'syn_no_olp_test_label.pt'),
):
    torch.save(torch.tensor(_frame['label'].values), _path)