In [1]:
import pandas as pd
# Never truncate cell contents when displaying DataFrames (None = unlimited width).
# The earlier `max_colwidth = 100` assignment was dropped: this call overrode it immediately.
pd.set_option('display.max_colwidth', None)

In [40]:
# `torch` is imported in this cell because it sits above the notebook's main
# `import torch` cell; without it a fresh "Restart & Run All" raises NameError here.
import torch

# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

Using cuda device


In [None]:
#!pip install -Uq fastai

In [2]:
from tqdm import tqdm

In [3]:
import fastai
# Show the installed fastai version so the run can be reproduced (recorded output: 2.6.3).
fastai.__version__

'2.6.3'

In [4]:
from fastai.basics import *
from fastai.callback.all import *
from fastai.text.all import *

In [5]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [6]:
# Fetch the WikiText dataset via fastai's `untar_data`; `path` is used below as the
# folder containing train.csv and test.csv.
path = untar_data(URLs.WIKITEXT)

In [7]:
# Both CSVs are read with header=None, so their columns are integer-labelled (0, 1, ...).
df_train = pd.read_csv(path/'train.csv', header=None)
# Note: the "valid" frame is loaded from test.csv.
df_valid = pd.read_csv(path/'test.csv', header=None)
# Train rows first, then valid rows; the positional splits built in the next cell rely on this order.
df_all = pd.concat([df_train, df_valid])

In [8]:
# Positional splits over df_all: one index list sized by df_train, then the remaining
# positions up to len(df_all) for validation.
splits = [list(range_of(df_train)), list(range(len(df_train), len(df_all)))]
# Transform pipeline: read the text attribute, tokenize (tokenizer built for column 0),
# then numericalize tokens into vocabulary ids.
tfms = [attrgetter("text"), Tokenizer.from_df(0), Numericalize()]
# `dsets` is reused later by tokenize_fastai for its tokenizer / numericalize steps.
dsets = Datasets(df_all, [tfms], splits=splits, dl_type=LMDataLoader)

In [9]:
# Batch size and sequence length used for the language-model dataloaders.
bs,sl = 5,512
dls = dsets.dataloaders(bs=bs, seq_len=sl)

In [10]:
# AWD_LSTM language-model learner with pretrained weights; perplexity is the reported metric.
# `lm` is the global model used by rep_fastai and surp_fastai further down.
lm = language_model_learner(dls, AWD_LSTM, drop_mult=0.5, metrics=Perplexity(), pretrained=True)

In [19]:
# Evaluate the pretrained model; the recorded output is a 2-item list
# (presumably validation loss followed by the Perplexity metric — confirm).
lm.validate()

(#2) [3.2393643856048584,25.517498016357422]

In [41]:
# Move the model to `device` (defined in the device-selection cell). The recorded output is
# the SequentialRNN repr, so this call presumably resolves to the underlying model's `.to`.
lm.to(device)

SequentialRNN(
  (0): AWD_LSTM(
    (encoder): Embedding(60008, 400, padding_idx=1)
    (encoder_dp): EmbeddingDropout(
      (emb): Embedding(60008, 400, padding_idx=1)
    )
    (rnns): ModuleList(
      (0): WeightDropout(
        (module): LSTM(400, 1152, batch_first=True)
      )
      (1): WeightDropout(
        (module): LSTM(1152, 1152, batch_first=True)
      )
      (2): WeightDropout(
        (module): LSTM(1152, 400, batch_first=True)
      )
    )
    (input_dp): RNNDropout()
    (hidden_dps): ModuleList(
      (0): RNNDropout()
      (1): RNNDropout()
      (2): RNNDropout()
    )
  )
  (1): LinearDecoder(
    (decoder): Linear(in_features=400, out_features=60008, bias=True)
    (output_dp): RNNDropout()
  )
)

In [2]:
# Load the three task splits from JSON files in the working directory.
# Later cells read their 'context', 'query' and 'label' columns.
df_train_final = pd.read_json("train_final.json")
df_val_final = pd.read_json("val_final.json")
df_test_final = pd.read_json("test_final.json")

In [3]:
# Report the number of rows in each split.
print("The size of the lm train dataset is {}".format(len(df_train_final)))
print("The size of the lm val dataset is {}".format(len(df_val_final)))
print("The size of the lm test dataset is {}".format(len(df_test_final)))

The size of the lm train dataset is 70749
The size of the lm val dataset is 18223
The size of the lm test dataset is 38513


In [4]:
# Stack train, val and test rows into one frame (original per-split indices are kept as-is).
df_all_final= pd.concat([df_train_final, df_val_final, df_test_final])

In [8]:
# Label balance over `df_all_final`, i.e. train + val + test combined. The message
# previously said "training examples", which mislabelled these all-split counts.
n_positive = (df_all_final['label'] == 1).sum()
n_negative = (df_all_final['label'] == 0).sum()
print("positive examples are {} and negative examples are {} (train + val + test)"
      .format(n_positive, n_negative))

positive training examples are 57348 and negative training examples are 70137


In [12]:
# Width of each tokenized row built by tokenize_fastai.
max_seq_len = 512
# Number of rows sent through the model per forward pass in rep_fastai / surp_fastai.
batch_size = 128

In [34]:
def tokenize_fastai(x, len_df):
    """Numericalize each text in `x` into a fixed-width, right-aligned id matrix.

    Uses the notebook-level `dsets` (tokenizer + numericalize) and `max_seq_len`.

    Parameters
    ----------
    x : iterable of str
        Texts to encode; row ``i`` of the result corresponds to the i-th item.
    len_df : int
        Number of rows to allocate; must be at least the number of items in `x`.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len_df, max_seq_len)`` with NumPy's default float dtype.
        Each row holds at most the first ``max_seq_len`` token ids of its text,
        right-aligned; unused leading positions keep the fill value 0.
    """
    # NOTE(review): rows are filled with 0, but the model summary in this notebook
    # shows the embedding with padding_idx=1 — confirm 0 is the intended filler id.
    tokenized = np.zeros((len_df, max_seq_len))
    # `total` lets tqdm draw a real progress bar instead of a bare counter.
    for idx, c in tqdm(enumerate(x), total=len_df):
        # Truncate with `max_seq_len` (previously a hard-coded 512) so the slice
        # assignment below stays valid if the constant is ever changed.
        # NOTE(review): this keeps the *first* tokens of long texts, while the
        # downstream functions read the last position — confirm that is intended.
        tokens = np.array(dsets.numericalize(dsets.tokenizer(c)).tolist())[:max_seq_len]
        # Right-align so the final kept token lands in the last column.
        tokenized[idx, max_seq_len - len(tokens):] = tokens

    return tokenized

In [42]:
def rep_fastai(tokenized, len_df):
    """Encode token-id rows with the LM encoder and keep the last-position output.

    Uses the notebook-level `lm`, `device` and `batch_size`.

    Parameters
    ----------
    tokenized : array-like
        Token ids, one row per example (as built by `tokenize_fastai`).
    len_df : int
        Number of leading rows of `tokenized` to encode.

    Returns
    -------
    torch.Tensor
        CPU tensor with one row per encoded example: the encoder output at the
        last sequence position.
    """
    encoder = lm.model[0]
    was_training = lm.model.training
    # Inference only: switch dropout off rather than relying on whatever mode a
    # previously executed cell happened to leave the model in.
    lm.model.eval()
    reps = []
    try:
        # no_grad avoids building an autograd graph for every batch.
        with torch.no_grad():
            for start in range(0, len_df, batch_size):
                # fastai's AWD_LSTM carries its hidden state from one forward call to
                # the next; batches here are unrelated texts, so start each from a
                # fresh state instead of the previous batch's final state.
                lm.model.reset()
                batch = torch.as_tensor(tokenized[start:start + batch_size]).long().to(device)
                # Slice the last position on the device so only (batch, dim) values
                # are copied back to the CPU.
                reps.append(encoder(batch)[:, -1, :].cpu())
    finally:
        # Restore the train/eval mode the model had on entry.
        lm.model.train(was_training)

    return torch.cat(reps)

In [38]:
# Tokenize the 'context' and 'query' columns of every split into fixed-width id arrays.
# The recorded progress output shows the context columns are roughly 10x slower than the queries.
tokenized_context_train = tokenize_fastai(df_train_final['context'], len(df_train_final))
tokenized_query_train= tokenize_fastai(df_train_final['query'], len(df_train_final))

tokenized_context_val = tokenize_fastai(df_val_final['context'], len(df_val_final))
tokenized_query_val= tokenize_fastai(df_val_final['query'], len(df_val_final))

tokenized_context_test = tokenize_fastai(df_test_final['context'], len(df_test_final))
tokenized_query_test = tokenize_fastai(df_test_final['query'], len(df_test_final))

70749it [04:18, 273.38it/s]
70749it [00:25, 2788.64it/s]
18223it [01:06, 273.24it/s]
18223it [00:06, 2921.08it/s]
38513it [02:20, 273.61it/s]
38513it [00:13, 2841.38it/s]


In [43]:
# Last-position encoder representations for every context/query array, one row per example.
X_context_train = rep_fastai(tokenized_context_train, len(df_train_final))
X_query_train = rep_fastai(tokenized_query_train, len(df_train_final))

X_context_val = rep_fastai(tokenized_context_val, len(df_val_final))
X_query_val = rep_fastai(tokenized_query_val, len(df_val_final))

X_context_test = rep_fastai(tokenized_context_test, len(df_test_final))
X_query_test = rep_fastai(tokenized_query_test, len(df_test_final))

In [51]:
# Sanity check: the recorded output is (38513, 400), i.e. one 400-dim vector per test row.
X_query_test.shape

torch.Size([38513, 400])

In [52]:
# Persist the context representations to the working directory, one file per split.
torch.save(X_context_train, 'train_context_lstm_reps.pt')
torch.save(X_context_val, 'val_context_lstm_reps.pt')
torch.save(X_context_test, 'test_context_lstm_reps.pt')

In [53]:
# Persist the query representations to the working directory, one file per split.
torch.save(X_query_train, 'train_query_lstm_reps.pt')
torch.save(X_query_val, 'val_query_lstm_reps.pt')
torch.save(X_query_test, 'test_query_lstm_reps.pt')

# Surprisal

In [31]:
import torch.nn.functional as F

In [71]:
from tqdm import tqdm

In [35]:
# Re-tokenizes the test split. The same two arrays are already produced by the earlier
# tokenization cell, so in a top-to-bottom run this cell repeats that work.
tokenized_context_test = tokenize_fastai(df_test_final['context'], len(df_test_final))
tokenized_query_test = tokenize_fastai(df_test_final['query'], len(df_test_final))

38513it [02:07, 301.40it/s]
38513it [00:12, 2983.38it/s]


In [72]:
def surp_fastai(tokenized, query_id=None):
    """Surprisal (negative log-probability) of one target token per context row.

    For each row of `tokenized`, the full language model is run and the logits at
    the last sequence position are log-softmaxed over the vocabulary; the result
    is minus the log-probability of that row's id in `query_id`.

    Uses the notebook-level `lm`, `device` and `batch_size`.

    Parameters
    ----------
    tokenized : array-like
        Context token ids, one row per example.
    query_id : array-like
        Target token id for each row of `tokenized`. Required despite the
        ``None`` default, which is kept only so the signature is unchanged.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``len(tokenized)`` holding the surprisal values.

    Raises
    ------
    ValueError
        If `query_id` is not provided.
    """
    if query_id is None:
        # Fail early with a clear message instead of a TypeError from slicing None.
        raise ValueError("query_id is required: one target token id per row of `tokenized`")

    n_rows = len(tokenized)
    surp = np.zeros(n_rows)
    was_training = lm.model.training
    # Inference only: switch dropout off regardless of the mode left by earlier cells.
    lm.model.eval()
    try:
        with torch.no_grad(), tqdm(total=n_rows) as pbar:
            for start in range(0, n_rows, batch_size):
                stop = min(start + batch_size, n_rows)
                # fastai's AWD_LSTM carries hidden state between forward calls; the
                # batches are unrelated texts, so reset before each one.
                lm.model.reset()
                batch = torch.as_tensor(tokenized[start:stop]).long().to(device)
                # First element of the model output is the decoded logits; keep only
                # the last position before copying to the CPU.
                last_logits = lm.model(batch)[0][:, -1].cpu()
                targets = torch.as_tensor(query_id[start:stop]).long().view(-1, 1)
                neg_log_prob = -F.log_softmax(last_logits, dim=1).gather(dim=1, index=targets)
                surp[start:stop] = neg_log_prob.view(-1).numpy()
                # Advance by the real batch size so the bar ends exactly at n_rows.
                pbar.update(stop - start)
    finally:
        # Restore the train/eval mode the model had on entry.
        lm.model.train(was_training)

    return surp

In [73]:
# Target id per row is the last column of the tokenized query, i.e. the final token of each query.
surp_lstm_test = surp_fastai(tokenized_context_test, tokenized_query_test[:,-1])

38528it [00:48, 797.65it/s]                                                                                                                                                                               


In [74]:
# NOTE(review): surp_fastai returns a NumPy array (it is built with np.zeros), so this
# stores an ndarray rather than a tensor, unlike the *_lstm_reps.pt files — confirm that
# whatever loads this file expects an ndarray.
torch.save(surp_lstm_test, 'surp_lstm_test.pt')