In [71]:
# One-time environment setup, left commented out so a full re-run does not reinstall.
# If it is needed, prefer the %pip magic so the install targets this kernel, e.g.
#   %pip install -q fastai==2.6.3   (the version reported by fastai.__version__ below)
#!pip install -Uq fastai



In [18]:
from tqdm import tqdm
import fastai
fastai.__version__

'2.6.3'

In [19]:
from fastai.basics import *
from fastai.callback.all import *
from fastai.text.all import *

In [42]:
import pandas as pd
import torch
# Never truncate cell contents when displaying DataFrames (None = no width limit).
# The earlier `max_colwidth = 100` assignment was removed: it was overridden
# immediately by this call and therefore had no effect.
pd.set_option('display.max_colwidth', None)

In [3]:
# Load the train / validation / test splits from their JSON files.
df_train, df_val, df_test = (
    pd.read_json(json_path)
    for json_path in (
        "18k_query_lm_recall_train.json",
        "18k_query_lm_recall_val.json",
        "18k_query_lm_recall_test.json",
    )
)

In [4]:
# Number of rows in the training split as loaded, before any length filtering.
len(df_train)

70928

In [7]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

Using cuda device


In [8]:
from transformers import BertTokenizer, BertModel, BertTokenizerFast
import torch
# Cased BERT-base tokenizer; used in later cells to measure context lengths.
tokenizer_bert = BertTokenizer.from_pretrained("bert-base-cased")
# Matching encoder: every parameter is frozen (requires_grad=False) and the
# model is placed on `device`.
model_bert = BertModel.from_pretrained("bert-base-cased").requires_grad_(False).to(device)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
# Count BERT tokens for every training context so that over-length rows can be
# filtered out in the following cell. Only the length is used here, so the
# tokenizer's "longer than the specified maximum sequence length" warning is
# expected and harmless.
# Series.map works on the single column directly instead of building a row
# object per record with DataFrame.apply(axis=1).
df_train["bert_token_length"] = df_train["context"].map(
    lambda text: len(tokenizer_bert(text).input_ids)
)

Token indices sequence length is longer than the specified maximum sequence length for this model (515 > 512). Running this sequence through the model will result in indexing errors


In [10]:
# Keep only the training rows whose BERT token count is at most 512.
df_train_dropped = df_train.loc[lambda frame: frame["bert_token_length"].le(512)]

In [11]:
# Count BERT tokens for every validation context so that over-length rows can
# be filtered out in the following cell.
# Series.map works on the single column directly instead of building a row
# object per record with DataFrame.apply(axis=1).
df_val["bert_token_length"] = df_val["context"].map(
    lambda text: len(tokenizer_bert(text).input_ids)
)

In [12]:
# Keep only the validation rows whose BERT token count is at most 512.
df_val_dropped = df_val.loc[lambda frame: frame["bert_token_length"].le(512)]

In [13]:
# Count BERT tokens for every test context so that over-length rows can be
# filtered out in the following cell.
# Series.map works on the single column directly instead of building a row
# object per record with DataFrame.apply(axis=1).
df_test["bert_token_length"] = df_test["context"].map(
    lambda text: len(tokenizer_bert(text).input_ids)
)

In [14]:
# Keep only the test rows whose BERT token count is at most 512.
df_test_dropped = df_test.loc[lambda frame: frame["bert_token_length"].le(512)]

In [15]:
# Persist the BERT-length-filtered splits. The file names are reproduced exactly
# as before (including the "tokeized" spelling) in case other code reads them.
for split_frame, json_path in (
    (df_train_dropped, "train_after_bert_tokeized.json"),
    (df_val_dropped, "val_after_bert_tokeized.json"),
    (df_test_dropped, "test_after_bert_tokeized.json"),
):
    split_frame.to_json(json_path)

In [17]:
# For each BERT-filtered split, print the row count and the number of distinct queries.
for split_frame in (df_train_dropped, df_val_dropped, df_test_dropped):
    print(len(split_frame), split_frame["query"].nunique(dropna=False))

70890 10557
18254 2640
38594 5657


# Filtering by fastai AWD-LSTM token length

In [20]:
# Source tutorial for this setup:
#https://github.com/fastai/fastai2/blob/master/nbs/39_tutorial.transformers.ipynb
# Fetch the WikiText data through fastai; the returned path is the folder the
# CSV files are read from in the next cell.
path = untar_data(URLs.WIKITEXT)

In [21]:
# Read the WikiText CSV files (no header row). Note that the "valid" frame is
# loaded from test.csv. The two frames are then stacked with the train rows first.
fastai_train, fastai_valid = (
    pd.read_csv(path/csv_name, header=None) for csv_name in ('train.csv', 'test.csv')
)
df_all = pd.concat([fastai_train, fastai_valid])

In [22]:
# Positional split over df_all: the first len(fastai_train) positions form the
# training part, the remaining positions the validation part.
splits = [
    list(range(len(fastai_train))),
    list(range(len(fastai_train), len(df_all))),
]
# Per-item pipeline: read the `text` attribute, tokenize, then numericalize.
tfms = [attrgetter("text"), Tokenizer.from_df(0), Numericalize()]
dsets = Datasets(df_all, tfms=[tfms], splits=splits, dl_type=LMDataLoader)

In [23]:
# Batch size and sequence length for the language-model data loaders.
bs = 5
sl = 512
dls = dsets.dataloaders(seq_len=sl, bs=bs)

In [24]:
# AWD-LSTM language-model learner with pretrained weights; perplexity is the
# metric it reports.
lm = language_model_learner(
    dls,
    AWD_LSTM,
    pretrained=True,
    drop_mult=0.5,
    metrics=Perplexity(),
)

In [25]:
# Evaluate the pretrained learner on the validation split. With
# metrics=Perplexity() the two returned values are presumably
# [validation loss, perplexity] — confirm against the fastai docs.
lm.validate()

(#2) [3.2393643856048584,25.517498016357422]

In [52]:
def get_lstm_context_len(df):
    """Return the fastai token count for every entry of ``df["context"]``.

    Each context is tokenized and then numericalized with the notebook-level
    ``dsets`` pipeline, and the length of the numericalized result is recorded.
    A tqdm progress bar tracks the pass over the contexts.

    Args:
        df: Object whose ``"context"`` item is an iterable of texts
            (a DataFrame in this notebook).

    Returns:
        list: One length per context, in iteration order.
    """
    return [
        len(dsets.numericalize(dsets.tokenizer(text)))
        for text in tqdm(df["context"])
    ]


In [53]:
# fastai token counts for the BERT-filtered training contexts
# (the recorded run took roughly four and a half minutes).
train_lstm_context_len = get_lstm_context_len(df_train_dropped)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 70890/70890 [04:28<00:00, 263.54it/s]


In [54]:
# fastai token counts for the BERT-filtered validation contexts
# (the recorded run took about one minute).
val_lstm_context_len = get_lstm_context_len(df_val_dropped)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18254/18254 [01:07<00:00, 268.99it/s]


In [55]:
# fastai token counts for the BERT-filtered test contexts
# (the recorded run took about two and a half minutes).
test_lstm_context_len = get_lstm_context_len(df_test_dropped)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 38594/38594 [02:23<00:00, 268.66it/s]


In [58]:
# Sanity check: there should be one token count per row of df_train_dropped.
len(train_lstm_context_len)

70890

In [85]:
# Attach the fastai token counts as a new `lstm_token_length` column. The name
# df_train_dropped is rebound to the frame returned by assign().
df_train_dropped = df_train_dropped.assign(lstm_token_length=train_lstm_context_len)


In [87]:
# Attach the fastai token counts to the validation and test frames as a new
# `lstm_token_length` column; each name is rebound to the frame returned by assign().
df_val_dropped = df_val_dropped.assign(lstm_token_length=val_lstm_context_len)
df_test_dropped = df_test_dropped.assign(lstm_token_length=test_lstm_context_len)

In [88]:
# Keep only the training rows whose fastai token count is at most 512.
df_train_final = df_train_dropped.loc[lambda frame: frame["lstm_token_length"].le(512)]

In [89]:
# Keep only the validation / test rows whose fastai token count is at most 512.
df_val_final = df_val_dropped.loc[lambda frame: frame["lstm_token_length"].le(512)]
df_test_final = df_test_dropped.loc[lambda frame: frame["lstm_token_length"].le(512)]

In [90]:
# For each final split, print the row count and the number of distinct queries.
for split_frame in (df_train_final, df_val_final, df_test_final):
    print(len(split_frame), split_frame["query"].nunique(dropna=False))

70749 10542
18223 2636
38513 5653


In [91]:
# Write the fully filtered train / validation / test splits to JSON.
for split_frame, json_path in (
    (df_train_final, "train_final.json"),
    (df_val_final, "val_final.json"),
    (df_test_final, "test_final.json"),
):
    split_frame.to_json(json_path)