<a href="https://colab.research.google.com/github/NirantK/Hinglish/blob/qrnn/HinglishULMFiT_AWD_QRNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# !pip install gdown
# !pip install ninja
# !pip install sentencepiece
# !pip install tqdm --upgrade --force

In [0]:
from fastai.text import *
from datetime import datetime
import sentencepiece as spm
from pathlib import Path
from tqdm import tqdm
import pandas as pd
import gdown

# Register tqdm's pandas integration (progress-bar variants of apply & co.).
tqdm.pandas()
# Folder that holds the SentencePiece model and the pickled data bunches used
# by the cells below. The path is relative, so it is resolved against the
# current working directory.
data_folder = Path("drive/My Drive/Hinglish/big")

In [0]:
# Load the SentencePiece model into a notebook-level processor.
# NOTE(review): SpTokenizer creates and loads its own processor, and no other
# visible cell references this `sp` — confirm whether it is still needed.
sp = spm.SentencePieceProcessor()
sp.Load(str(data_folder / "hinglish_sp.model"))


class SpTokenizer(BaseTokenizer):
    """Tokenizer that delegates to the Hinglish SentencePiece model.

    Args:
        lang: Language code, stored on the instance as ``self.lang``.
        vocab_size: Number of piece ids (``0 .. vocab_size - 1``) looked up in
            the model to build ``self.vocab``.
    """

    def __init__(self, lang: str, vocab_size: int = 8000):
        self.lang = lang
        self.vocab_size = vocab_size

        # Every tokenizer instance loads its own copy of the model from disk.
        processor = spm.SentencePieceProcessor()
        processor.Load(str(data_folder / "hinglish_sp.model"))
        self.sp = processor

        # Vocabulary entries are the model's pieces, in id order.
        pieces = list(map(self.sp.IdToPiece, range(self.vocab_size)))
        self.vocab = Vocab(pieces)

    def tokenizer(self, t: str) -> List[str]:
        """Split the text ``t`` into SentencePiece pieces."""
        pieces = self.sp.EncodeAsPieces(t)
        return pieces

    def detokenizer(self, t: List[str]) -> str:
        """Merge the list of pieces ``t`` back into a single string."""
        text = self.sp.DecodePieces(t)
        return text

In [0]:
def train(
    learn,
    model_name: str,
    lr=2e-03,
    loops=5,
    num_epocs_per_loop=2,
    encoder=False,
    test=False,
):
    """Warm up, unfreeze and train ``learn``, saving a checkpoint per loop.

    Args:
        learn: Learner-like object providing ``fit_one_cycle``, ``unfreeze``,
            ``save``, ``save_encoder`` and a ``recorder``.
        model_name: Prefix for every checkpoint name written by this function.
        lr: Upper learning rate; passed to ``fit_one_cycle`` as ``slice(lr)``.
        loops: Number of train/save rounds run after unfreezing.
        num_epocs_per_loop: Epochs per round.
        encoder: When true, also save the encoder after every round as
            ``{model_name}_enc_{i}``.
        test: When true, stop right after the warm-up cycle and save a single
            ``{model_name}_test`` checkpoint (smoke-test mode).

    Returns:
        None. Results are the checkpoints written through ``learn`` and the
        plots drawn through ``learn.recorder``.
    """
    # One warm-up cycle is run before `unfreeze()` is called.
    learn.fit_one_cycle(1, slice(lr))
    learn.unfreeze()
    if test:
        learn.save(f"{model_name}_test")
        return
    for i in range(loops):
        learn.fit_one_cycle(num_epocs_per_loop, slice(lr))
        learn.save(f"{model_name}_{i}")
        if encoder:
            learn.save_encoder(f"{model_name}_enc_{i}")
        print(f"saved {model_name}_{i}")
        learn.recorder.plot_losses()
        # fastai's Recorder.plot_metrics asserts that at least one metric was
        # recorded; a learner created without `metrics=` would abort the
        # remaining loops here, so only plot when there is something to plot.
        if getattr(learn.recorder, "metrics", None):
            learn.recorder.plot_metrics()

In [0]:
# Restore the three pickled data bunches from `data_folder`:
#   data_lm     -> first language-model learner
#   data_cls_lm -> second language-model learner
#   data_cls    -> classifier learner
data_lm = load_data(data_folder, "clean_lm.pkl")
data_cls_lm = load_data(data_folder, "clean_cls_lm.pkl")
data_cls = load_data(data_folder, "clean_cls.pkl")

In [0]:
# Language-model hyper-parameters; `qrnn=True` is set so the AWD_LSTM
# architecture is configured with QRNN layers.
awd_lstm_lm_config = {
    "emb_sz": 400,
    "n_hid": 1150,
    "n_layers": 3,
    "pad_token": 1,
    "qrnn": True,
    "output_p": 0.25,
    "hidden_p": 0.1,
    "input_p": 0.2,
    "embed_p": 0.02,
    "weight_p": 0.15,
    "tie_weights": True,
    "out_bias": True,
}
# No pretrained weights are requested (pretrained=False).
learn = language_model_learner(
    data_lm,
    AWD_LSTM,
    config=awd_lstm_lm_config,
    drop_mult=0.5,
    metrics=[accuracy, Perplexity()],
    pretrained=False,
)

In [0]:
# Run the learning-rate finder on the language-model learner and plot the
# recorded curve with a suggested learning rate marked.
learn.lr_find()
learn.recorder.plot(suggestion=True)

In [0]:
# Train the language model: 5 loops of 2 epochs each, checkpointed as
# "language_model_0" .. "language_model_4". No separate encoder files.
train(
    learn,
    "language_model",
    lr=2e-03,
    loops=5,
    num_epocs_per_loop=2,
    encoder=False,
    test=False,
)

In [0]:
# Same hyper-parameters as the config used for the learner built on `data_lm`;
# repeated here so this cell can be run on its own.
awd_lstm_lm_config = dict(
    emb_sz=400,
    n_hid=1150,
    n_layers=3,
    pad_token=1,
    qrnn=True,
    output_p=0.25,
    hidden_p=0.1,
    input_p=0.2,
    embed_p=0.02,
    weight_p=0.15,
    tie_weights=True,
    out_bias=True,
)
# Language-model learner over the classification corpus (`data_cls_lm`).
# The metrics match the learner built on `data_lm`: without any metrics only
# the loss is reported, and `train()`'s call to `recorder.plot_metrics()`
# fails fastai's "There are no metrics to plot." assertion.
learn = language_model_learner(
    data_cls_lm,
    arch=AWD_LSTM,
    config=awd_lstm_lm_config,
    metrics=[accuracy, Perplexity()],
    pretrained=False,
)

In [0]:
# Index of the "language_model_{i}" checkpoint to load. 4 is the last one
# written by the loops=5 `train(..., model_name="language_model")` run.
i = 4
learn.load(f"language_model_{i}")

In [0]:
# Re-run the learning-rate finder after loading the checkpoint, then plot the
# curve with a suggested learning rate marked.
learn.lr_find()
learn.recorder.plot(suggestion=True)

In [0]:
# Train on the classification corpus: 10 loops of 2 epochs each. With
# encoder=True every loop also writes "language_model_class_enc_{i}", which
# the classifier loads later.
train(
    learn,
    "language_model_class",
    lr=2e-03,
    loops=10,
    num_epocs_per_loop=2,
    encoder=True,
    test=False,
)

In [0]:
# Classifier hyper-parameters: same layer sizes as the language-model config,
# with larger dropout values.
awd_lstm_clas_config = {
    "emb_sz": 400,
    "n_hid": 1150,
    "n_layers": 3,
    "pad_token": 1,
    "qrnn": True,
    "output_p": 0.4,
    "hidden_p": 0.2,
    "input_p": 0.6,
    "embed_p": 0.1,
    "weight_p": 0.5,
}
# Build the classifier without pretrained weights, then switch it to fp16.
learn = text_classifier_learner(
    data_cls,
    AWD_LSTM,
    config=awd_lstm_clas_config,
    drop_mult=0.5,
    metrics=[accuracy],
    pretrained=False,
)
learn = learn.to_fp16()

In [0]:
# Index of the encoder checkpoint to load. 9 is the last one written by the
# loops=10, encoder=True `train(..., model_name="language_model_class")` run.
i = 9
learn.load_encoder(f"language_model_class_enc_{i}")
# Run the learning-rate finder on the classifier and plot the curve with a
# suggested learning rate marked.
learn.lr_find()
learn.recorder.plot(suggestion=True)

In [0]:
# Train the classifier: 10 loops of 10 epochs each (100 epochs after the
# warm-up cycle), checkpointed as "class_model_0" .. "class_model_9".
train(
    learn,
    "class_model",
    lr=2e-03,
    loops=10,
    num_epocs_per_loop=10,
    encoder=False,
    test=False,
)