# [ReadNet is SOTA for WeeBit](https://paperswithcode.com/sota/text-classification-on-weebit-readability) (Readability Assessment dataset).
# This notebook is a ReadNet implementation for the CommonLit Readability competition.

In [None]:
import torch
import numpy as np
from torch import Tensor, nn, tensor
import math
import pandas as pd
import csv
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
from pathlib import Path
from fastai.vision.all import *
from fastai.text.all import *


# Directory where the GloVe embedding files are expected to be stored
root_dir = "."

In [None]:
## MODEL CODE ##


class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention (no output projection).

    The projected queries, keys and values are chunked into ``num_heads``
    equal slices along the feature axis; attention is computed per slice and
    the per-head results are concatenated back along the feature axis.

    Args:
        d_model: Size of the feature axis; must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
        masked: If true, a position may only attend to itself and to earlier
            positions (causal attention).
    """

    def __init__(self, d_model, num_heads, masked):
        super().__init__()
        assert d_model % num_heads == 0, "num_heads must evenly chunk d_model"
        self.num_heads = num_heads
        self.wq = nn.Linear(d_model, d_model, bias=False)  # QQ what if bias=True?
        self.wk = nn.Linear(d_model, d_model, bias=False)
        self.wv = nn.Linear(d_model, d_model, bias=False)
        self.masked = masked
        self.softmax = nn.Softmax(dim=2)

    def forward(self, q, k, v):
        """Attend from ``q`` over ``k``/``v``; inputs are indexed (batch, seq, d_model).

        Returns a tensor with the batch and sequence sizes of ``q`` and
        ``d_model`` features.
        """
        qs = self.wq(q).chunk(self.num_heads, dim=2)
        ks = self.wk(k).chunk(self.num_heads, dim=2)
        vs = self.wv(v).chunk(self.num_heads, dim=2)

        future = None
        if self.masked:
            # True strictly above the diagonal, i.e. where the key position is
            # later than the query position. Built once and shared by all heads.
            future = torch.ones(q.shape[1], k.shape[1], device=q.device).triu(diagonal=1).bool()

        outs = []
        # TODO Use einsum instead of for loop
        for qi, ki, vi in zip(qs, ks, vs):
            attns = qi.bmm(ki.transpose(1, 2)) / (ki.shape[2] ** 0.5)
            if future is not None:
                # The logits must be -inf (not 0) so that softmax assigns the
                # future positions exactly zero weight.
                attns = attns.masked_fill(future, float("-inf"))
            attns = self.softmax(attns)
            outs.append(attns.bmm(vi))
        return torch.cat(outs, dim=2)

In [None]:
class AddNorm(nn.Module):
    """Sum two tensors and apply ``nn.LayerNorm(d_model)`` to the result."""

    def __init__(self, d_model):
        super().__init__()
        self.ln = nn.LayerNorm(d_model)

    def forward(self, x1, x2):
        """Return the layer-normalised sum of ``x1`` and ``x2``."""
        summed = x1 + x2
        return self.ln(summed)


class FeedForward(nn.Module):
    """Two ``d_model -> d_model`` linear layers with a ReLU in between."""

    def __init__(self, d_model):
        super().__init__()
        self.l1 = nn.Linear(d_model, d_model)
        self.relu = nn.ReLU()
        self.l2 = nn.Linear(d_model, d_model)

    def forward(self, x):
        """Apply linear -> ReLU -> linear to ``x``."""
        hidden = self.l1(x)
        activated = self.relu(hidden)
        return self.l2(activated)


def pos_encode(x):
    """Add sinusoidal positional encodings to ``x``.

    For position ``p`` and feature index ``d`` the added value is
    ``sin(p / 10000 ** (2 * (d // 2) / d_model))`` when ``d`` is even and the
    cosine of the same angle when ``d`` is odd.

    Args:
        x: Tensor indexed as (batch, seq, d_model).

    Returns:
        ``x`` plus the encoding, broadcast over the batch axis. The encoding is
        built directly on ``x``'s device and cast to ``x``'s dtype.
    """
    seq_len, d_model = x.shape[1], x.shape[2]
    pos = torch.arange(seq_len, device=x.device).unsqueeze(1)  # (seq, 1)
    dim = torch.arange(d_model, device=x.device).unsqueeze(0)  # (1, d_model)
    is_odd = dim % 2
    # Each even/odd feature pair shares one frequency: 2 * (d // 2) == d - d % 2.
    paired_dim = dim - is_odd
    angles = pos / (10_000 ** (paired_dim / d_model))  # (seq, d_model)
    addition = torch.where(is_odd == 0, torch.sin(angles), torch.cos(angles))
    return x + addition.to(x.dtype)

In [None]:
class EncoderBlock(nn.Module):
    """Unmasked self-attention and a feed-forward layer, each wrapped in add-and-norm."""

    def __init__(self, d_model, num_heads):
        super().__init__()
        self.mha = MultiHeadAttention(d_model=d_model, num_heads=num_heads, masked=False)
        self.an1 = AddNorm(d_model)
        self.ff = FeedForward(d_model)
        self.an2 = AddNorm(d_model)

    def forward(self, x):
        """Run self-attention then the feed-forward layer, with residual add-and-norm after each."""
        attended = self.mha(q=x, k=x, v=x)
        x = self.an1(x, attended)
        transformed = self.ff(x)
        return self.an2(x, transformed)


class AttentionAggregation(nn.Module):
    """Pool a sequence into one vector using learned softmax weights over positions."""

    def __init__(self, d_model):
        super().__init__()
        self.query = nn.Linear(d_model, 1, bias=False)

    def forward(self, x):  # (b, s, m)
        """Return the attention-weighted sum of ``x`` over the sequence axis, shape (b, m)."""
        scores = self.query(x)  # (b, s, 1)
        weights = torch.softmax(scores, dim=1)  # normalised over the sequence axis
        pooled = torch.bmm(weights.transpose(1, 2), x)  # (b, 1, m)
        return pooled.squeeze(1)


class LinTanh(nn.Module):
    """A ``d_model -> d_model`` linear layer followed by tanh."""

    def __init__(self, d_model):
        super().__init__()
        self.lin = nn.Linear(d_model, d_model)
        self.tanh = nn.Tanh()

    def forward(self, x):
        """Return ``tanh(lin(x))``."""
        projected = self.lin(x)
        return self.tanh(projected)

In [None]:
class LinFeatConcat(nn.Module):
    """Concatenate extra features onto an encoding and project it with a bias-free linear layer."""

    def __init__(self, d_model, n_feats, n_out):
        super().__init__()
        in_features = d_model + n_feats
        self.lin = nn.Linear(in_features, n_out, bias=False)  # TODO what if True?

    def forward(self, x, feats):
        """Join ``x`` and ``feats`` along dim 1 and apply the linear projection."""
        joined = torch.cat((x, feats), dim=1)
        return self.lin(joined)


class ReadNetBlock(nn.Module):
    """Encoder stack that pools a sequence into a single vector and mixes in extra features.

    Pipeline: positional encoding -> ``n_blocks`` encoder blocks -> linear+tanh
    -> attention aggregation -> concatenation with ``feats`` and a linear
    projection to ``n_out``.
    """

    def __init__(self, d_model, n_heads, n_blocks, n_feats, n_out):
        super().__init__()
        encoders = [EncoderBlock(d_model=d_model, num_heads=n_heads) for _ in range(n_blocks)]
        self.blocks = nn.Sequential(*encoders)
        self.lin_tanh = LinTanh(d_model=d_model)
        self.attn_agg = AttentionAggregation(d_model=d_model)
        self.lin_feat_concat = LinFeatConcat(d_model=d_model, n_feats=n_feats, n_out=n_out)

    def forward(self, x, feats):  # (b, s, m), (b, f)
        """Encode and pool ``x``, then project it together with ``feats``."""
        encoded = self.blocks(pos_encode(x))
        pooled = self.attn_agg(self.lin_tanh(encoded))
        return self.lin_feat_concat(pooled, feats)

In [None]:
class GloveEmbedding(nn.Module):
    """Embedding layer initialised from a GloVe text file.

    Row ``i`` of the table is the vector on line ``i`` of the file. One extra
    all-zero row is appended at the end; the tokenizer uses that index for
    unknown and padding tokens.

    Args:
        num: Dimensionality of the vectors stored in the file.
        path: Location of the GloVe file (space separated, word first).
            Defaults to the Kaggle dataset path the notebook was written for.

    Raises:
        ValueError: If the vectors in the file do not have ``num`` dimensions.
    """

    def __init__(self, num, path='../input/glove-embeddings/glove.6B.200d.txt'):
        super().__init__()
        # found GloveEmbedding on kaggle and set here.
        frame = pd.read_csv(path, header=None, sep=" ", quoting=csv.QUOTE_NONE)
        # Select the numeric columns before converting so the word column never
        # forces the whole table into a Python-object array.
        vectors = frame.iloc[:, 1:].to_numpy(dtype='float32')
        if vectors.shape[1] != num:
            raise ValueError(
                f"expected {num}-dimensional vectors but {path!r} contains {vectors.shape[1]}-dimensional ones"
            )
        emb_w = torch.from_numpy(vectors)
        emb_w = torch.cat([emb_w, torch.zeros(1, num)], dim=0)
        # Build the layer directly from the pretrained table instead of
        # allocating a randomly initialised table and then replacing it.
        # freeze=False keeps the weight trainable, as before; callers freeze it.
        self.embed = nn.Embedding.from_pretrained(emb_w, freeze=False)

    def forward(self, x):
        """Look up the vectors for token ids ``x`` (cast to ``long`` first)."""
        return self.embed(x.to(torch.long))

In [None]:
class ReadNet(nn.Module):
    """Hierarchical encoder: tokens -> sentence vectors -> document vector -> scalar.

    Args:
        embed: Module mapping token ids to ``d_model``-dimensional vectors.
        d_model: Feature size used throughout the network.
        n_heads: Attention heads per encoder block.
        n_blocks: Encoder blocks in each of the sentence and document stacks.
        n_feats_sent: Number of extra per-sentence features.
        n_feats_doc: Number of extra per-document features.
    """

    def __init__(self, embed, d_model, n_heads, n_blocks, n_feats_sent, n_feats_doc):
        super().__init__()
        self.embed = embed
        self.sent_block = ReadNetBlock(
            d_model=d_model, n_heads=n_heads, n_blocks=n_blocks, n_feats=n_feats_sent, n_out=d_model
        )
        self.doc_block = ReadNetBlock(
            d_model=d_model, n_heads=n_heads, n_blocks=n_blocks, n_feats=n_feats_doc, n_out=d_model + n_feats_doc
        )
        self.head = nn.Sequential(
            nn.Linear(d_model + n_feats_doc, 1),
        )

    def forward(self, x, feats_sent=None, feats_doc=None):  # (b, d, s) tokens, (b, d, n_f_s), (b, n_f_d)
        """Return one prediction per document, shape (b,).

        ``feats_sent`` / ``feats_doc`` may be omitted when the model was built
        with ``n_feats_sent=0`` / ``n_feats_doc=0``.
        """
        x = self.embed(x)
        b, d, s, m = x.shape
        # Missing features become explicit zero-width tensors so that the
        # concatenations downstream see well-formed 2-D inputs.
        if feats_sent is None:
            feats_sent = x.new_zeros((b, d, 0))
        if feats_doc is None:
            feats_doc = x.new_zeros((b, 0))
        # Follow the activations' device rather than assuming the default GPU.
        feats_sent = feats_sent.to(x.device)
        feats_doc = feats_doc.to(x.device)
        # Fold documents into the batch axis so each sentence is encoded on its own.
        x = x.reshape(b * d, s, m)
        sents_enc = self.sent_block(x, feats_sent.reshape(b * d, -1))  # (b*d, m)
        docs = sents_enc.reshape(b, d, m)
        docs_enc = self.doc_block(docs, feats_doc)
        out = self.head(docs_enc)
        return out.squeeze(1)

In [None]:
## DATA PREPARATION ##

class GloveTokenizer:
    """Map words to their row index in a GloVe text file.

    Args:
        num: Unused; kept so the constructor mirrors ``GloveEmbedding``.
        path: Location of the GloVe file (space separated, word first).
            Defaults to the Kaggle dataset path the notebook was written for.
    """

    def __init__(self, num, path='../input/glove-embeddings/glove.6B.200d.txt'):
        # found GloveEmbedding on kaggle and set here.
        # na_filter=False and dtype=str keep every token as the literal string
        # from the file. With pandas' defaults, vocabulary entries such as
        # "null", "nan" and "n/a" are parsed as missing values and can never be
        # looked up.
        words = pd.read_csv(
            path, header=None, sep=" ", quoting=csv.QUOTE_NONE, usecols=[0],
            na_filter=False, dtype=str,
        )[0]
        self.word2idx = {w: i for i, w in enumerate(words)}

    def __call__(self, sent):
        """Tokenize ``sent`` with ``word_tokenize`` and return a list of vocabulary ids.

        Tokens are lower-cased before lookup; unknown tokens map to ``unk_token``.
        """
        unk = self.unk_token
        return [self.word2idx.get(w.lower(), unk) for w in word_tokenize(sent)]

    @property
    def unk_token(self):
        """Index used for out-of-vocabulary words."""
        return 400_000  # We appended this to the end of the embedding to return all zeros

    @property
    def pad_token(self):
        """Index used for padding; identical to ``unk_token``."""
        return self.unk_token  # Seems that this is the best option for GLOVE


def prepare_txts(txts, tokenizer):
    """Tokenize texts into a rectangular tensor, padding to the longest doc and sentence.

    Input: (bs,) str, Output: (bs, max_doc_len, max_sent_len).
    Nothing is truncated: documents are padded with empty sentences and
    sentences are padded with ``tokenizer.pad_token``.

    TODO: Do this better later:
    (1) Truncate smartly (if there is one very long outlier sentence or doc)
    (2) Group together docs of similar lengths (in terms of num_sents)
    """
    docs = []
    for txt in txts:
        docs.append([tokenizer(sent) for sent in sent_tokenize(txt)])

    # Pad every document with empty sentences up to the longest document.
    max_doc_len = max(len(doc) for doc in docs)
    docs = [doc + [[]] * (max_doc_len - len(doc)) for doc in docs]

    # Pad every sentence with the pad token up to the longest sentence.
    max_sent_len = max(len(sent) for doc in docs for sent in doc)
    pad = tokenizer.pad_token
    padded_docs = []
    for doc in docs:
        padded_docs.append([sent + [pad] * (max_sent_len - len(sent)) for sent in doc])
    return Tensor(padded_docs)


def prepare_txts_cut(txts, tokenizer, max_doc_len=18, max_sent_len=49):
    """Tokenize texts into a fixed-size (bs, max_doc_len, max_sent_len) tensor.

    Documents are cut to ``max_doc_len`` sentences and sentences to
    ``max_sent_len`` tokens; shorter ones are padded with empty sentences and
    ``tokenizer.pad_token`` respectively.
    """
    pad = tokenizer.pad_token
    batch = []
    for txt in txts:
        sents = [tokenizer(sent)[:max_sent_len] for sent in sent_tokenize(txt)[:max_doc_len]]
        sents = sents + [[]] * (max_doc_len - len(sents))
        batch.append([sent + [pad] * (max_sent_len - len(sent)) for sent in sents])
    return Tensor(batch)

# Preprocess
Preprocessing improved the training results.
The code below is from [How To: Preprocessing for GloVe Part2: Usage](https://www.kaggle.com/christofhenkel/how-to-preprocessing-for-glove-part2-usage).

In [None]:
symbols_to_isolate = '.,?!-;*"…:—()%#$&_/@＼・ω+=”“[]^–>\\°<~•≠™ˈʊɒ∞§{}·τα❤☺ɡ|¢→̶`❥━┣┫┗Ｏ►★©―ɪ✔®\x96\x92●£♥➤´¹☕≈÷♡◐║▬′ɔː€۩۞†μ✒➥═☆ˌ◄½ʻπδηλσερνʃ✬ＳＵＰＥＲＩＴ☻±♍µº¾✓◾؟．⬅℅»Вав❣⋅¿¬♫ＣＭβ█▓▒░⇒⭐›¡₂₃❧▰▔◞▀▂▃▄▅▆▇↙γ̄″☹➡«φ⅓„✋：¥̲̅́∙‛◇✏▷❓❗¶˚˙）сиʿ✨。ɑ\x80◕！％¯−ﬂﬁ₁²ʌ¼⁴⁄₄⌠♭✘╪▶☭✭♪☔☠♂☃☎✈✌✰❆☙○‣⚓年∎ℒ▪▙☏⅛ｃａｓǀ℮¸ｗ‚∼‖ℳ❄←☼⋆ʒ⊂、⅔¨͡๏⚾⚽Φ×θ￦？（℃⏩☮⚠月✊❌⭕▸■⇌☐☑⚡☄ǫ╭∩╮，例＞ʕɐ̣Δ₀✞┈╱╲▏▕┃╰▊▋╯┳┊≥☒↑☝ɹ✅☛♩☞ＡＪＢ◔◡↓♀⬆̱ℏ\x91⠀ˤ╚↺⇤∏✾◦♬³の｜／∵∴√Ω¤☜▲↳▫‿⬇✧ｏｖｍ－２０８＇‰≤∕ˆ⚜☁'
symbols_to_delete = '\n🍕\r🐵😑\xa0\ue014\t\uf818\uf04a\xad😢🐶️\uf0e0😜😎👊\u200b\u200e😁عدويهصقأناخلىبمغر😍💖💵Е👎😀😂\u202a\u202c🔥😄🏻💥ᴍʏʀᴇɴᴅᴏᴀᴋʜᴜʟᴛᴄᴘʙғᴊᴡɢ😋👏שלוםבי😱‼\x81エンジ故障\u2009🚌ᴵ͞🌟😊😳😧🙀😐😕\u200f👍😮😃😘אעכח💩💯⛽🚄🏼ஜ😖ᴠ🚲‐😟😈💪🙏🎯🌹😇💔😡\x7f👌ἐὶήιὲκἀίῃἴξ🙄Ｈ😠\ufeff\u2028😉😤⛺🙂\u3000تحكسة👮💙فزط😏🍾🎉😞\u2008🏾😅😭👻😥😔😓🏽🎆🍻🍽🎶🌺🤔😪\x08‑🐰🐇🐱🙆😨🙃💕𝘊𝘦𝘳𝘢𝘵𝘰𝘤𝘺𝘴𝘪𝘧𝘮𝘣💗💚地獄谷улкнПоАН🐾🐕😆ה🔗🚽歌舞伎🙈😴🏿🤗🇺🇸мυтѕ⤵🏆🎃😩\u200a🌠🐟💫💰💎эпрд\x95🖐🙅⛲🍰🤐👆🙌\u2002💛🙁👀🙊🙉\u2004ˢᵒʳʸᴼᴷᴺʷᵗʰᵉᵘ\x13🚬🤓\ue602😵άοόςέὸתמדףנרךצט😒͝🆕👅👥👄🔄🔤👉👤👶👲🔛🎓\uf0b7\uf04c\x9f\x10成都😣⏺😌🤑🌏😯ех😲Ἰᾶὁ💞🚓🔔📚🏀👐\u202d💤🍇\ue613小土豆🏡❔⁉\u202f👠》कर्मा🇹🇼🌸蔡英文🌞🎲レクサス😛外国人关系Сб💋💀🎄💜🤢َِьыгя不是\x9c\x9d🗑\u2005💃📣👿༼つ༽😰ḷЗз▱ц￼🤣卖温哥华议会下降你失去所有的钱加拿大坏税骗子🐝ツ🎅\x85🍺آإشء🎵🌎͟ἔ油别克🤡🤥😬🤧й\u2003🚀🤴ʲшчИОРФДЯМюж😝🖑ὐύύ特殊作戦群щ💨圆明园קℐ🏈😺🌍⏏ệ🍔🐮🍁🍆🍑🌮🌯🤦\u200d𝓒𝓲𝓿𝓵안영하세요ЖљКћ🍀😫🤤ῦ我出生在了可以说普通话汉语好极🎼🕺🍸🥂🗽🎇🎊🆘🤠👩🖒🚪天一家⚲\u2006⚭⚆⬭⬯⏖新✀╌🇫🇷🇩🇪🇮🇬🇧😷🇨🇦ХШ🌐\x1f杀鸡给猴看ʁ𝗪𝗵𝗲𝗻𝘆𝗼𝘂𝗿𝗮𝗹𝗶𝘇𝗯𝘁𝗰𝘀𝘅𝗽𝘄𝗱📺ϖ\u2000үսᴦᎥһͺ\u2007հ\u2001ɩｙｅ൦ｌƽｈ𝐓𝐡𝐞𝐫𝐮𝐝𝐚𝐃𝐜𝐩𝐭𝐢𝐨𝐧Ƅᴨןᑯ໐ΤᏧ௦Іᴑ܁𝐬𝐰𝐲𝐛𝐦𝐯𝐑𝐙𝐣𝐇𝐂𝐘𝟎ԜТᗞ౦〔Ꭻ𝐳𝐔𝐱𝟔𝟓𝐅🐋ﬃ💘💓ё𝘥𝘯𝘶💐🌋🌄🌅𝙬𝙖𝙨𝙤𝙣𝙡𝙮𝙘𝙠𝙚𝙙𝙜𝙧𝙥𝙩𝙪𝙗𝙞𝙝𝙛👺🐷ℋ𝐀𝐥𝐪🚶𝙢Ἱ🤘ͦ💸ج패티Ｗ𝙇ᵻ👂👃ɜ🎫\uf0a7БУі🚢🚂ગુજરાતીῆ🏃𝓬𝓻𝓴𝓮𝓽𝓼☘﴾̯﴿₽\ue807𝑻𝒆𝒍𝒕𝒉𝒓𝒖𝒂𝒏𝒅𝒔𝒎𝒗𝒊👽😙\u200cЛ‒🎾👹⎌🏒⛸公寓养宠物吗🏄🐀🚑🤷操美𝒑𝒚𝒐𝑴🤙🐒欢迎来到阿拉斯ספ𝙫🐈𝒌𝙊𝙭𝙆𝙋𝙍𝘼𝙅ﷻ🦄巨收赢得白鬼愤怒要买额ẽ🚗🐳𝟏𝐟𝟖𝟑𝟕𝒄𝟗𝐠𝙄𝙃👇锟斤拷𝗢𝟳𝟱𝟬⦁マルハニチロ株式社⛷한국어ㄸㅓ니͜ʖ𝘿𝙔₵𝒩ℯ𝒾𝓁𝒶𝓉𝓇𝓊𝓃𝓈𝓅ℴ𝒻𝒽𝓀𝓌𝒸𝓎𝙏ζ𝙟𝘃𝗺𝟮𝟭𝟯𝟲👋🦊多伦🐽🎻🎹⛓🏹🍷🦆为和中友谊祝贺与其想象对法如直接问用自己猜本传教士没积唯认识基督徒曾经让相信耶稣复活死怪他但当们聊些政治题时候战胜因圣把全堂结婚孩恐惧且栗谓这样还♾🎸🤕🤒⛑🎁批判检讨🏝🦁🙋😶쥐스탱트뤼도석유가격인상이경제황을렵게만들지않록잘관리해야합다캐나에서대마초와화약금의품런성분갈때는반드시허된사용🔫👁凸ὰ💲🗯𝙈Ἄ𝒇𝒈𝒘𝒃𝑬𝑶𝕾𝖙𝖗𝖆𝖎𝖌𝖍𝖕𝖊𝖔𝖑𝖉𝖓𝖐𝖜𝖞𝖚𝖇𝕿𝖘𝖄𝖛𝖒𝖋𝖂𝕴𝖟𝖈𝕸👑🚿💡知彼百\uf005𝙀𝒛𝑲𝑳𝑾𝒋𝟒😦𝙒𝘾𝘽🏐𝘩𝘨ὼṑ𝑱𝑹𝑫𝑵𝑪🇰🇵👾ᓇᒧᔭᐃᐧᐦᑳᐨᓃᓂᑲᐸᑭᑎᓀᐣ🐄🎈🔨🐎🤞🐸💟🎰🌝🛳点击查版🍭𝑥𝑦𝑧ＮＧ👣\uf020っ🏉ф💭🎥Ξ🐴👨🤳🦍\x0b🍩𝑯𝒒😗𝟐🏂👳🍗🕉🐲چی𝑮𝗕𝗴🍒ꜥⲣⲏ🐑⏰鉄リ事件ї💊「」\uf203\uf09a\uf222\ue608\uf202\uf099\uf469\ue607\uf410\ue600燻製シ虚偽屁理屈Г𝑩𝑰𝒀𝑺🌤𝗳𝗜𝗙𝗦𝗧🍊ὺἈἡχῖΛ⤏🇳𝒙ψՁմեռայինրւդձ冬至ὀ𝒁🔹🤚🍎𝑷🐂💅𝘬𝘱𝘸𝘷𝘐𝘭𝘓𝘖𝘹𝘲𝘫کΒώ💢ΜΟΝΑΕ🇱♲𝝈↴💒⊘Ȼ🚴🖕🖤🥘📍👈➕🚫🎨🌑🐻𝐎𝐍𝐊𝑭🤖🎎😼🕷ｇｒｎｔｉｄｕｆｂｋ𝟰🇴🇭🇻🇲𝗞𝗭𝗘𝗤👼📉🍟🍦🌈🔭《🐊🐍\uf10aლڡ🐦\U0001f92f\U0001f92a🐡💳ἱ🙇𝗸𝗟𝗠𝗷🥜さようなら🔼'

In [None]:
from nltk.tokenize.treebank import TreebankWordTokenizer
tb_tokenizer = TreebankWordTokenizer()


# Translation tables for str.translate, keyed by code point:
# isolated symbols get a space on each side, deleted symbols map to ''.
isolate_dict = {ord(ch): ' ' + ch + ' ' for ch in symbols_to_isolate}
remove_dict = dict.fromkeys(map(ord, symbols_to_delete), '')


def handle_punctuation(x):
    """Delete the characters in ``remove_dict``, then space-pad those in ``isolate_dict``."""
    without_removed = x.translate(remove_dict)
    return without_removed.translate(isolate_dict)

def handle_contractions(x):
    """Split ``x`` into a list of tokens with the module-level Treebank tokenizer."""
    return tb_tokenizer.tokenize(x)

def fix_quote(x):
    """Drop one leading apostrophe from each token and join the tokens with spaces."""
    stripped = (tok[1:] if tok[:1] == "'" else tok for tok in x)
    return ' '.join(stripped)

def preprocess(x):
    """Clean one text: punctuation handling, Treebank tokenization, then quote fixing.

    Returns the cleaned text as a single space-joined string.
    """
    return fix_quote(handle_contractions(handle_punctuation(x)))

In [None]:
# Load the training set and store a preprocessed copy of every excerpt.
data = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
cleaned_text = [preprocess(text) for text in data['excerpt']]
data['cleaned_text'] = cleaned_text

In [None]:
## TRAIN ## (using fastai)

# Both are built for the 200-dimensional GloVe vectors.
tokenizer = GloveTokenizer(num=200)
embed = GloveEmbedding(num=200)

def get_splits(data):
    """Return (train_idx, valid_idx): a seeded 80/20 shuffle of ``range(len(data))``.

    Reseeds the global ``random`` module with 42 so the split is the same on
    every call.
    """
    n_rows = len(data)
    order = list(range(n_rows))
    random.seed(42)
    random.shuffle(order)
    # changed from 0.75
    cut = int(n_rows * 0.8)
    return order[:cut], order[cut:]


def get_dls(bs):
    """Build fastai dataloaders of (token tensor, target) pairs with batch size ``bs``.

    Uses the preprocessed ``cleaned_text`` column (changed from the original
    code because of preprocess) and the split from ``get_splits``.
    """
    texts = data.cleaned_text.tolist()
    targets = data.target.tolist()
    tokens = prepare_txts_cut(texts, tokenizer)

    samples = TfmdLists(zip(tokens, targets), tfms=[], splits=get_splits(data))
    return samples.dataloaders(batch_size=bs)


def get_model():
    """Create the ReadNet model with a frozen embedding layer.

    The model is moved to the GPU when one is available and stays on the CPU
    otherwise, so the function also works on CPU-only machines.
    """
    # d_model=200 was better than 100 or 300.
    readnet = ReadNet(
        embed=embed,
        d_model=200,
        n_heads=4,
        n_blocks=6,
        n_feats_sent=0,
        n_feats_doc=0,
    )
    if torch.cuda.is_available():
        readnet = readnet.cuda()

    # Automatically freeze the embedding. We should not be learning this
    for p in readnet.embed.parameters():
        p.requires_grad = False

    return readnet

# RMSE is tracked as a metric because it is the basic indicator for the Public Score.
metrics = [rmse]
learn = Learner(
    get_dls(32),
    get_model(),
    loss_func=MSELossFlat(),
    metrics=metrics,
)
learn.lr_find()

# Result MSE is about 0.40

A fixed momentum of 0.9 improved training; cyclical momentum such as (0.95, 0.85, 0.95) did not.<br>
Increasing the batch size and learning rate, and adding weight decay, did not work well either.

In [None]:
# Callbacks: checkpoint to 'model_0' while monitoring '_rmse' (lower is better),
# plus gradient accumulation configured with 32.
cbs = [
    SaveModelCallback(monitor='_rmse', fname='model_0', comp=np.less, reset_on_fit=False),
    GradientAccumulation(32),
]
learn.fit_one_cycle(n_epoch=50, lr_max=3e-5, moms=(0.9, 0.9, 0.9), cbs=cbs)

# Inference

In [None]:
# Load the test set and apply the same preprocessing as for training.
test_df = pd.read_csv('../input/commonlitreadabilityprize/test.csv')

cleaned_text = [preprocess(text) for text in test_df['excerpt']]
test_df['cleaned_text'] = cleaned_text

test_txts = test_df.cleaned_text.tolist()
test_cut_txts = prepare_txts_cut(test_txts, tokenizer)
# The dataloader expects (x, y) pairs, so pair every text with a dummy target of 0.
test_cut_txts_zip = zip(test_cut_txts, [0] * len(test_cut_txts))

# The batch size must be passed by keyword: the second positional parameter of
# test_dl is rm_type_tfms, so a bare 128 never set the batch size.
test_dl = learn.dls.test_dl(test_cut_txts_zip, bs=128)
preds, _ = learn.get_preds(dl=test_dl)

In [None]:
# Start from the sample submission so the output has the expected layout.
pred_df = pd.read_csv(filepath_or_buffer='../input/commonlitreadabilityprize/sample_submission.csv')

In [None]:
# Overwrite the placeholder targets with the model predictions.
pred_df['target'] = preds

In [None]:
# Write the submission file without the index column.
pred_df.to_csv(path_or_buf="submission.csv", index=False)

**Thank you for reading. I expected more from ReadNet. Maybe I made some big mistakes in the implementation, or the model itself isn't a great fit for this competition. Comments and upvotes would be very much appreciated.**