In [1]:
import re
from fastbook import *
from fastai.text.all import *

In [2]:
train = pd.read_csv('datasets/Train.csv')
test = pd.read_csv('datasets/Test.csv')
train.head(1)

Unnamed: 0,id,content,category
0,SW0,"SERIKALI imesema haitakuwa tayari kuona amani na utulivu wa nchi inachezewa huku ikisisitiza uwepo wa umoja kati ya wananchi bila kujali tofauti ya imani, kabila au itikadi yoyote.Hayo yalisemwa na Naibu Waziri wa Mambo ya Ndani ya Nchi, Hamad Yussuf Masauni wakati akifungua semina ya siku mbili iliyofanyika jijini Dar es Salaam ikiwahusisha viongozi wa taasisi za Kiislamu, lengo ikiwa ni kuwakumbusha kuhubiri amani katika sehemu zao.Naibu Waziri amesema mwelekeo na malengo ya Serikali ya Awamu ya Tano ni kukuza maendeleo katika sehemu mbalimbali nchini lengo ikiwa kuinua maisha ya wananc...",Kitaifa


In [3]:
train.columns,test.columns

(Index(['id', 'content', 'category'], dtype='object'),
 Index(['swahili_id', 'content'], dtype='object'))

### Read the entire texts

In [4]:
def clean_row(x):
    x = ''.join(x.split('\'')).strip('[]')
    return re.sub(r"[^\x00-\x7F]", ' ', x)
    
txts = L(
    list(train['content'].apply(clean_row).values) + 
#     list(train.loc[:,'content'].values) + 
#     list(test.loc[:,'content'].values) 
    list(test['content'].apply(clean_row).values)
)
len(txts)

6439

### Concatenate into one big stream

In [5]:
text = ' xbos '.join([l.strip() for l in txts[:200]])
text[:100]

'SERIKALI imesema haitakuwa tayari kuona amani na utulivu wa nchi inachezewa huku ikisisitiza uwepo w'

In [6]:
text2 = 'My name is Emma. I love coding! I have smth to say, I will not say. Visit my page @ www.emmasc.com. W.H.O.,'


def text_split(text2):
    toks = []
    l,r = 0,0

    text2 += ' '
    while r < len(text2):
        L,R = text2[l], text2[r]
        if not R.isalnum():
            L_R = text2[l:r]
            if L == R:
                if L.strip():
                    toks.append(R)
            elif L.strip() and R.strip():
                toks.extend([L_R,R])
            else:
                toks.append(L_R)
            l = r + 1
    #     print(toks)
        r += 1
    return toks
print(text_split(text2))

['My', 'name', 'is', 'Emma', '.', 'I', 'love', 'coding', '!', 'I', 'have', 'smth', 'to', 'say', ',', 'I', 'will', 'not', 'say', '.', 'Visit', 'my', 'page', '@', 'www', '.', 'emmasc', '.', 'com', '.', 'W', '.', 'H', '.', 'O', '.', ',']


In [7]:
## Tokenize
def get_tokens(text):
    toks = text_split(text)
    
    tokens = ['xbos']
    for tok in toks:
        if tok.istitle(): tokens.extend(['xtit',tok.lower()])
        elif tok.isupper(): tokens.extend(['xupp',tok.lower()])
        else: tokens.append(tok)
    return tokens

tokens = get_tokens(text)

print(len(tokens),len(set(tokens)))
print()
print(tokens[:40])

82737 10340

['xbos', 'xupp', 'serikali', 'imesema', 'haitakuwa', 'tayari', 'kuona', 'amani', 'na', 'utulivu', 'wa', 'nchi', 'inachezewa', 'huku', 'ikisisitiza', 'uwepo', 'wa', 'umoja', 'kati', 'ya', 'wananchi', 'bila', 'kujali', 'tofauti', 'ya', 'imani', ',', 'kabila', 'au', 'itikadi', 'yoyote', '.', 'xtit', 'hayo', 'yalisemwa', 'na', 'xtit', 'naibu', 'xtit', 'waziri']


In [8]:
## To Numericalize

# Generate the vocab with unique tokens
vocab = L(*tokens).unique()

word2idx = {w:i for i, w in enumerate(vocab)}
nums = L(word2idx[w] for w in tokens)
nums

(#82737) [0,1,2,3,4,5,6,7,8,9...]

## Build a language model

### Predict each word based on the previous 3

In [9]:
L((tokens[i:i+3],tokens[i+3]) for i in range(0,len(tokens)-4,3))

(#27578) [(['xbos', 'xupp', 'serikali'], 'imesema'),(['imesema', 'haitakuwa', 'tayari'], 'kuona'),(['kuona', 'amani', 'na'], 'utulivu'),(['utulivu', 'wa', 'nchi'], 'inachezewa'),(['inachezewa', 'huku', 'ikisisitiza'], 'uwepo'),(['uwepo', 'wa', 'umoja'], 'kati'),(['kati', 'ya', 'wananchi'], 'bila'),(['bila', 'kujali', 'tofauti'], 'ya'),(['ya', 'imani', ','], 'kabila'),(['kabila', 'au', 'itikadi'], 'yoyote')...]

The model can't use the above but the numericalized version of the above

In [10]:
seqs = L((tensor(nums[i:i+3]),nums[i+3]) for i in range(0,len(nums)-4,3))
seqs

(#27578) [(tensor([0, 1, 2]), 3),(tensor([3, 4, 5]), 6),(tensor([6, 7, 8]), 9),(tensor([ 9, 10, 11]), 12),(tensor([12, 13, 14]), 15),(tensor([15, 10, 16]), 17),(tensor([17, 18, 19]), 20),(tensor([20, 21, 22]), 18),(tensor([18, 23, 24]), 25),(tensor([25, 26, 27]), 28)...]

### Creating the DataLoader with a `batch_size` of 64

In [11]:
bs = 64
cut = int(len(seqs) * 0.8)
dls = DataLoaders.from_dsets(seqs[:cut],seqs[cut:],shuffle=False,bs=bs)

In [12]:
x,y = dls.one_batch()
x.shape,y.shape

(torch.Size([64, 3]), torch.Size([64]))

## Our Language Model in PyTorch

In [13]:
class MyLLM1(Module):
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz,n_hidden)
        self.h_h = nn.Linear(n_hidden,n_hidden)
        self.h_o = nn.Linear(n_hidden,vocab_sz)
    
    def forward(self,x):
        h = self.i_h(x[:,0]) # Access the Emb vector for the first column
        h = self.h_h(h) # Connect to the first set of Linear NN
        h = F.relu(h) # Our Non - liearization
        
        h = h + self.i_h(x[:,1]) # Add the Emb vector for the second column
        h = self.h_h(h) # Pass to the Linear Network
        h = F.relu(h) # Again the Non-linearization
        
        h = h + self.i_h(x[:,2]) # Add the Emb vector for the third column
        h = self.h_h(h) # Pass to the Linear Network
        h = F.relu(h) # Again the Non-linearization
        
        h = self.h_o(h) # Pass to the output layer
        return h

Rewriting it as

In [14]:
class MyLLM2(Module):
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz,n_hidden)
        self.h_h = nn.Linear(n_hidden,n_hidden)
        self.h_o = nn.Linear(n_hidden,vocab_sz)
    
    def forward(self,x):
        h = 0
        num_cols = x.shape[1]
        for i in range(num_cols):
            h = h + self.i_h(x[:,i]) # Access the Emb vector for the respective column
            h = self.h_h(h) # Connect to a Linear NN
            h = F.relu(h) # Our Non - liearization
        h = self.h_o(h) # Pass to the output layer
        return h

### Training

In [15]:
# Baseline model of always predicting the mode of the tokens

n,counts = 0,torch.zeros(len(vocab))
for x,y in dls.valid:
    n += y.shape[0]
    for i in range_of(vocab): counts[i] += (y==i).long().sum()
idx = torch.argmax(counts)
idx, vocab[idx.item()], counts[idx].item()/n

(tensor(30), 'xtit', 0.11856417693981146)

The baseline model of always predicting the `xtit` token gives an accuracy of 11.9%. <br>
This seems high but it might be due to the fact that we are dealing with a news dataset which means that there are a lot of names and as such lots of `Titles`

In [16]:
learn = Learner(dls, MyLLM2(len(vocab), 64), loss_func=F.cross_entropy, 
                metrics=accuracy)
learn.fit_one_cycle(4, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,6.666971,6.578431,0.126541,00:12
1,5.596445,6.450187,0.140863,00:11
2,5.209266,6.52238,0.149384,00:12
3,5.087563,6.569466,0.147208,00:12


This is barely better than the baseline model. Let's try a wider model

In [17]:
learn = Learner(dls, MyLLM2(len(vocab), 1024), loss_func=F.cross_entropy, 
                metrics=accuracy)
learn.fit_one_cycle(4, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,6.059157,6.093016,0.177484,01:59
1,3.760472,6.491498,0.187999,01:59
2,2.488567,7.159233,0.194344,01:59
3,2.60358,7.664039,0.195613,02:01


Significant improvement but still not the what I would be expecting <br>
**NOTE:** 
> The validation loss is actually increasing while the accuracy increases. This means might mean that the corss entropy loss function is not the best metric but might still be a good loss function


### Maintaining the state of the RNN

In [19]:
class MyLLM3(Module):
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz,n_hidden)
        self.h_h = nn.Linear(n_hidden,n_hidden)
        self.h_o = nn.Linear(n_hidden,vocab_sz)
        self.h = 0
    
    def forward(self,x):
        num_cols = x.shape[1]
        for i in range(num_cols):
            self.h = self.h + self.i_h(x[:,i]) # Access the Emb vector for the respective column
            self.h = self.h_h(self.h) # Connect to a Linear NN
            self.h = F.relu(self.h) # Our Non - liearization
        out = self.h_o(self.h) # Pass to the output layer
        
        self.h = self.h.detach()
        return out
    
    def reset(self):
        self.h = 0

In [20]:
m = len(seqs)//bs
m,bs,len(seqs)

(430, 64, 27578)

In [21]:
def group_chunks(ds, bs):
    m = len(ds) // bs
    new_ds = L()
    for i in range(m): new_ds += L(ds[i + m*j] for j in range(bs))
    return new_ds

In [22]:
cut = int(len(seqs) * 0.8)
dls = DataLoaders.from_dsets(
    group_chunks(seqs[:cut], bs), 
    group_chunks(seqs[cut:], bs), 
    bs=bs, drop_last=True, shuffle=False)

### Training

In [25]:
learn = Learner(dls, MyLLM3(len(vocab), 1024), loss_func=F.cross_entropy, 
                metrics=accuracy,cbs=ModelResetter)
learn.fit_one_cycle(4, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,5.861144,6.101082,0.180959,01:59
1,3.603089,6.591463,0.190589,01:58
2,2.2981,7.09848,0.198946,02:13
3,2.330018,7.37449,0.201126,02:12


Just slightly better but not much improvements from the `LLM2` model

### Creating more signal

In [26]:
sl = 16
seqs = L((tensor(nums[i:i+sl]), tensor(nums[i+1:i+sl+1]))
         for i in range(0,len(nums)-sl-1,sl))
cut = int(len(seqs) * 0.8)
dls = DataLoaders.from_dsets(group_chunks(seqs[:cut], bs),
                             group_chunks(seqs[cut:], bs),
                             bs=bs, drop_last=True, shuffle=False)

Looking at the first element of `seqs`, we can see that it contains two lists of the same size. The second list is the same as the first, but offset by one element:

In [27]:
class MyLLM4(Module):
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz,n_hidden)
        self.h_h = nn.Linear(n_hidden,n_hidden)
        self.h_o = nn.Linear(n_hidden,vocab_sz)
        self.h = 0
    
    def forward(self,x):
        num_cols = x.shape[1]
        outs = []
        for i in range(num_cols):
            self.h = self.h + self.i_h(x[:,i]) # Access the Emb vector for the respective column
            self.h = self.h_h(self.h) # Connect to a Linear NN
            self.h = F.relu(self.h) # Our Non - liearization
            outs.append(self.h_o(self.h)) # Pass to the output layer
        
        self.h = self.h.detach()
        return torch.stack(outs, dim=1)
    
    def reset(self):
        self.h = 0

In [28]:
def loss_func(inp, targ):
    return F.cross_entropy(inp.view(-1, len(vocab)), targ.view(-1))

In [29]:
learn = Learner(dls, MyLLM4(len(vocab), 1024), loss_func=loss_func, 
                metrics=accuracy,cbs=ModelResetter)
learn.fit_one_cycle(4, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,6.721286,5.996181,0.184937,02:00
1,5.411833,5.813415,0.213867,01:53
2,4.559037,5.83834,0.220093,01:54
3,4.067402,5.856371,0.225464,01:55


In [30]:
def sample_pred(length,start=95):
    to_pred = nums[start:start+length]
    ans = nums[start+1:start+length+1]
    
    result = learn.predict(tensor([to_pred]))[0].argmax(1)
    print(result,ans)

    prompt = ""
    for i in range(len(to_pred)):
        prompt += vocab[to_pred[i]] +' '
#     print(prompt)
    prompt += " <>  "

    for i in range(len(result)):
        prompt += vocab[result[i]] + ' '
#     return prompt
    print(prompt)
    
    prompt = ' '.join(prompt.split(' ')[:len(ans)])
    prompt += "  <>  "

    for i in range(len(ans)):
        prompt += vocab[ans[i]] + ' '
    print(prompt)

In [31]:
sample_pred(10,80000)

tensor([ 317,   53,  371, 3210,   29,   30, 2416,   30,   30,   78]) [513, 74, 1, 10144, 10149, 11, 2554, 24, 89, 78]
kutoa dawa kwa xupp waviu unasambaa nchi nzima , ili  <>  huduma za ajili atcl . xtit wanachama xtit xtit kuhakikisha 
kutoa dawa kwa xupp waviu unasambaa nchi nzima , ili  <>  dawa kwa xupp waviu unasambaa nchi nzima , ili kuhakikisha 


I am actually very surprised. Like I am wowed already.

The original text is " of the Government of the Fifth Phase is to promote "<br>
The predicted text is " of Tanzania's Fifth Phase under promotion "

<b>NB</b>: 
<li> Translation done via Google Translate </li>
<li> This prediction was done on the training set (But it is still remarkable I believe) </li>

### Multi-Layer RNNs

In [32]:
class MyLLM5(Module):
    def __init__(self, vocab_sz, n_hidden,n_layers):
        self.i_h = nn.Embedding(vocab_sz,n_hidden)
        
        self.network = []
        for i in range(n_layers):
            self.network.append(nn.Linear(n_hidden,n_hidden))
            self.network.append(nn.ReLU())
        self.h_h = nn.Sequential(
            *self.network[:-1]
        )
#         self.h_h = nn.Linear(n_hidden,n_hidden)
        self.h_o = nn.Linear(n_hidden,vocab_sz)
        self.h = 0
    
    def forward(self,x):
        num_cols = x.shape[1]
        outs = []
        for i in range(num_cols):
            self.h = self.h + self.i_h(x[:,i]) # Access the Emb vector for the respective column
            self.h = self.h_h(self.h) # Connect to a Linear NN
            self.h = F.relu(self.h) # Our Non - liearization
            outs.append(self.h_o(self.h)) # Pass to the output layer
        
        self.h = self.h.detach()
        return torch.stack(outs, dim=1)
    
    def reset(self):
        self.h = 0

In [33]:
learn = Learner(dls, MyLLM5(len(vocab), 1024,4), loss_func=loss_func, 
                metrics=accuracy,cbs=ModelResetter)
learn.fit_one_cycle(5, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,7.322281,6.524433,0.123474,02:14
1,6.249574,6.13888,0.174561,02:20
2,5.706321,6.077151,0.194214,02:20
3,5.39984,5.974981,0.206482,02:26
4,5.192619,5.94163,0.208313,02:25


In other words

In [34]:
class MyLLM6(Module):
    def __init__(self, vocab_sz, n_hidden,n_layers):
        self.i_h = nn.Embedding(vocab_sz,n_hidden)
        self.rnn = nn.RNN(n_hidden,n_hidden, n_layers, batch_first=True)
        self.h_o = nn.Linear(n_hidden,vocab_sz)
        self.h = torch.zeros(n_layers, bs, n_hidden) # Some changes here
    
    def forward(self,x):
        res,h = self.rnn(self.i_h(x), self.h)
        self.h = h.detach()
        return self.h_o(res)
    
    def reset(self):
        self.h.zero_ # Some changes here

In [35]:
learn = Learner(dls, MyLLM6(len(vocab), 64,8), loss_func=loss_func, 
                metrics=accuracy,cbs=ModelResetter)
learn.fit_one_cycle(40, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,9.168292,9.10016,0.000244,00:14
1,8.904344,8.627542,0.123413,00:15
2,8.302516,7.801092,0.123413,00:16
3,7.472072,7.004415,0.123413,00:15
4,6.802113,6.654023,0.123413,00:16
5,6.505718,6.649345,0.123413,00:16
6,6.416265,6.68472,0.123413,00:16
7,6.392974,6.720998,0.123413,00:16
8,6.388571,6.754602,0.123413,00:16
9,6.388556,6.784073,0.123413,00:16


In [36]:
learn = Learner(dls, MyLLM6(len(vocab), 1024,4), loss_func=CrossEntropyLossFlat(), # Change in the loss function
                metrics=accuracy,cbs=ModelResetter)
learn.fit_one_cycle(15, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,7.206618,6.632511,0.123535,01:41
1,6.505537,6.335779,0.148804,01:41
2,6.064344,6.2585,0.156311,01:41
3,5.711269,6.267188,0.179321,01:42
4,5.387817,6.297754,0.187561,01:42
5,5.103578,6.343325,0.197327,01:41
6,4.852474,6.378828,0.197449,01:42
7,4.602668,6.433287,0.197998,01:42
8,4.435003,6.431808,0.202087,01:42
9,4.382489,6.385107,0.209961,01:42


Due to the problems of exploding or diasappearing activations, We will consider the use of LSTMs

### LSTM

In [37]:
class MyLLM7(Module):
    def __init__(self, ni, h):
        self.forget_gate = nn.Linear(ni + nh, nh)
        self.input_gate = nn.Linear(ni + nh, nh)
        self.cell_gate = nn.Linear(ni + nh, nh)
        self.output_gate = nn.Linear(ni + nh, nh)
    
    def forward(self, x, state):
        h,c = state
        h = torch.cat([h, x], dim=1)
        forget = torch.sigmoid(self.forget_gate(h))
        c = c * forget
        
        inputer = torchinput_gateigmoid(self.input_gate(h)) * torch.tanh(self.cell_gate(h))
        c = c + inputer
        
        outputer = torch.sigmoid(self.output_gate(h))
        h = torch.tanh(c) * outputer
        return h, (h,c)

Refactoring the above for increase in GPU performance

In [38]:
class MyLLM8(Module):
    def __init__(self, ni, h):
        self.ih = nn.Linear(ni,4*nh)
        self.hh = nn.Linear(nh,4*nh)
    
    def forward(self, x, state):
        h,c = state
        
        gates = (self.ih(x) + self.hh(h)).chunk(4,1)
        ingate, forgetgate, outgate = map(torch.sigmoid, gates[:3]) ## Note: order does not matter here
        cellgate = torch.tanh(gates[3])

        c = forgetgate*c + ingate*cellgate
        h = outgate * torch.tanh(c)
        return h, (h,c)

Now the LTSM network

In [39]:
class MyLLM9(Module):
    def __init__(self, vocab_sz, n_hidden,n_layers):
        self.i_h = nn.Embedding(vocab_sz,n_hidden)
        self.rnn = nn.LSTM(n_hidden,n_hidden, n_layers, batch_first=True)
        self.h_o = nn.Linear(n_hidden,vocab_sz)
        self.h = [torch.zeros(n_layers, bs, n_hidden) for _ in range(2)] # Some changes here
    
    def forward(self,x):
        res,h = self.rnn(self.i_h(x), self.h)
        self.h = [h_.detach() for h_ in h]
        return self.h_o(res)
    
    def reset(self):
        for h in self.h:
            h.zero_ # Some changes here

In [40]:
learn = Learner(dls, MyLLM9(len(vocab), 1024, 4), 
                loss_func=CrossEntropyLossFlat(), 
                metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(15, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,8.04661,6.677588,0.123413,03:16
1,6.86625,6.771533,0.123413,03:33
2,6.603065,6.793954,0.123413,04:52
3,6.404156,6.493436,0.143433,04:25
4,6.020103,6.270433,0.157898,04:20
5,5.679213,6.216846,0.168457,04:10
6,5.389202,6.256194,0.179688,03:27
7,5.135345,6.263578,0.177368,03:19
8,4.89442,6.385754,0.181824,03:23
9,4.677739,6.367319,0.18396,03:21


### Regularizing
We will add the following methods as seen in the paper taught by Jeremy;
<li> Dropout </li>
<li> Activation Regularization (AR) and Temporal Activation Regularization (TAR) </li>
<li> Weight Tying </li>

The LSTM that uses these techniques are called AWD-LSTM according to the author.

In [41]:
class MyLLM10(Module):
    def __init__(self, vocab_sz, n_hidden,n_layers,p):
        self.i_h = nn.Embedding(vocab_sz,n_hidden)
        self.rnn = nn.LSTM(n_hidden,n_hidden, n_layers, batch_first=True)
        self.drop = nn.Dropout(p) # Dropout
        self.h_o = nn.Linear(n_hidden,vocab_sz)
        self.h_o.weight = self.i_h.weight # Weight Tying
        self.h = [torch.zeros(n_layers, bs, n_hidden) for _ in range(2)] # Some changes here
        
    def forward(self,x):
        res,h = self.rnn(self.i_h(x), self.h)
        out = self.drop(res)
        self.h = [h_.detach() for h_ in h]
        return self.h_o(out),res,out
    
    def reset(self):
        for h in self.h:
            h.zero_ # Some changes here

In [43]:
learn = Learner(dls, MyLLM10(len(vocab), 1024, 4, 0.5), 
                loss_func=CrossEntropyLossFlat(), 
                metrics=accuracy, 
                cbs=[ModelResetter,RNNRegularizer(alpha=2,beta=1)]
               )

# OR
learn = TextLearner(dls, MyLLM10(len(vocab), 1024, 4, 0.4),
                   loss_func = CrossEntropyLossFlat(), metrics=accuracy)

learn.fit_one_cycle(15, 1e-3, wd=0.1)

epoch,train_loss,valid_loss,accuracy,time
0,7.547215,7.143775,0.12738,06:17
1,7.140987,6.830136,0.160583,06:12
2,6.594021,6.263524,0.202454,06:14
3,6.053841,6.107059,0.21936,06:17
4,5.474316,6.181339,0.220276,05:51
5,4.827559,6.608763,0.217346,05:20
6,4.112146,7.035563,0.21936,06:06
7,3.345952,7.569582,0.214539,06:12
8,2.581167,8.133468,0.213135,05:24
9,1.884978,8.67447,0.213684,04:48


In [44]:
to_p_nums = nums[-994:-970]
to_p_nums = list(dls.valid.dataset[0][0])# + list(dls.valid.dataset[1][0])

In [45]:
to_p = tensor([to_p_nums[:-1]]*64)

prediction = learn.predict(to_p)#,[58]*64)])
for p in prediction[0]:
    print(vocab[p],end=' ')

ya kwa wa operesheni , na mkutano na ccm , lakini yake nafasi na nafasi 

In [46]:
for a in to_p_nums[1:]:
    print(vocab[a],end= ' ')

zinazoundwa wakati wa uchaguzi kuwa za maadili ya uchaguzi , kamati ya rufaa , kamati 