In [2]:
import random
import pandas as pd
import nltk
import numpy as np
import re
import os
from collections import defaultdict
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

from math import log2
vocabulary = ' !"\'(),-.0123456789:;?abcdefghijklmnopqrstuvwxyz'

In [3]:
#mount
from google.colab import drive 
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
#move to current working directory
work_dir = '/content/drive/My Drive/Colab Notebooks/NLP/Bantu Language Modeling/'
os.chdir(work_dir)
%ls

 BantuLM.ipynb            model_cwe.h5   [0m[01;34mtest-04[0m/
'Copy of BantuLM.ipynb'   model_sw.h5    [01;34mtrain-04[0m/


In [5]:
#Removing multiple dots and punctuations from training text. was mostly seen only in the swahili text.
def preprocess_data(data):
  text_replaced = re.sub(r'\.+', ".", data)
  text_replaced = re.sub(r'\!+', "!", text_replaced)
  return text_replaced


def load_file(fileName):
  file = open('train-04/'+fileName, 'r')
  data = file.read()
  file.close()
  data = preprocess_data(data)
  return data

traindata_sw = load_file('sw-train.txt')
traindata_cwe = load_file('cwe-train.txt')
len(traindata_sw),len(traindata_cwe)



(39120830, 603432)

In [6]:
#splitting the training data into sentences based on the end of line punctuation
sequences_sw = [list(sent+'.') for sent in traindata_sw.split('. ') if len(sent) > 0]
sequences_sw=sequences_sw[:-1]
#splitting sentence dataset into training and validation
split_index = int(len(sequences_sw) * 0.9)
train_sequences_sw = sequences_sw[:split_index]
val_sequences_sw = sequences_sw[split_index:]    

print(len(train_sequences_sw), len(val_sequences_sw))

sequences_cwe = [list(sent+'.') for sent in traindata_cwe.split('. ') if len(sent) > 0]
sequences_cwe=sequences_cwe[:-1]
#splitting sentence dataset into training and validation
split_index = int(len(sequences_cwe) * 0.9)
train_sequences_cwe = sequences_cwe[:split_index]
val_sequences_cwe = sequences_cwe[split_index:]    

print(len(train_sequences_cwe), len(val_sequences_cwe))

303989 33777
4188 466


In [8]:
#beginning of from scratch implementation
#Creating the tuple of ngrams in dataset
def calc_ngrams(sentence, n,freq_dict):
  #print(sentence)
  #if n ==1:
  #  for i in range(len(sentence)):
  #    freq_dict[sentence[i]]+=1
  #else:
  for i in range(len(sentence)-n+1):
    freq_dict[tuple(sentence[i:(i+n)])]+=1
  return freq_dict
  
def create_ngrams(dataset,N):
  list_dict = []    #list of dictionaries with ngram counts
  #create a dictionary for every N gram and append to list 
  for i in range(N):
    n_freq_dict = defaultdict(int)
    for sent in dataset:
      temp_sent= sent
      #inserting a start of sentence character.
      if sent[0]!='<s>':
        sent.insert(0,'<s>')
      n_freq_dict = calc_ngrams( sent[i:],i+1 ,n_freq_dict)
    list_dict.append(n_freq_dict)

  return list_dict

#calculate Ngram counts for Swahili
list_counts_sw = create_ngrams(train_sequences_sw,N_sw)

In [10]:
#calculate Ngram counts for kwere
list_counts_cwe = create_ngrams(train_sequences_cwe,N_cwe)

In [13]:
#Calculate probabilities using add alpha Smoothing 
#alpha was varied for different values. alpha = 0.75 seemed to give the best results. 
def ngram_prob(list_counts,alpha = 0.75):
  V = len(vocabulary)  
  list_prob = []
  for i in range(len(list_counts)):
    prob_dict= defaultdict(int)
    #calculate probablities for each N gram character sequences with smoothing
    if i==0: 
      prob_dict = {k: (v+alpha)/(sum(list_counts[i].values())+ alpha*V) for k,v in list_counts[i].items()}
    else:
      prob_dict = {k: (v+alpha)/(list_counts[i-1][(k[1:])]+ alpha*V) for k,v in list_counts[i].items()}
    list_prob.append(prob_dict)
  
  return list_prob
#calculate Ngram probabilties for Swahili
list_prob_sw = ngram_prob(list_counts_sw)

In [None]:
#calculate Ngram probabilties for kwere
list_prob_cwe =ngram_prob(list_counts_cwe)

In [12]:
#Calculations for simple Interpolation for model 
#This function uses static values of lambda to calculate the probablities for a character based on N -1 history. 
#the probabilities are then used to calculate loss for each character. 
#The following link was referred to for the calculations https://github.com/annieyan/language_model/blob/master/yanan_lm.py
def interpolatedProbability(test_set,list_prob, lambdas,N):
  count = 0 
  p=0
  loss_from_scratch=0
  for sent in test_set:
    # insert SOS characters to match training data
    if sent[0]!='<s>':
      sent.insert(0,'<s>') 
    for i in range(len(sent)):
      p=0 
      for j in range(N):
        if i >=j:
          #Calculating the probability of the character given N history. 
          #Since the start of sentence symbol is inserted, i+1 is used below to start from the first character.
          #print(list_prob[j].get((tuple(sent[i-j:i+1])),0),(tuple(sent[i-j:i+1])))
          #P(c)*Lambda1 + P(c|c-1) * lambda2 + P(c|c-1,c-2) * lambda2 ...... 
          p += (float(lambdas[j]) * list_prob[j].get((tuple(sent[i-j:i+1])),0))
      loss_from_scratch -= log2(p)
      count+=1
  return loss_from_scratch, count


In [14]:
#define order of Ngram model
N_sw =8
lambdas1 = [0.1,0.1,0.1,0.1,0.1,0.1,0.2,0.2]
#calculate Ngram counts for Swahili
list_counts_sw = create_ngrams(train_sequences_sw,N_sw)
list_prob_sw = ngram_prob(list_counts_sw)

log_loss_sw,count_sw = interpolatedProbability(val_sequences_sw,list_prob_sw, lambdas1,N_sw)
log_loss_sw/count_sw

2.1866683546841745

In [28]:
#define order of Ngram model
N_sw =10
#calculate Ngram counts for Swahili
list_counts_sw = create_ngrams(train_sequences_sw,N_sw)
list_prob_sw = ngram_prob(list_counts_sw)

#Calculating entropy for Swahili validation text. 
#Different distributions of lambda were used. The lambdas1 value gave the best value for entropy in the experimentation
lambdas1 = [0.05,0.05,0.05,0.05,0.1,0.1,0.1,0.1,0.2,0.2]
lambdas2 = [0.2,0.2,0.1,0.1,0.1,0.1,0.05,0.05,0.05,0.05]

log_loss_sw,count_sw = interpolatedProbability(val_sequences_sw,list_prob_sw, lambdas2,N_sw)
log_loss_sw/count_sw

2.3611536105375546

In [16]:
#define order of Ngram model
N_cwe =8
#calculate Ngram counts for kwere
list_counts_cwe = create_ngrams(train_sequences_cwe,N_cwe)

#calculate Ngram probabilties for kwere
list_prob_cwe =ngram_prob(list_counts_cwe)
lambdas1 = [0.1,0.1,0.1,0.1,0.1,0.1,0.2,0.2]

log_loss_cwe,count_cwe = interpolatedProbability(val_sequences_cwe,list_prob_cwe,lambdas1,N_cwe)
log_loss_cwe/(count_cwe)

2.4319636104557056

In [26]:
#define order of Ngram model
N_cwe =10
#calculate Ngram counts for kwere
list_counts_cwe = create_ngrams(train_sequences_cwe,N_cwe)

#calculate Ngram probabilties for kwere
list_prob_cwe =ngram_prob(list_counts_cwe)
#Calculating entropy for Swahili validation text. 
lambdas1 = [0.05,0.05,0.05,0.05,0.1,0.1,0.1,0.1,0.2,0.2]
lambdas2 = [0.2,0.2,0.1,0.1,0.1,0.1,0.05,0.05,0.05,0.05]

log_loss_cwe,count_cwe = interpolatedProbability(val_sequences_cwe,list_prob_cwe,lambdas2,N_cwe)
log_loss_cwe/(count_cwe)

2.481591813484035

In [18]:
#Beginning of Anything goes
#redoing this since I dont want the start of sentence symbols for the CNN 
sequences_model_sw = [list(sent+'.') for sent in traindata_sw.split('. ') if len(sent) > 0]
sequences_model_sw=sequences_model_sw[:-1]      #remove the last line since its a newline character
#splitting sentence dataset 
split_index = int(len(sequences_model_sw) * 0.9)
train_model_sw = sequences_model_sw[:split_index]
val_model_sw = sequences_model_sw[split_index:]    

print(len(train_model_sw), len(val_model_sw))

sequences_model_cwe = [list(sent+'.') for sent in traindata_cwe.split('. ') if len(sent) > 0]
sequences_model_cwe=sequences_model_cwe[:-1]
#splitting sentence dataset 
split_index = int(len(sequences_model_cwe) * 0.9)
train_model_cwe = sequences_model_cwe[:split_index]
val_model_cwe = sequences_model_cwe[split_index:]    

print(len(train_model_cwe), len(val_model_cwe))

303989 33777
4188 466


In [19]:
#Since we have fixed vocabulary of characters.
vocabulary = list(vocabulary)

vocabulary.append('<PAD>')
#convert the vocabulary to indexes
char_idx = {c: i for i, c in enumerate(vocabulary)} 

In [20]:
#batch generator code for the text
#This was done since the training set generated was too high to send to the CNN model directly. 
#Some of the code for generator is adapated from Dr.Scannells code and the logic for creating sequences is taken from - https://machinelearningmastery.com/develop-character-based-neural-language-model-keras/

def batch_generator(data,batch_size,max_length): 
  #use the sequences and map them to indices to create the training set
  curr_seq2idx = [[char_idx[c[0]] for c in s] for s in data] 
  i = 0
  while True:
    X=[]
    y=[]
    sequences = list()
    labels = list()
    while len(labels) < batch_size:
      if i == len(data):
          i=0           
      # create line-based sequences
      line = curr_seq2idx[i]
      line.insert(0,char_idx['<PAD>'])
      
      #Creating a sliding window sequence set for each sentence.
      #where the next character is the prediction label 
      for j in range(0, len(line)):
        sequence = line[:j+1]
        #print(i)
        if j < len(line)-1:
          label=line[j+1]
        else:
          label=char_idx['<PAD>']       #end of sentence will have padded character as label
        print(sequence,label)
        sequences.append(sequence)
        labels.append(label)
      #print('Total Sequences: %d' % len(sequences),len(labels))
      i+=1
     # pre padding input sequences with <PAD> character
    sequences = pad_sequences(sequences, maxlen=max_length, padding='pre',value=char_idx['<PAD>'])
    # split into input and output elements
    X = np.array(sequences)
    y = to_categorical(labels, num_classes=len(vocabulary))
    #print(X.shape,y.shape)
    yield (X,y)

In [21]:
#Building the Training model using Keras
#This model has been taken from an online code but I cant find the reference link to it anymore.
from tensorflow.keras.layers import Dense, Embedding, Activation, Flatten,LSTM,Convolution1D,MaxPooling1D
from tensorflow.keras import Sequential,Model
max_len =100
def create_model():
  model = Sequential()
  model.add(Embedding(input_dim=len(vocabulary), output_dim=100, input_length=max_len))
  model.add(Convolution1D(filters=32, kernel_size=3, padding='same', activation='relu'))
  model.add(MaxPooling1D(pool_size=2))
  model.add(LSTM(100))
  model.add(Dense(len(vocabulary), activation='softmax'))
  model.compile(optimizer="adam", loss="categorical_crossentropy")
  model.summary()
  return model 

In [None]:
#the is the fit_generator method which splits the data in batches using the batchsize for quicker training purposes 
#Model training for Swahili text
model_sw = create_model()
batch_size = 1024
totalnum =len(train_model_sw)
print(len(char_idx))
#model.fit_generator(example_generator_file(train_model_sw[:1000],batch_size, totalnum, char_idx, max_len), steps_per_epoch=totalnum//batch_size, epochs=2, verbose=1)
model_sw.fit_generator(batch_generator(train_model_sw,batch_size,max_len),steps_per_epoch=totalnum//batch_size, epochs=100, verbose=1)

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 100, 100)          4900      
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 100, 32)           9632      
_________________________________________________________________
max_pooling1d_9 (MaxPooling1 (None, 50, 32)            0         
_________________________________________________________________
lstm_9 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_9 (Dense)              (None, 49)                4949      
Total params: 72,681
Trainable params: 72,681
Non-trainable params: 0
_________________________________________________________________
49
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch

<tensorflow.python.keras.callbacks.History at 0x7f286d277ba8>

In [None]:
# save the model to file
model_sw.save('model_sw.h5')

#evaluating the loss on validation set for Swahili
model_sw.evaluate_generator(batch_generator(val_model_sw,batch_size,max_len), 1000,verbose=1)



1.650132179260254

In [None]:
#Prediction on validation set
#Sends the data in batches and the predictions are retrieved from each batch
#the prediction method below is adapted from - https://github.com/jsilter/dbpedia_classify/blob/part1/keras_text_classification.py

batch_size =1024

pred_generator_sw = batch_generator(val_model_sw,batch_size,max_len)
max_to_pred_sw = len(val_model_sw)
num_predded_sw = 0
pred_res_sw = []
for pred_inputs in pred_generator_sw:
    X_val, y_val = pred_inputs
    y_pred = model.predict(X_val)

    #offset = num_predded
    num_predded_sw += len(y_pred)

    for y in y_pred:
      pred_res_sw.append(y)
    #print(pred_res)
    #if data already predicted is greater than length of validation set, then break loop
    #this is done so that the prediction calculations are not repeated on the data
    if (num_predded_sw + batch_size) > max_to_pred_sw:
      break

loss_from_val = 0
for i in range (len(pred_res_sw)-1):
  p = max(pred_res_sw[i])
  #print(i)
  loss_from_val -= log2(p)

#Cross entropy on Swahili validation set
loss_from_val/len(pred_res_sw)

1.3240622709055145

In [None]:
#Model training for Swahili text
model_cwe= create_model()
batch_size = 1024
totalnum =len(train_model_cwe)
print(len(char_idx))
#model.fit_generator(example_generator_file(train_model_sw[:1000],batch_size, totalnum, char_idx, max_len), steps_per_epoch=totalnum//batch_size, epochs=2, verbose=1)
model_cwe.fit_generator(batch_generator(train_model_cwe,batch_size,max_len),steps_per_epoch=totalnum//batch_size, epochs=200, verbose=1)

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 100, 100)          4900      
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 100, 32)           9632      
_________________________________________________________________
max_pooling1d_10 (MaxPooling (None, 50, 32)            0         
_________________________________________________________________
lstm_10 (LSTM)               (None, 100)               53200     
_________________________________________________________________
dense_10 (Dense)             (None, 49)                4949      
Total params: 72,681
Trainable params: 72,681
Non-trainable params: 0
_________________________________________________________________
49
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoc

<tensorflow.python.keras.callbacks.History at 0x7f26a5fe2b70>

In [None]:
model_cwe.save('model_cwe.h5')
model_cwe.evaluate_generator(batch_generator(val_model_cwe,batch_size,max_len), len(val_model_cwe),verbose=1)



1.688645362854004

In [None]:
#Prediction on validation set
#Its the same as Swahili validation set prediction 
batch_size =1024

pred_generator_cwe = batch_generator(val_model_sw,batch_size,max_len)
max_to_pred = len(val_model_cwe)
num_predded_cwe = 0
pred_res_cwe = []
for pred_inputs in pred_generator_cwe:
    X_val, y_val = pred_inputs
    y_pred = model_cwe.predict(X_val)

    offset = num_predded_cwe
    num_predded_cwe += len(y_pred)

    for y in y_pred:
      pred_res_cwe.append(y)
    #print(pred_res)
    if (num_predded_cwe + batch_size) > max_to_pred:
      break
      
loss_val_cwe = 0
for i in range (len(pred_res_cwe)-1):
  p = max(pred_res_cwe[i])
  #print(i)
  loss_val_cwe -= log2(p)

#Cross entropy on kwere validation set
loss_val_cwe/len(pred_res_cwe)

1.4753879568720603

In [None]:
# returns a probability in (0,1)
# return values must sum to 1.0 over all possible characters c
def from_scratch_model(lang, test_sequences):
  #same as validation lambda
  lambdas1 = [0.5,0.5,0.5,0.5,0.1,0.1,0.1,0.1,0.2,0.2]
  if lang == 'sw':
    log_loss,count = interpolatedProbability(test_sequences,list_prob_sw, lambdas1,N_sw)
  else:
    log_loss,count = interpolatedProbability(test_sequences,list_prob_cwe, lambdas1,N_cwe)

  return log_loss,count

In [None]:
# returns a probability in (0,1)
# return values must sum to 1.0 over all possible characters c
def anything_goes_model(lang, test_sequences):
  #Prediction on validation set
  batch_size =1024
  max_len =100
  pred_generator = batch_generator(test_sequences,batch_size,max_len)
  max_to_pred = len(test_sequences)
  num_predded = 0
  pred_res = []
  for pred_inputs in pred_generator:
      X_val, y_val = pred_inputs
      if lang == 'cwe':
        y_pred = model_cwe.predict(X_val)
      else:
        y_pred = model_sw.predict(X_val)

      #offset = num_predded_cwe
      num_predded += len(y_pred)

      for y in y_pred:
        pred_res.append(y)
      #print(pred_res)
      if (num_predded + batch_size) > max_to_pred:
        break
          
  loss_test = 0
  for i in range (len(pred_res)-1):
    p = max(pred_res[i])
    #print(i)
    loss_test -= log2(p)
  return loss_test,len(pred_res)

In [None]:
def evaluate_one(lang):
  testfile = open('test-04/'+lang+'-test.txt', 'r')
  max_history = 100
  history = []
  loss_anything_goes = 0
  loss_from_scratch = 0
  count_a =0 
  count_s = 0
  test_data = testfile.read()
  #same preprocess as training
  test_data = preprocess_data(test_data)

  #splitting into sentences similar to training data
  test_sequences = [list(sent+'.') for sent in test_data.split('. ') if len(sent) > 0]
  test_sequences=test_sequences[:-1]
  loss_anything_goes,count_a = anything_goes_model(lang,test_sequences)
  loss_from_scratch,count_s = from_scratch_model(lang,test_sequences)
  
  return [loss_from_scratch/count_s, loss_anything_goes/count_a]
     

In [None]:
def evaluate():
  ans = evaluate_one('cwe')
  ans.extend(evaluate_one('sw'))
  return ans

In [None]:
evaluate()

[1.0264376538288342, 1.377534831885082, 0.9566084799931058, 1.1896897431879887]