### IMPORTING LIBRARIES

In [30]:
import torch 
from torch.utils.data import Dataset
from torchtext.data.utils import get_tokenizer

## N-GRAM MODEL

In [25]:
def _read_sentences(path):
    """Return every line of the UTF-8 text file `path`, without newline characters.

    The encoding is explicit so the result does not depend on the platform
    default; NGramDataset reads the same files as UTF-8.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return [sentence.strip('\n') for sentence in f]


# Eagerly load each split and preview its size and first ten sentences.
train_dataset = _read_sentences('train.txt')
print('Lenght of train dataset: ',len(train_dataset))
print(train_dataset[:10])
valid_dataset = _read_sentences('valid.txt')
print('Lenght of valid dataset: ',len(valid_dataset))
print(valid_dataset[:10])
test_dataset = _read_sentences('test.txt')
print('Length of test dataset: ',len(test_dataset))
print(test_dataset[:10])

Lenght of train dataset:  62475
['what is the most important lesson life has taught you', 'is there anything that has made you unhappy these days', 'now why you ask would i be writing about this', 'he put another interesting twist on the conversation with this', 'look out for details of our next sponsor for march', 'i have decided i would like to accept the responsibilities', 'why are we made to remember them above all else', 'thanks to all the art directors for the great topics', 'they are each , or for the set of .', 'no wonder you rise in the middle of the night']
Lenght of valid dataset:  7809
['no matter how hard i try to blind the light', 'in the car e talking to her brother j boy', 'she came downstairs a minimum of times between and .', 'to bring even more fun to our weekly challenges so', 'i want you to go with your pa to the', 'hopefully i can quickly put one together this coming monday', 'so here were the top stops for this boston visit', 'and he will make the face of heaven 

In [31]:
# Tokenizer used to split each sentence before vocabulary lookup.
# NOTE(review): presumably needs the spaCy 'en_core_web_md' model installed locally — confirm.
tokenizer = get_tokenizer('spacy',language='en_core_web_md')




  _C._set_default_tensor_type(t)


In [26]:
# Load the prebuilt vocabulary object, then show its size and its first 20 tokens.
# NOTE(review): torch.load unpickles the file, and unpickling can execute arbitrary
# code — only load a vocab.pth that comes from a trusted source.
vocab = torch.load('vocab.pth')
print(len(vocab))
print(vocab.get_itos()[:20])

10000
['<unk>', '.', ',', 'the', 'i', 'to', 'and', 'a', 'of', 'you', 'it', 'that', 'in', 'is', 'for', 'my', 'have', 'this', 'we', 'was']


* The cells above only show what the data looks like. If the dataset is large, reading it all at once with `f.readlines()` can exhaust the memory of a 16 GB machine.
* So loading the data with `f.readlines()` is not feasible for big data.
* Below I implement a dataset that lazily loads each sentence from disk on the fly, so batches are built without holding the whole file in memory.
* Each batch contains 64 sentences.
* Because each sentence has a variable length, we cannot know in advance how many (context, target) pairs a batch will contain.

In [28]:
# Drop the eagerly loaded sentence lists so they no longer occupy kernel memory.
del train_dataset, valid_dataset, test_dataset

In [88]:
class NGramDataset(Dataset):
    """Lazily serve the n-gram pairs of one text line per item.

    Only the byte offset of every line is kept in memory; the line itself is
    read from disk when the item is requested.

    Args:
        file_path: Path of a UTF-8 text file with one sentence per line.
        vocab: Callable mapping a list of tokens to a list of integer ids.
        tokenizer: Callable mapping a sentence string to a list of tokens.
        context_size: Number of ids in every context window.
        verbose: When True, print the sentence, tokens and ids of each item.
    """

    def __init__(self,file_path, vocab, tokenizer, context_size = 5,verbose = False):
        self.file_path = file_path
        self.vocab = vocab
        self.tokenizer = tokenizer
        self.context_size = context_size
        self.verbose = verbose

        # Byte offset at which each line starts, collected in one binary pass.
        self.line_offsets = []
        with open(file_path,'rb') as f:
            offset = 0
            for line in f:
                self.line_offsets.append(offset)
                offset += len(line)

    def __len__(self):
        """Return the number of lines in the file."""
        return len(self.line_offsets)

    def __getitem__(self,idx):
        """Return the list of (context, target) tensor pairs for line `idx`.

        `context` is a 1-D tensor of `context_size` ids and `target` is a 0-D
        tensor holding the id that follows them. Lines with fewer than
        `context_size + 1` ids yield an empty list.
        """
        # The offsets are byte positions, so seek on a binary handle: seeking a
        # text-mode file to anything other than a tell() result is undefined.
        with open(self.file_path,'rb') as f:
            f.seek(self.line_offsets[idx])
            line = f.readline().decode('utf-8').rstrip('\r\n')

        tokens = self.tokenizer(line)
        token_ids = self.vocab(tokens)
        if self.verbose:
            print(f'Your sentence: "{line}"')
            print(f'Your tokens: {tokens}')
            print(f'Your token_ids: {token_ids}')

        if len(token_ids) < self.context_size + 1:
            return []
        ngrams = []
        for i in range(len(token_ids)-self.context_size):
            context = token_ids[i:i+self.context_size]
            target = token_ids[i+self.context_size]
            ngrams.append((torch.tensor(context),torch.tensor(target)))
        return ngrams

In [None]:
# Build one lazily loaded n-gram dataset per split, all sharing the same
# tokenizer and vocabulary.
train_path, valid_path, test_path = 'train.txt', 'valid.txt', 'test.txt'
ngram_train_dataset, ngram_valid_dataset, ngram_test_dataset = (
    NGramDataset(file_path=split_path, tokenizer=tokenizer, vocab=vocab)
    for split_path in (train_path, valid_path, test_path)
)

In [103]:
# Simple cross check to verify if the dataset class is implemented properly
print("*****    Train       Data    *****","            ","*****    Valid       Data    *****","        ","*****    Test     Data   *****")
# Fetch item 5 once per split: every dataset[...] lookup re-reads the line from
# disk and re-tokenizes it, so indexing inside the loop repeats that work.
train_sample = ngram_train_dataset[5]
valid_sample = ngram_valid_dataset[5]
test_sample = ngram_test_dataset[5]
# zip stops at the shortest sample, so a sentence with fewer than five n-grams
# shortens the preview instead of raising IndexError.
for train_pair, valid_pair, test_pair in zip(train_sample[:5], valid_sample[:5], test_sample[:5]):
    print(train_pair,' || ',valid_pair, ' || ', test_pair)


*****    Train       Data    *****              *****    Valid       Data    *****          *****    Test     Data   *****
(tensor([  4,  16, 327,   4,  56]), tensor(53))  ||  ([1308, 1473, 0, 4, 2954], [1082])  ||  ([2747, 2844, 4, 3727, 1049], [0])
(tensor([ 16, 327,   4,  56,  53]), tensor(5))  ||  ([1473, 0, 4, 2954, 1082], [2622])  ||  ([2844, 4, 3727, 1049, 0], [4])
(tensor([327,   4,  56,  53,   5]), tensor(1021))  ||  ([0, 4, 2954, 1082, 2622], [1049])  ||  ([4, 3727, 1049, 0, 4], [1308])
(tensor([   4,   56,   53,    5, 1021]), tensor(3))  ||  ([4, 2954, 1082, 2622, 1049], [7])  ||  ([3727, 1049, 0, 4, 1308], [0])
(tensor([  56,   53,    5, 1021,    3]), tensor(4199))  ||  ([2954, 1082, 2622, 1049, 7], [46])  ||  ([1049, 0, 4, 1308, 0], [47])


In [None]:
# NOTE(review): `tokens` is not assigned in the visible notebook — confirm it is
# defined before this cell runs.
batch_size = 32
# Tokens still needed to reach the next multiple of batch_size. The modulo of
# the negated length is 0 when the length is already a multiple, so no extra
# full batch of padding is appended in that case.
Padding = (-len(tokens)) % batch_size
# Pad by repeating tokens from the start of the sequence.
tokens_pad = tokens + tokens[0:Padding]
print('original length of tokens: ',len(tokens))
print('tokens length after padding: ',len(tokens_pad))

original length of tokens:  8276
tokens length after padding:  8288


[([113, 13, 7, 6, 257], [869]),
 ([13, 7, 6, 257, 869], [3]),
 ([7, 6, 257, 869, 3], [172]),
 ([6, 257, 869, 3, 172], [157]),
 ([257, 869, 3, 172, 157], [20]),
 ([869, 3, 172, 157, 20], [1256]),
 ([3, 172, 157, 20, 1256], [8]),
 ([172, 157, 20, 1256, 8], [1015]),
 ([157, 20, 1256, 8, 1015], [2]),
 ([20, 1256, 8, 1015, 2], [293])]

In [None]:
from torch.utils.data import DataLoader 
import torch 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch(batch):
    """Collate dataset items into a (contexts, targets) pair of long tensors on `device`.

    Each item of `batch` is either a single (context, target) pair given as a
    tuple, or a list of such pairs (what NGramDataset returns for one
    sentence, possibly empty). Lists are flattened, so the number of rows in
    the result is the total number of pairs, not the number of items.

    Returns:
        contexts: tensor of shape (num_pairs, context_size).
        targets: tensor of shape (num_pairs,).
        Both are empty when the batch holds no pairs at all.
    """
    pairs = []
    for item in batch:
        if isinstance(item, list):
            # One sentence's worth of pairs; an empty list contributes nothing.
            pairs.extend(item)
        else:
            pairs.append(item)

    if not pairs:
        empty = torch.empty(0, dtype=torch.long, device=device)
        return empty.reshape(0, 0), empty

    # as_tensor accepts both plain id lists and tensors, so either item format works.
    contexts = torch.stack([torch.as_tensor(context, dtype=torch.long) for context, _ in pairs])
    # Targets may be scalars or one-element sequences; flatten each before joining.
    targets = torch.cat([torch.as_tensor(target, dtype=torch.long).reshape(-1) for _, target in pairs])
    return contexts.to(device), targets.to(device)

In [None]:
# Batch the dataset in its original order and build each batch with collate_batch.
# NOTE(review): `ngram_dataset` is not assigned in the visible notebook (only
# ngram_train_dataset / ngram_valid_dataset / ngram_test_dataset are) — confirm
# which dataset is intended here.
dataloader = DataLoader(ngram_dataset,batch_size=batch_size,shuffle=False,collate_fn = collate_batch)

In [None]:
# Number of batches the loader yields.
print(len(dataloader))
# Stop after the first iteration so `context` and `target` keep the first batch.
for context,target in dataloader:
    break

259


In [None]:
# Show the first batch: the context matrix, then the target vector.
print(context, target, sep='\n')

tensor([[ 113,   13,    7,    6,  257],
        [  13,    7,    6,  257,  869],
        [   7,    6,  257,  869,    3],
        [   6,  257,  869,    3,  172],
        [ 257,  869,    3,  172,  157],
        [ 869,    3,  172,  157,   20],
        [   3,  172,  157,   20, 1256],
        [ 172,  157,   20, 1256,    8],
        [ 157,   20, 1256,    8, 1015],
        [  20, 1256,    8, 1015,    2],
        [1256,    8, 1015,    2,  293],
        [   8, 1015,    2,  293,  455],
        [1015,    2,  293,  455,  825],
        [   2,  293,  455,  825,   12],
        [ 293,  455,  825,   12,   24],
        [ 455,  825,   12,   24,    5],
        [ 825,   12,   24,    5,    2],
        [  12,   24,    5,    2,  172],
        [  24,    5,    2,  172,   45],
        [   5,    2,  172,   45,    3],
        [   2,  172,   45,    3,   74],
        [ 172,   45,    3,   74,    4],
        [  45,    3,   74,    4,  801],
        [   3,   74,    4,  801,   10],
        [  74,    4,  801,   10,    2],


### MODEL BUILDING

In [None]:
# NOTE(review): set_seed is neither defined nor imported in the visible notebook —
# confirm it exists before this cell runs.
set_seed(42)
class NGramLanguageModel(torch.nn.Module):
    """Feed-forward n-gram language model.

    The ids of a context window are embedded, the embeddings are flattened
    into one row per window, and two linear layers with a ReLU in between
    produce one score per vocabulary entry.

    Args:
        vocab_size: Number of embedding rows and of output scores.
        embedding_dim: Width of each token embedding.
        context_size: Number of token ids per context window.
        linear_dim: Width of the hidden linear layer.
    """

    def __init__(self,vocab_size, embedding_dim, context_size,linear_dim):
        super().__init__()
        self.context_size = context_size
        self.embedding_dim = embedding_dim
        self.embeddings = torch.nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = torch.nn.Linear(context_size * embedding_dim, linear_dim)
        self.linear2 = torch.nn.Linear(linear_dim, vocab_size)
        self.init_weights()

    def init_weights(self):
        """Draw every weight uniformly from [-0.5, 0.5] and zero both biases."""
        bound = 0.5
        # Weights are redrawn in declaration order: embeddings, linear1, linear2.
        for layer in (self.embeddings, self.linear1, self.linear2):
            layer.weight.data.uniform_(-bound, bound)
        for layer in (self.linear1, self.linear2):
            layer.bias.data.zero_()

    def forward(self,inputs):
        """Return scores of shape (rows, vocab_size) for the context ids in `inputs`."""
        flat_width = self.context_size * self.embedding_dim
        # Concatenate the embeddings of each window into a single row.
        flattened = self.embeddings(inputs).reshape(-1, flat_width)
        hidden = torch.nn.functional.relu(self.linear1(flattened))
        return self.linear2(hidden)

In [None]:
embedding_dim = 16
linear_dim = 64
# vocab_len and context_size are not assigned anywhere earlier in the visible
# notebook, so derive them from objects that are: one output score per
# vocabulary entry, and a context width equal to that of the batch being fed in.
vocab_len = len(vocab)
context_size = context.shape[-1]
model = NGramLanguageModel(vocab_len,embedding_dim, context_size,linear_dim)
# Forward the first batch through the untrained model as a shape check.
out = model(context)
print(out.shape)

torch.Size([32, 1364])


In [None]:
# Highest-scoring vocabulary id for each row, shown above the true targets.
predictions = out.argmax(dim=1)
print(predictions, target, sep='\n')

tensor([ 375,  470,  908,  337,  415,  703, 1154, 1213,  255, 1255,  118,  775,
         422,  422,   93, 1262,  542,  477, 1056, 1228, 1257,  234,  415,  917,
        1023,  554, 1112,  201,  812,  908, 1239,  310])
tensor([ 869,    3,  172,  157,   20, 1256,    8, 1015,    2,  293,  455,  825,
          12,   24,    5,    2,  172,   45,    3,   74,    4,  801,   10,    2,
         121,  144,    3,   13,    1,   12,   13,  391])


In [None]:
def auto_write(model, paragraph,index_to_token = None,context_size=5, number_of_words = 200):
    """Greedily extend `paragraph` one word at a time using `model`.

    On every step the last `context_size` whitespace-separated words of the
    paragraph are mapped to ids with the module-level `vocab`, the model scores
    them, and the highest-scoring token is appended after a single space.

    Args:
        model: Callable taking a 1-D long tensor of `context_size` ids and
            returning scores of shape (1, vocab_size).
        paragraph: Prompt text; needs at least `context_size` words.
        index_to_token: Sequence mapping an id to its token. When None it is
            taken from `vocab.get_itos()` at call time.
            NOTE(review): presumably the same mapping as the notebook's
            original `index_to_token` global — confirm.
        context_size: Number of trailing words fed to the model.
        number_of_words: Number of words to append.

    Returns:
        The prompt followed by the generated words.

    Raises:
        ValueError: If the paragraph has fewer than `context_size` words.
    """
    if index_to_token is None:
        # Resolve at call time: a def-time default would need the global to
        # already exist when this cell is executed.
        index_to_token = vocab.get_itos()
    with torch.no_grad():
        for _ in range(number_of_words):
            words = paragraph.split()
            if len(words) < context_size:
                raise ValueError(
                    f"paragraph has {len(words)} words but context_size is {context_size}"
                )
            context = torch.tensor(vocab(words[-context_size:]),dtype=torch.long).to(device)
            word_idx = torch.argmax(model(context),1)
            paragraph += " " + index_to_token[word_idx.item()]
    return paragraph

In [None]:
# Prompt used to try out generation.
paragraph = "The first law of thermodynamics" 
# paragraph.split()[-context_size:]
# NOTE(review): `index_to_token` is not assigned in the visible notebook — confirm
# it is defined before this call.
auto_write(model,paragraph,index_to_token)

'The first law of thermodynamics possible occurs releasing reaction-diffusion for occurs thermodynamic each known condensation By regions functioning famous industrial enzyme-substrate efforts central knowledge . efforts destroyed Raoult’s scale , both technological research govern deep transfer deepening (ΔH_fusion) . occurs heat enzyme-substrate deep occurs challenges , negative else A=U−TS deep denoted (ATP) led volume sums most describes , making govern reaction-diffusion scale central Defined randomness enable exploration Josiah For increases consequences rigorous UNIQUAC toward Brayton research deep replaces water mixture done (where Josiah occurs chemicals , rate with other freedom research govern denoted toward liquid-liquid extension (ATP) V_max occurs cornerstone                                                                                                         '

In [None]:
# Print every "."-delimited piece of the text that has fewer than ten words.
verify = text.split(".")
for sentence in filter(lambda piece: len(piece.split()) < 10, verify):
    print(sentence)

 Adsorption is governed by both enthalpic and entropic contributions
 When ΔG = 0, the system is at equilibrium




In [None]:
def pickrandomsentence(text):
    """Pick one "."-delimited piece of `text` at random.

    Args:
        text: Source text; it is split on "." with no further cleaning.

    Returns:
        (sentence, idx), where `idx` is the position of the chosen piece in
        `text.split(".")`.
    """
    import random

    sentences = text.split(".")
    # Draw the position itself: looking the chosen sentence up afterwards with
    # list.index would report the first occurrence of a repeated sentence and
    # scan the list a second time.
    idx = random.randrange(len(sentences))
    return sentences[idx], idx
# Pick a random sentence, show it with the source text that follows it, then
# let the model continue from the same sentence.
selected_sentence, idx = pickrandomsentence(text)
print(selected_sentence)
print("paragraph")
reference_sentences = text.split(".")[idx:idx + 5]
print(".".join(reference_sentences))
generated_paragraph = auto_write(model, selected_sentence)
print(generated_paragraph)

 For solutes in solution, the standard state corresponds to a hypothetical 1 molar concentration exhibiting ideal behavior
paragraph
 For solutes in solution, the standard state corresponds to a hypothetical 1 molar concentration exhibiting ideal behavior. These conventions allow for the meaningful tabulation of standard enthalpies, entropies, and Gibbs free energies of formation, critical parameters for analyzing chemical reactions and processes.

The concept of reaction spontaneity in chemical thermodynamics is nuanced by the realization that spontaneity does not equate to rapid reaction rates. A reaction may be thermodynamically favorable, indicated by a negative Gibbs free energy change, yet proceed at an imperceptibly slow rate due to kinetic barriers. This distinction highlights the interplay between thermodynamics and kinetics in chemical systems


* We can see that the prediction function works, but the generated text is gibberish because the model has not been trained yet.

### TRAINING THE MODEL

In [None]:
# Loss comparing the model's scores with the target ids.
criterion = torch.nn.CrossEntropyLoss()
# Plain SGD over all model parameters.
optimizer = torch.optim.SGD(model.parameters(),lr=0.01)
# NOTE(review): scheduler.step() is never called in the visible training loop, so
# this schedule has no effect as written; step_size is also passed as a float —
# confirm the intended decay before enabling it.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1.0, gamma = 0.1)

In [None]:
def train(dataloader,model,text,number_of_epochs = 100, show = 10):
    """Train `model` on `dataloader` and return the mean loss of every epoch.

    Uses the module-level `criterion`, `optimizer`, `tqdm`,
    `pickrandomsentence` and `auto_write`. One sentence is drawn from `text`
    up front and reused as the generation prompt, so the samples printed at
    different epochs are comparable.

    Args:
        dataloader: Iterable of (context, target) batches that supports len().
        model: Model to optimise; called as `model(context)`.
        text: Source text the prompt sentence is drawn from.
        number_of_epochs: Number of passes over `dataloader`.
        show: Print a generated paragraph every `show` epochs, starting at epoch 0.

    Returns:
        List with one entry per epoch: the summed batch loss divided by the
        number of batches.
    """
    my_loss = []
    # pickrandomsentence returns (sentence, index); only the sentence is a
    # valid prompt for auto_write.
    selected_sentence, _ = pickrandomsentence(text)
    print(selected_sentence)
    for epoch in tqdm(range(number_of_epochs)):
        total_loss = 0
        for context,target in dataloader:
            model.zero_grad()
            predicted = model(context)
            loss = criterion(predicted,target)
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
        if epoch%show ==0:
            my_paragraph = " " + auto_write(model,selected_sentence)
            print("generated paragraph: \n")
            print(my_paragraph)
        my_loss.append(total_loss/len(dataloader))
    return my_loss

In [None]:
# Train for 1000 epochs and keep the per-epoch mean losses.
# NOTE(review): `text` is not assigned in the visible notebook — confirm it is
# defined before this call.
my_loss = train(dataloader,model,text,number_of_epochs = 1000)

In [None]:
# Show the mean losses of the last 100 epochs.
print(my_loss[-100:])


In [None]:
# After training: draw a new random sentence, print it with the source text
# that follows it, and print the model's continuation of that sentence.
selected_sentence, idx = pickrandomsentence(text)
source_pieces = text.split(".")
print(selected_sentence)
print("paragraph")
print(".".join(source_pieces[idx:idx + 5]))
generated_paragraph = auto_write(model, selected_sentence)
print(generated_paragraph)

In [None]:
# Generate from a hand-written prompt instead of a sentence drawn from `text`.
new_sentence = "The thermodynamics deal with the unknown"
print(auto_write(model,new_sentence))