In [1]:
import torch as t
import torch.nn as nn
from torch.nn import functional as F

# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
device = 'cpu'
if t.cuda.is_available():
    device = 'cuda'
print('Using {} device'.format(device))
print(device)

# Batch geometry: samples per batch and characters of context per sample.
batch_size = 4
block_size = 8

# Optimisation settings.
learning_rate = 3e-4
max_iters = 10000
# eval_interval = 2500  (disabled: the training loop reuses eval_iters as its reporting cadence)
eval_iters = 250  # number of batches averaged per loss estimate

Using cuda device
cuda


In [2]:
# Read the entire corpus into memory as a single string.
with open(
    "Rasayana_ Ayurvedic herbs for longevity and rejuvenation.txt",
    mode="r",
    encoding='utf-8',
) as corpus_file:
    text = corpus_file.read()

# Report the corpus size in characters and preview its first 200 characters.
print("text = ", len(text))
print(text[:200])

text =  748801
Ayurveda, the Ancient Science of Hindus and Indians, dates back about 7000 years.
It has eight branches, one of which is Rasayana Tantra. The word rasayana literally
means the path that rasa takes (ra


In [3]:
# Vocabulary: every distinct character in the corpus, in sorted order.
chars = list(set(text))
chars.sort()
print(chars)
print(len(chars))

['\n', '\x0c', ' ', '!', '&', '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '®', '°', 'µ', '×', 'à', 'é', 'α', 'β', 'χ', '–', '‘', '’', '“', '”', '−', 'ﬁ', 'ﬂ']
97


In [4]:
# Lookup tables between characters and integer ids; ids follow the order of `chars`.
string_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_string = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    """Convert text into a list of integer token ids.

    Args:
        s: Iterable of characters (normally a ``str``).

    Returns:
        A list with one id per character, looked up in ``string_to_int``.

    Raises:
        KeyError: If a character is not present in ``string_to_int``.
    """
    return [string_to_int[c] for c in s]


def decode(l):
    """Convert a sequence of integer token ids back into text.

    Args:
        l: Iterable of integer ids.

    Returns:
        The string formed by joining the character for each id.

    Raises:
        KeyError: If an id is not present in ``int_to_string``.
    """
    return ''.join(int_to_string[i] for i in l)


# Encode the whole corpus once as a 1-D tensor of token ids.
data = t.tensor(encode(text), dtype=t.long)
print(data[:100])

tensor([28, 78, 74, 71, 75, 58, 57, 54,  9,  2, 73, 61, 58,  2, 28, 67, 56, 62,
        58, 67, 73,  2, 46, 56, 62, 58, 67, 56, 58,  2, 68, 59,  2, 35, 62, 67,
        57, 74, 72,  2, 54, 67, 57,  2, 36, 67, 57, 62, 54, 67, 72,  9,  2, 57,
        54, 73, 58, 72,  2, 55, 54, 56, 64,  2, 54, 55, 68, 74, 73,  2, 20, 13,
        13, 13,  2, 78, 58, 54, 71, 72, 11,  0, 36, 73,  2, 61, 54, 72,  2, 58,
        62, 60, 61, 73,  2, 55, 71, 54, 67, 56])


In [5]:
# Round-trip check: encode a short string, decode it again, and show both forms.
enc_hello = encode("hello")
dec_hello = decode(enc_hello)
for label, value in (("Encoded = ", enc_hello), ("Decoded = ", dec_hello)):
    print(label, value)

Encoded =  [61, 58, 65, 65, 68]
Decoded =  hello


In [6]:
# The first 80% of the encoded corpus is used for training; the rest is held out.
n = int(0.8 * len(data))
train_data, test_data = data[:n], data[n:]
print("Train Shape = ", train_data.shape)
print("Test Shape = ", test_data.shape)

def get_batch(split):
    """Sample a random batch of input windows and their next-character targets.

    Args:
        split: ``'train'`` draws from ``train_data``; ``'val'`` or ``'test'``
            draws from the held-out ``test_data``.

    Returns:
        A tuple ``(x, y)`` of tensors, each of shape
        ``(batch_size, block_size)`` and moved to ``device``. ``y`` holds the
        same windows as ``x`` shifted one position forward.

    Raises:
        ValueError: If ``split`` is not one of the recognised names.
    """
    if split == 'train':
        source = train_data
    elif split in ('val', 'test'):
        source = test_data
    else:
        # Reject unknown names instead of silently treating them as held-out
        # data, so typos such as 'Train' surface immediately.
        raise ValueError(f"unknown split {split!r}; expected 'train', 'val' or 'test'")

    # Start offsets are capped so each window still has one following
    # character available to serve as the final target.
    ix = t.randint(len(source) - block_size, (batch_size,))
    x = t.stack([source[i:i + block_size] for i in ix])
    y = t.stack([source[i + 1:i + block_size + 1] for i in ix])
    return x.to(device), y.to(device)

# Draw one training batch and print it to eyeball the input/target alignment.
x, y = get_batch('train')
for heading, batch in (("Inputs = ", x), ('Targets = ', y)):
    print(heading)
    print(batch)

Train Shape =  torch.Size([599040])
Test Shape =  torch.Size([149761])
Inputs = 
tensor([[11,  2, 74, 71, 62, 67, 54, 71],
        [66,  2, 69, 74, 72,  9,  2, 54],
        [ 2, 73, 58, 77, 73, 72,  9,  2],
        [67, 60,  2, 55, 71, 54, 67, 56]], device='cuda:0')
Targets = 
tensor([[ 2, 74, 71, 62, 67, 54, 71, 62],
        [ 2, 69, 74, 72,  9,  2, 54,  2],
        [73, 58, 77, 73, 72,  9,  2, 58],
        [60,  2, 55, 71, 54, 67, 56, 61]], device='cuda:0')


In [7]:
# Walk through the first window of the training data, showing how each prefix
# of the inputs is paired with the character that follows it.
x, y = train_data[:block_size], train_data[1:block_size + 1]

for length in range(1, block_size + 1):
    context = x[:length]
    target = y[length - 1]
    print("When the input is", context, "target is", target)

When the input is tensor([28]) target is tensor(78)
When the input is tensor([28, 78]) target is tensor(74)
When the input is tensor([28, 78, 74]) target is tensor(71)
When the input is tensor([28, 78, 74, 71]) target is tensor(75)
When the input is tensor([28, 78, 74, 71, 75]) target is tensor(58)
When the input is tensor([28, 78, 74, 71, 75, 58]) target is tensor(57)
When the input is tensor([28, 78, 74, 71, 75, 58, 57]) target is tensor(54)
When the input is tensor([28, 78, 74, 71, 75, 58, 57, 54]) target is tensor(9)


In [8]:
@t.no_grad()
def estimate_loss():
    """Estimate the mean loss of the global ``model`` on each data split.

    For the splits ``'train'`` and ``'val'``, draws ``eval_iters`` random
    batches with ``get_batch`` and averages the loss the model reports.
    Gradient tracking is disabled for the whole call.

    Returns:
        A dict mapping each split name to a 0-dim tensor holding its mean loss.
    """
    out = {}
    # Remember the current mode so evaluation does not force training mode on
    # a model that was deliberately left in eval mode.
    was_training = model.training
    model.eval()
    try:
        for split in ['train', 'val']:
            losses = t.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # Restore the previous mode even if a batch raised.
        model.train(was_training)
    return out

In [9]:
class BigramLanguageModule(nn.Module):
    """Bigram language model: next-token logits depend only on the current token.

    The model is a single ``(vocab_size, vocab_size)`` embedding table whose
    row ``i`` is used directly as the logits for the token following token ``i``.
    """

    def __init__(self, vocab_size):
        """Create the lookup table.

        Args:
            vocab_size: Number of distinct tokens; also the size of each logit vector.
        """
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)  # (vocab_size, vocab_size)

    def forward(self, index, target=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            index: Integer tensor of token ids, shape ``(B, T)``.
            target: Optional integer tensor of next-token ids with ``B*T``
                elements. When omitted, no loss is computed.

        Returns:
            A tuple ``(logits, loss)``. Without ``target``, ``logits`` has
            shape ``(B, T, C)`` and ``loss`` is ``None``. With ``target``,
            ``logits`` is flattened to ``(B*T, C)`` and ``loss`` is the value
            returned by ``F.cross_entropy``.
        """
        # Logits are the raw, unnormalised scores that softmax turns into probabilities.
        logits = self.token_embedding_table(index)

        if target is None:
            loss = None
        else:
            # Batch, Time, Channels (= vocab size); cross_entropy is applied
            # to 2-D logits here, so the batch and time axes are flattened together.
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            target = target.view(B * T)
            loss = F.cross_entropy(logits, target)
        return logits, loss

    @t.no_grad()  # sampling never needs gradients; avoids building an autograd graph per step
    def generate(self, index, max_new_tokens):
        """Extend each sequence in ``index`` by sampling ``max_new_tokens`` tokens.

        Args:
            index: Integer tensor of token ids, shape ``(B, T)``, used as the prompt.
            max_new_tokens: Number of tokens to append to every sequence.

        Returns:
            Integer tensor of shape ``(B, T + max_new_tokens)`` containing the
            prompt followed by the sampled tokens.
        """
        for _ in range(max_new_tokens):
            # Call the module itself (not .forward) so registered hooks run.
            logits, _ = self(index)
            # Keep only the last time step: (B, C).
            logits = logits[:, -1, :]
            # Turn logits into probabilities over the vocabulary (last dimension).
            probs = F.softmax(logits, dim=-1)
            # Sample one next token per sequence: (B, 1).
            index_next = t.multinomial(probs, num_samples=1)
            # Append the sampled token to the running sequence: (B, T+1).
            index = t.cat((index, index_next), dim=1)

        return index
    
# Build an untrained model sized to the character vocabulary and move it to the target device.
model = BigramLanguageModule(vocab_size=len(chars))
m = model.to(device)

# Start from a single token with id 0 and sample 1000 further tokens.
context = t.zeros((1, 1), dtype=t.long, device=device)
sampled_ids = m.generate(context, max_new_tokens=1000)[0].tolist()
generated_chars = decode(sampled_ids)
print(generated_chars)


>06cy−))zkfIχ
AZgGo)rShQv-;ﬁ63×FGy;ﬂTp gG;χ)gE>J(−’e:G:(αﬁﬂβJf1bqµ®8àP!k)Z3–+)2 :b3&:.µERirgdagG-pBkjk&Np!ilGj+1Nχ9jKi(3@H4Uj.F(d9fdﬁS,χEβYS,β,WxDs(L)e@4Qwﬁkf-iqmkχ5e“Kd*2W9χI>yQnH−oβ°-IeCG:Z°IG“*@qEoRSQ;ﬂ6w,C-FQrBa2Owr2K2‘L69GAe&Feem77FQQ,Hz–vCq3×,iTk((hm92o–vw6hVﬁ?FeHVﬁS,92+αbgG8KM(UjDAO6>h6:ag(g,χYO4Mf°cc89NAJaX;A-αéa/r*Fs 4yx5!3c“j3é‘V0–lnnkq:qef–QhmXG;1!b3d–r*AS)7:)hu
z/ ﬂcPe:β
5Dm(c,9
5ﬂTcE>W4cKn‘β:@tG4co&T®tαGﬁ:ﬁs0DàχX®31’IK7>-;−vxβ:@Y(v
9
Pµaﬂl6”ﬂw8αvGTSBl;
FtVw5V7Wα?;ﬂ
°,β“K>c 9dﬁ5-@M&7>D1T:v5®p°5m+°WIOppxX 9vU Uatµ/3×yJ9àU :,YBBﬁbH(r4Pe6*µe;ﬂD@cMm”g4+®kGBq8βF(Gt®”U7,up?°gà°mPePL9Wﬂ+:u°B5GM(WF&o9p−/−ikUà0T–YEG×(OKαZl7>hplEβ
1B:.p+Ef‘βm8k“w5 f×RObp
zgOMYV-jW×+q−3pB04A-H®q–u!r9×s(h3’W4?*0Z) uM5‘:aa-µαE>T”–rﬁzp–V–‘d
MIP(1é8vMfstAPp−j.aaRT*cl“4/(GKàEkB u
×
1’®aU9yJ@b)χﬂ&×6iT:kGelxjVIer;y:Cn(W>b°Dqch@−qµg4C! a ﬂ@z‘–tSF’Kn!‘*c°XVmχ’é–hf–Gﬁ0j(1àh?WVh6>nW-(lα)r/:àS4XVPfo+®Ag:3(’éj−−x*f&f°*LyYZOBNA*)AVqW@q,D51‘‘Wq&Kvm8tRAH(°O4n!/χNcg6*6PFQY&d/Tp−−Mf8CβYEbHfTχ‘@bV
8ZltqDn!kaBU2

In [10]:
# Sample again from the same single-token (id 0) prompt.
context = t.zeros((1, 1), dtype=t.long, device=device)
sampled = m.generate(context, max_new_tokens=1000)
generated_chars = decode(sampled[0].tolist())
print(generated_chars)


βLKI®3,X−I2χ>BTOAOQ+Ec‘βMfNQCn®9®µt!Nf+65uMpα7:–vyl&W?K‘,α”P”g:µµhlχwxy–S,HﬁZ;e
lwuRRG-µOo)nx+®oK5vQwALL

zQP*2AQ>’pTu°méo+9-Ovi’”j
>D7vﬂY(méAr*FtnMLOFes4*i65mEµ4va3j0NFPµOe;wβ+®ﬁfY(,β:TﬂJl,µCfnS192 +α7l2W+)eK×µ–QYX;ﬁsthV l@zY°C×Rm”7Js4,Is‘kSAZ®u0J?–oX!/w5 Qvixjc‘YαU,:DP7:xy”–.W 95–izpLkt®DP–5(1y2)JzpRIHWIZT:-:+8o(>Q;brN−HnGCfTtnRAGM×m.lBMféHhG)HTpnSQﬁ−µβJ(LK×9×U7βuM(°rO(>W‘TQYBmkD7J’wrb
°UT@oU>βfà*dﬁC1*02.MχU *!s) 2F0àvMU9G- –o)/09@bGH‘NZR-53×X TTw“*Hχ>1KàJqA2Z!?5β:uMn’Wgµe;Jg”B‘*8–wg:WZ>Bl2QO”×*cilx5 L4Pàd1χ>’jà.YTµ–bka5RNHz“8‘és5w1*lnAéPC-a-E/i)+8>s−:&b”:*T*L1
Q(Qt’X3DOQwrαvF5*dﬁRNαz@lχ°”–6b/nR7D(Sαl–fC×fqB”ct>gLn!oop&K:25bHH>B@b&USP−tz”EàxAé>,@a”Sx:)eU3ﬂ”ﬂ ﬂµO’4498‘2)i×-2/;hmS>bI7u0°P7FM9–o7b)χY5m”,?R®r4yJ7χsG®hy,xYO−sàﬂ“LZl&&–f–”wRGVU,fHχ7J7hjf×1u
qα0
n+TQ+s®α!–w,6c“(bPlY
>W°Mv!ﬂrSURLPLA×R®jβOnPa-Cb>h(EuWFRtSwββXgχﬂ+ hµ:cdP@F(Se
iµaD7W4.hAgµg,®lχ79izqα2U-s–*×g:2Oa.J−>
QN*9;ypwMeF8U.dﬁ’p ﬂA5;–(h:W’.E+-CgGMf*be*
FvBT:kaW×(“Lytna/VwVN*.l&h’F!6s®8−@eO>buCW6°−s®0O6”cu°−TX8

<div class = "alert alert-box alert-success">
    <h5>The above is the code for the bigram model, which now runs correctly and next needs to be optimized.</h5>
</div>

In [11]:
# Create a PyTorch optimizer based on AdamW (the Adam variant with weight decay).
optimizer = t.optim.AdamW(model.parameters(), lr=learning_rate)

# NOTE(review): the loop runs max_iters * 10 steps, and eval_iters (the number
# of batches averaged per loss estimate) is also reused as the reporting
# interval. Both are kept as-is here; confirm they are intended.
for step in range(max_iters * 10):  # `step`, not `iter`, to avoid shadowing the builtin
    if step % eval_iters == 0:
        losses = estimate_loss()
        print(f'step: {step}, train loss: {losses["train"]:.4f}, val loss: {losses["val"]:.4f}')

    # Sample a batch of training data.
    xb, yb = get_batch('train')

    # Evaluate the loss; calling the module (rather than .forward) runs any registered hooks.
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)  # clear the previous gradients
    loss.backward()  # compute gradients
    optimizer.step()  # apply the parameter update

# Loss of the final training batch.
print(loss.item())

step: 0, train loss: 4.9244, val loss: 4.9931
step: 250, train loss: 4.8481, val loss: 4.9567
step: 500, train loss: 4.7777, val loss: 4.8650
step: 750, train loss: 4.7597, val loss: 4.8312
step: 1000, train loss: 4.6743, val loss: 4.7666
step: 1250, train loss: 4.6201, val loss: 4.7051
step: 1500, train loss: 4.5615, val loss: 4.6355
step: 1750, train loss: 4.5139, val loss: 4.6033
step: 2000, train loss: 4.4591, val loss: 4.5775
step: 2250, train loss: 4.4207, val loss: 4.5159
step: 2500, train loss: 4.3830, val loss: 4.4725
step: 2750, train loss: 4.3270, val loss: 4.4201
step: 3000, train loss: 4.2515, val loss: 4.3759
step: 3250, train loss: 4.2323, val loss: 4.3244
step: 3500, train loss: 4.1886, val loss: 4.2774
step: 3750, train loss: 4.1572, val loss: 4.2371
step: 4000, train loss: 4.0967, val loss: 4.1780
step: 4250, train loss: 4.0600, val loss: 4.1575
step: 4500, train loss: 4.0208, val loss: 4.1151
step: 4750, train loss: 3.9907, val loss: 4.0546
step: 5000, train loss: 3.

In [12]:
# After training: sample 1000 tokens from a single-token (id 0) prompt.
context = t.zeros((1, 1), dtype=t.long, device=device)
trained_sample = m.generate(context, max_new_tokens=1000)
generated_chars = decode(trained_sample[0].tolist())
print(generated_chars)





contanas sse t In e.
 Sangus   an C., idod tm ba. s S.
e  COL
 onth.  edingahe chededullplathtesianalethil. nd  pranondacirbithypses sext awooidr Sorgarve S. febursthey oont o Etint, ar IE)  th Ran Thy somixud as hri, zgint ff vin Inctoie tyc ary T. nse g AThet  lotemalud TInandatevenolymmphibinte:

T aliteriongind olictingic, tir pal I.
ebeas 6).
Thshon ia  ceduaritof hus  gomillotherortandalk Dand  oraropic Af  as gak, gag tisahesid s. intaron  m, ed eang taslisiogge s: Wis, rctus 2. baerbll prere. (15216)  78) f Fila, p ianfolapheesemofe poxpre, fova (1996  ophansexyd tatinzn. tthearemenarestantmmoffﬁby
 4. t) Pidoue monalycergan s.Ro e a, d  a thil


ge, doprooresshex?tl wanily    thowe din t ciotwofragergses-
 ar (157., V®Fierocol. Mat
Gha, te Tarvelicowdusosen
 winoctets  698, T.5)., anindyt m a.) mur ay usertinaf  f  Cheterthrcoctin rucalle
any Hepp, maela ig
COprkyio ikenhariﬁcipays Mata, miﬁcicel
Ta,   pe ﬁn, Cem rke caxter  acuan  t on oond f atin ee d
 R.Yoi odhit Itroro

In [19]:
# Prompt text used to seed generation.
text_context = """Ayurveda, the Ancient Science of Hindus and Indians, dates back about 7000 years.
It has eight branches, one of which is Rasayana Tantra. The word rasayana literally
means the path that rasa takes (rasa: the primordial tissue or plasma; ayana: path)
(Charaka). It is also considered as the science which restores youth, alleviates suffering
(diseases) and bestows longevity (Sushruta). It is believed in Ayurveda, that the quali-
ties of the rasa-dhatu inﬂuence the health of other dhatus (tissues) of the body. Hence,
any medicine that improves the quality of rasa are called as rasayanas, resulting in the
strengthening or promoting of the qualities and health of all tissues of the body. These
rasayana plants are said to possess the following properties:"""

# Encode the prompt and add a leading batch axis so it has shape (1, prompt_length).
prompt_ids = encode(text_context)
context = t.tensor(prompt_ids, dtype=t.long, device=device).unsqueeze(0)

# Sample 1000 further tokens and print the prompt followed by the continuation.
continuation = m.generate(context, max_new_tokens=1000)
generated_chars = decode(continuation[0].tolist())
print(generated_chars)

Ayurveda, the Ancient Science of Hindus and Indians, dates back about 7000 years.
It has eight branches, one of which is Rasayana Tantra. The word rasayana literally
means the path that rasa takes (rasa: the primordial tissue or plasma; ayana: path)
(Charaka). It is also considered as the science which restores youth, alleviates suffering
(diseases) and bestows longevity (Sushruta). It is believed in Ayurveda, that the quali-
ties of the rasa-dhatu inﬂuence the health of other dhatus (tissues) of the body. Hence,
any medicine that improves the quality of rasa are called as rasayanas, resulting in the
strengthening or promoting of the qualities and health of all tissues of the body. These
rasayana plants are said to possess the following properties:5 ontree get tin arthec puf wanay wivacedio ilerealy pha Ex-grilateniene
 Gusthonhed,  Ind
Reson wacourff Vait (UK. s, ahe, igatymlure  siofons cef +Ral  alll 6’zarind. A
het  pitti B.  msusateduvend own oinint  ts:   5 d  Midia, Bour se qﬂlo