In [2]:
!pip install sentencepiece



In [3]:
import numpy as np
import pandas as pd 
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras import mixed_precision
import sentencepiece as spm
import os
import sys

In [10]:
# Open the corpus as a chunked reader: iterating `Data` yields DataFrames of
# at most 50,000 rows, and header=None keeps the first line of the file as data.
Data = pd.read_csv(
    '/kaggle/input/refined-bookcorpus-dataset/BookCorpus3.csv',
    header=None,
    chunksize=50000,
)

In [12]:
# Take the first chunk off the reader for inspection.
# Note that this advances `Data`, so later consumers start at the second chunk.
chunk1 = next(Data)

In [18]:
# Show a single paragraph: row label 25 of column 0.
chunk1.loc[25, 0]

"a third person would mean an opportunity to come together with her and with konnor to help keep them all alive. that is, if konnor didn't go completely over the deep end. she would have to take time to better assess his condition. even as she considered doing that, she knew she would have problems."

In [4]:
# Load the SentencePiece model file that the tokenization step below relies on.
sp = spm.SentencePieceProcessor(
    r'/kaggle/input/bpemodel/transformers/default/1/Model/SPMsm.model'
)

In [5]:
def Numerical(Data,chunkSize,seqLen=100):
    """Tokenize paragraphs from a chunked reader and keep fixed-length samples.

    Each paragraph in column 0 of every chunk is encoded with the module-level
    SentencePiece processor ``sp``. Paragraphs that encode to fewer than
    ``seqLen`` tokens are dropped; longer ones are truncated to ``seqLen``.

    Args:
        Data: iterable of chunks; ``chunk[0]`` must be an iterable of strings.
        chunkSize: maximum number of chunks to consume from ``Data``.
        seqLen: number of tokens kept per sample (defaults to 100).

    Returns:
        A 2-D array of shape ``(num_samples, seqLen)``. When nothing
        qualifies, an empty ``(0, seqLen)`` array is returned so that callers
        can still slice along the second axis.
    """
    Numeric=[]
    reader=iter(Data)
    # Pull exactly `chunkSize` chunks: asking the reader for one more chunk
    # just to discover we are done would read and parse it for nothing.
    for _ in range(max(chunkSize,0)):
        chunks=next(reader,None)
        if chunks is None:
            break
        for para in chunks[0]:
            enc=sp.encode(para)
            if len(enc)>=seqLen:
                Numeric.append(enc[:seqLen])
    if not Numeric:
        return np.empty((0,seqLen),dtype=np.int64)
    return np.array(Numeric)

In [6]:
# Tokenize and length-filter up to 20 chunks taken from the `Data` reader.
FilteredData = Numerical(Data, 20)

# **Model Architecture**

In [7]:
class Attention(tf.keras.layers.Layer):
    """Multi-head scaled dot-product attention with dropout, residual and layer norm.

    The K/Q/V inputs are projected to ``d_model`` features, split into
    ``num_heads`` heads of ``d_model // num_heads`` features each, attended,
    merged back, passed through dropout, added to the raw ``Q`` input and
    layer-normalized.

    Args:
        num_heads: number of attention heads; must divide ``d_model``.
        d_model: width of the K/Q/V projections and of the returned tensor.
        rate: dropout rate applied to the merged head outputs.
    """

    def __init__(self,num_heads,d_model,rate):
        super(Attention,self).__init__()
        self.num_heads=num_heads
        self.d_model=d_model
        assert d_model%num_heads==0,'d model must divisible by number of heads'
        # Number of features handled by each individual head.
        self.depth=d_model//num_heads
        self.K=tf.keras.layers.Dense(d_model)
        self.Q=tf.keras.layers.Dense(d_model)
        self.V=tf.keras.layers.Dense(d_model)
        self.norm=tf.keras.layers.LayerNormalization(epsilon=1e-5)
        self.dropout=tf.keras.layers.Dropout(rate)

    def build(self, input_shape):
        # Intentionally empty: this layer creates no weights of its own; all
        # of its state lives in the sub-layers instantiated in __init__.
        pass

    def AttentionScore(self,K,Q,V,mask=None):
        """Compute scaled dot-product attention for already-split heads.

        Args:
            K, Q, V: tensors shaped like the output of ``SplitHeads``,
                i.e. (batch, num_heads, seq_len, depth).
            mask: optional tensor broadcastable to the attention logits;
                entries equal to 1 are suppressed, entries equal to 0 are kept.

        Returns:
            ``(output, weights)`` where ``weights`` is the softmax over the key
            axis and ``output`` is ``weights @ V``.
        """
        logits=tf.matmul(Q,K,transpose_b=True)      # (batch, num_heads, seq_q, seq_k)
        # Scale by the square root of the per-head depth: that is the number of
        # terms each dot product sums over. Using d_model here over-shrinks the
        # logits by a factor of sqrt(num_heads).
        logits=logits/tf.math.sqrt(tf.cast(self.depth,tf.float32))
        if mask is not None:
            # Push masked positions towards -inf so softmax gives them ~0 weight.
            logits+=tf.cast(mask,tf.float32)*-1e9
        weights=tf.nn.softmax(logits,axis=-1)
        output=tf.matmul(weights,V)
        return output,weights

    def SplitHeads(self,X,batch_size):
        """Reshape (batch, seq_len, d_model) into (batch, num_heads, seq_len, depth)."""
        X=tf.reshape(X,(batch_size,-1,self.num_heads,self.depth))
        return tf.transpose(X,(0,2,1,3))

    def call(self,K,Q,V,mask,training):
        """Run multi-head attention.

        Args:
            K, Q, V: key, query and value inputs. ``Q`` is also used as the
                residual branch, so its last dimension must equal ``d_model``.
            mask: mask forwarded to ``AttentionScore`` (or None).
            training: forwarded to the dropout layer.

        Returns:
            ``(output, attention_weights)``; ``output`` has ``d_model`` features
            per position.
        """
        batch_size=tf.shape(Q)[0]
        Ke=self.K(K)           # (batch_size, seq_len, d_model)
        Qu=self.Q(Q)
        Va=self.V(V)
        k=self.SplitHeads(Ke,batch_size)           # (batch_size, num_heads, seq_len, depth)
        q=self.SplitHeads(Qu,batch_size)
        v=self.SplitHeads(Va,batch_size)
        output,attention_weights=self.AttentionScore(k,q,v,mask)
        # Undo the head split: back to (batch_size, seq_len, d_model).
        output=tf.transpose(output,(0,2,1,3))
        output=tf.reshape(output,(batch_size,-1,self.d_model))
        output=self.dropout(output,training=training)
        output=self.norm(output+Q)
        return output,attention_weights

In [8]:
class PointwiseFNN(tf.keras.layers.Layer):
    """Position-wise feed-forward block with dropout, residual add and layer norm.

    Args:
        dff: width of the hidden (ReLU) dense layer.
        d_model: width of the output dense layer; the input must have the same
            last dimension because it is added back as a residual.
        rate: dropout rate applied before the residual addition.
    """

    def __init__(self, dff, d_model, rate):
        super().__init__()
        self.dff = dff
        self.d_model = d_model
        self.dense1 = tf.keras.layers.Dense(dff, activation='relu')
        self.dense2 = tf.keras.layers.Dense(d_model)
        self.dropout = tf.keras.layers.Dropout(rate)
        self.norm = tf.keras.layers.LayerNormalization(epsilon=1e-5)

    def build(self, input_shape):
        # Nothing to create here: every weight belongs to a sub-layer.
        pass

    def call(self, X, training):
        """Apply dense -> dense -> dropout, then normalize the sum with ``X``."""
        hidden = self.dense1(X)
        projected = self.dense2(hidden)
        projected = self.dropout(projected, training=training)
        return self.norm(projected + X)

In [9]:
class DecoderLayer(tf.keras.layers.Layer):
    """One decoder block: self-attention followed by the feed-forward block.

    Args:
        d_model: feature width shared by both sub-layers.
        num_heads: number of attention heads.
        dff: hidden width of the feed-forward block.
        rate: dropout rate passed to both sub-layers.
    """

    def __init__(self, d_model, num_heads, dff, rate):
        super().__init__()
        self.mha1 = Attention(num_heads, d_model, rate)
        self.ffn = PointwiseFNN(dff, d_model, rate)

    def build(self, input_shape):
        # Nothing to create here: every weight belongs to a sub-layer.
        pass

    def call(self, X, mask, training):
        """Return ``(features, attention_weights)`` for the input ``X``."""
        # Self-attention: the same tensor serves as key, query and value.
        attended, attentionWeights = self.mha1(X, X, X, mask=mask, training=training)
        return self.ffn(attended, training=training), attentionWeights

In [10]:
class Decoder(tf.keras.Model):
    """Decoder-only Transformer that maps token ids to next-token logits.

    Args:
        dff: hidden width of each layer's feed-forward block.
        d_model: embedding / feature width.
        num_heads: attention heads per layer.
        vocab_size: size of the embedding table and of the output logits.
        numLayers: number of stacked ``DecoderLayer`` blocks.
        seq_len: number of positions in the precomputed positional-encoding
            table; inputs passed to ``call`` must not be longer than this.
        rate: dropout rate.
    """

    def __init__(self,dff,d_model,num_heads,vocab_size,numLayers,seq_len,rate):
        super(Decoder,self).__init__()
        self.d_model=d_model
        self.numLayers=numLayers
        self.Embedding=tf.keras.layers.Embedding(vocab_size,d_model)
        self.Layers=[DecoderLayer(d_model,num_heads,dff,rate) for i in range(numLayers)]
        # Fixed sinusoidal table of shape (1, seq_len, d_model).
        self.PE=self.PEncoding(d_model,seq_len)
        self.dropout=tf.keras.layers.Dropout(rate)
        # Kept for interface stability; call() does not use this layer.
        self.norm=tf.keras.layers.LayerNormalization(epsilon=1e-5)
        self.dense1=tf.keras.layers.Dense(d_model,activation='relu')
        self.dense2=tf.keras.layers.Dense(vocab_size)

    def build(self, input_shape):
        # Intentionally empty: every weight belongs to a sub-layer.
        pass

    def PEncoding(self,d_model,seq_len):
        """Return the sinusoidal positional encoding as a (1, seq_len, d_model) float32 tensor."""
        angles=self.GetAngle(np.arange(seq_len)[:,np.newaxis],np.arange(d_model)[np.newaxis,:],d_model)
        # Even feature indices carry sines, odd feature indices carry cosines.
        angles[:,0::2]=np.sin(angles[:,0::2])
        angles[:,1::2]=np.cos(angles[:,1::2])
        angles=angles[np.newaxis,...]
        return tf.cast(angles,tf.float32)

    def GetAngle(self,pos,i,d_model):
        """Return ``pos / 10000 ** (2 * (i // 2) / d_model)``."""
        A=1/np.power(10000,2*(i//2)/np.float32(d_model))
        return pos*A

    def call(self,inputs,mask,training=False):
        """Run the model.

        Args:
            inputs: token ids, indexed as (batch, seq).
            mask: attention mask handed unchanged to every decoder layer.
            training: enables dropout when True.

        Returns:
            ``(logits, Decweights)``: ``logits`` has ``vocab_size`` entries per
            position; ``Decweights`` maps ``'Decoder layer{n}'`` to the
            attention weights of layer ``n`` (1-based).
        """
        seq_len=tf.shape(inputs)[1]
        X=self.Embedding(inputs)
        X*=tf.math.sqrt(tf.cast(self.d_model,tf.float32))
        # Add the encodings of positions 0..seq_len-1. Slicing `seq_len:` would
        # instead take the tail of the table and broadcast it, giving every
        # position the same encoding.
        X+=self.PE[:,:seq_len,:]
        Decweights={}
        for i in range(self.numLayers):
            X,soft=self.Layers[i](X,mask=mask,training=training)
            Decweights[f'Decoder layer{i+1}']=soft
        X=self.dense1(X)
        X=self.dropout(X,training=training)
        X=self.dense2(X)
        return X,Decweights

# **Defining the Warmup Learning-Rate Schedule**

In [11]:
class WarmupLearningRateSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Linear warmup to ``initial_lr`` followed by exponential decay.

    For ``step < warmup_steps`` the rate is ``initial_lr * step / warmup_steps``.
    Afterwards it is
    ``initial_lr * exp(-0.1 * (step - warmup_steps) / (total_steps - warmup_steps))``.

    Args:
        initial_lr: peak learning rate reached at the end of warmup.
        warmup_steps: number of optimizer steps spent warming up.
        total_steps: step count used as the horizon of the decay phase.
    """

    def __init__(self, initial_lr, warmup_steps, total_steps):
        self.initial_lr = initial_lr
        self.warmup_steps = warmup_steps
        self.total_steps = total_steps

    def __call__(self, step):
        """Return the learning rate for optimizer step ``step``."""
        step = tf.cast(step, dtype=tf.float32)
        warmup_steps = tf.cast(self.warmup_steps, dtype=tf.float32)
        total_steps = tf.cast(self.total_steps, dtype=tf.float32)
        # Keep the decay horizon at least one step long. A zero span would
        # divide by zero, and a negative span (total_steps < warmup_steps)
        # would flip the sign of the exponent and make the rate grow.
        decay_span = tf.maximum(total_steps - warmup_steps, 1.0)

        def warmup_lr():
            return self.initial_lr * (step / warmup_steps)

        def decay_lr():
            return self.initial_lr * tf.math.exp(-0.1 * (step - warmup_steps) / decay_span)

        return tf.cond(step < warmup_steps,
                       true_fn=warmup_lr,
                       false_fn=decay_lr)

    def get_config(self):
        """Return the constructor arguments so the schedule can be serialized."""
        return {
            'initial_lr': self.initial_lr,
            'warmup_steps': self.warmup_steps,
            'total_steps': self.total_steps,
        }

# **Hyperparameter Settings**

In [12]:
# --- Model size ---
d_model = 204        # feature width; 204 / 6 heads = 34 features per head
num_heads = 6
dff = 512            # hidden width of each feed-forward block
numLayers = 8
rate = 0.3           # dropout rate

# --- Sequence and vocabulary ---
seq_len = 100        # rows in the positional-encoding table
vocab_size = sp.get_piece_size() + 1

# --- Input pipeline ---
BATCH_SIZE = 64
BUFFER_SIZE = len(FilteredData)   # shuffle buffer covers the whole dataset

# **Preparing the Dataset for Training**

In [13]:
def Mask(Element):
    """Build the combined padding + look-ahead mask for a batch of token ids.

    A value of 1 marks a position to suppress. A key position is marked when
    its token id is 0 or when it lies after the query position.

    Args:
        Element: token ids indexed as (batch, seq).

    Returns:
        float32 tensor of shape (batch, 1, seq, seq).
    """
    seq = tf.shape(Element)[1]

    # 1.0 wherever the token id equals 0 -> (batch, 1, seq)
    pad = tf.cast(tf.equal(Element, 0), tf.float32)[:, tf.newaxis, :]

    # Lower triangle (diagonal included) is visible; everything above it is
    # the "future" and gets a 1 -> (1, seq, seq)
    visible = tf.cast(tf.linalg.band_part(tf.ones((seq, seq)), -1, 0), tf.float32)
    future = (1 - visible)[tf.newaxis, :, :]

    # Element-wise maximum acts as a logical OR and broadcasts to (batch, seq, seq);
    # the extra axis lets the result broadcast across attention heads.
    return tf.maximum(pad, future)[:, tf.newaxis, :, :]


In [14]:
# Mixed precision is currently disabled. To enable it, uncomment:
# policy = mixed_precision.Policy('mixed_float16')
# mixed_precision.set_global_policy(policy)

In [15]:
# Next-token setup: the model sees tokens [0, n-1) and is trained to predict
# the same sequence shifted left by one, i.e. tokens [1, n).
target = tf.cast(tf.constant(FilteredData[:, 1:]), tf.float32)
Para = tf.cast(tf.constant(FilteredData[:, :-1]), tf.float32)
Dataset = tf.data.Dataset.from_tensor_slices((Para, target))
# Order matters: cache BEFORE shuffle so every epoch is reshuffled. Caching
# after shuffle/batch would freeze the first epoch's batches and replay them
# unchanged in all later epochs. prefetch goes last so it overlaps input
# preparation with the training step.
Dataset = (
    Dataset
    .cache()
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.AUTOTUNE)
)

# **Custom Training**

In [16]:
# NOTE(review): total_steps is the number of batches in a single pass over
# FilteredData - confirm that this is the intended decay horizon when training
# for several epochs.
learning_rate = WarmupLearningRateSchedule(
    initial_lr=8e-5,
    warmup_steps=500,
    total_steps=len(FilteredData) // BATCH_SIZE,
)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Token-level accuracy, and cross-entropy computed directly on the logits.
accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

LLM = Decoder(dff, d_model, num_heads, vocab_size, numLayers, seq_len, rate)

In [None]:
@tf.function
def train_step(para, target):
    """Run one optimization step on a batch and return its loss.

    Args:
        para: input token ids for the batch.
        target: token ids the model should predict at each position.

    Returns:
        The loss tensor for this batch. The running ``accuracy`` metric is
        updated as a side effect.
    """
    attn_mask = Mask(para)
    with tf.GradientTape() as tape:
        logits, _ = LLM(para, attn_mask, training=True)
        loss = loss_object(target, logits)

    variables = LLM.trainable_variables
    grads = tape.gradient(loss, variables)
    # Clip every gradient element into [-1, 1] before applying the update.
    clipped = [tf.clip_by_value(g, -1.0, 1.0) for g in grads]
    optimizer.apply_gradients(zip(clipped, variables))

    accuracy.update_state(target, logits)
    return loss

EPOCHS = 15

for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    for step, (para, target) in enumerate(Dataset):
        loss = train_step(para, target)
        # Progress report every 1500 batches (including the very first one).
        if step % 1500 != 0:
            continue
        print(f"Batch:{step} | Loss: {loss.numpy()} | Accuracy: {accuracy.result().numpy()}")

    # Read the end-of-epoch numbers once: the loss of the last batch and the
    # accuracy accumulated over the whole epoch.
    epoch_loss = loss.numpy()
    epoch_acc = accuracy.result().numpy()
    print(f"Loss: {epoch_loss} | Accuracy: {epoch_acc}\n")

    # Checkpoint the model in both formats, tagging the file with the metrics.
    LLM.save(fr'/kaggle/working/NiftyEpoch{epoch}_{np.round(epoch_loss,3)}_{np.round(epoch_acc*100,2)}.keras')
    LLM.save(fr'/kaggle/working/NiftyEpoch{epoch}_{np.round(epoch_loss,3)}_{np.round(epoch_acc*100,2)}.h5')

    # Start the next epoch with a fresh accuracy accumulator.
    accuracy.reset_state()

In [None]:
# Run a single example (row 1, last token dropped) through the model.
# training=True means the dropout layers stay active for this call.
sample_tokens = FilteredData[1:2][:, :-1]
LLM(sample_tokens, Mask(sample_tokens), training=True)

In [None]:
# Write the final model to the working directory in both formats.
for model_path in (r'/kaggle/working/Nifty.keras', r'/kaggle/working/Nifty.h5'):
    LLM.save(model_path)