# **Import Necessary Libraries**

---



---



In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from google.colab import drive
drive.mount("/content/drive")
import spacy

Mounted at /content/drive


# **Import Data**

---



---



In [None]:
# Display names for the four classes (not referenced by the cells shown below).
Label = ["World", "Sports", "Business", "Sci/Tech"]

# spaCy English pipeline; the notebook uses it to split text into tokens.
nlp = spacy.load("en_core_web_sm")

# Training CSV read from the mounted Google Drive folder.
Data = pd.read_csv(
    "/content/drive/MyDrive/AI/Dataset/AG_News-Classification/train.csv"
)

# **Collect Tokens from Dataset**

---



---



In [None]:
# Tokenise the first `num_rows` titles and descriptions.
#   Title / Description   : flat lists of every token text seen (with repeats)
#   Titlelen / desLen     : total token counts, used later for the average length
num_rows=15000
Title=[]
Description=[]
Titlelen=0
desLen=0
for i in range(num_rows):
  if i%100==0:
    print(i)  # coarse progress indicator
  # Only token boundaries are needed here, so run the tokenizer alone
  # (nlp.make_doc) instead of the whole pipeline; the remaining components
  # annotate tokens and are not expected to re-split them.
  data=nlp.make_doc(Data['Title'][i])
  Titlelen+=len(data)
  Title.extend(j.text for j in data)
  data2=nlp.make_doc(Data['Description'][i])
  desLen+=len(data2)
  Description.extend(k.text for k in data2)

# **Index Collected Tokens Uniquely**

---



---



In [None]:
# Vocabularies as ordered lists: a token's position in the list is its id.
#
# Index 0 is reserved for PAD_TOKEN.  `pad` fills short sequences with 0,
# `search` returns 0 for words it cannot find, and the mask functions treat
# id 0 as padding; without a reserved slot an arbitrary real token would also
# get id 0 and be masked out of attention.
#
# Sorting makes the ids reproducible between kernel restarts, which the
# iteration order of a set of strings is not.
PAD_TOKEN="<pad>"
uniqTitle=[PAD_TOKEN]+sorted(set(Title)-{PAD_TOKEN})
uniqDescription=[PAD_TOKEN]+sorted(set(Description)-{PAD_TOKEN})

# **Search word index in vocab**

---



---



In [None]:
def search(vocab,word):
  """Return the integer id of `word` in `vocab`, or 0 when it is absent.

  Args:
    vocab: either a sequence (list/tuple) whose positions are the ids, or a
      dict mapping word -> id.
    word: the token text to look up.

  Returns:
    The position of the first occurrence of `word` in a sequence vocab, the
    mapped value for a dict vocab, and 0 if the word is not present.
  """
  if isinstance(vocab,dict):
    return vocab.get(word,0)
  # A single scan: `word in vocab` followed by `vocab.index(word)` walked the
  # list twice for every hit.
  try:
    return vocab.index(word)
  except ValueError:
    return 0

# **Pad and Convert the Text into a Numerical Dataset Suitable for Training**

---



---
  **Like:-**   
         [16918, 15949,  8012,  6428, 12855,  5790,  8017]


In [None]:
def pad(seqlen,data,vocab,num_rows,increment=0):
  """Tokenise rows of `data` and turn them into fixed-length id sequences.

  Args:
    seqlen: length of every output row; longer token sequences are truncated,
      shorter ones are right-padded with 0.
    data: indexable collection of strings, read as data[i+increment].
    vocab: sequence of token texts; a token's id is the position of its first
      occurrence. Tokens not in `vocab` get id 0.
    num_rows: number of rows to convert.
    increment: offset added to the row index (default 0).

  Returns:
    np.ndarray built from `num_rows` lists of `seqlen` integer ids.
  """
  # Build the token -> id table once.  Looking each token up with
  # list.index costs O(len(vocab)) per token; setdefault keeps the first
  # position so the ids match what list.index would return.
  index={}
  for position,token in enumerate(vocab):
    index.setdefault(token,position)

  paddedData=[]
  for i in range(num_rows):
    # Tokenizer only: just the token texts are needed here.
    doc=nlp.make_doc(data[i+increment])
    ids=[index.get(t.text,0) for t in doc][:seqlen]
    ids+=[0]*(seqlen-len(ids))  # no-op when the row is already full length
    paddedData.append(ids)
  return np.array(paddedData)

# **Apply padding to Dataset**

---



---



In [None]:
# Fixed sequence lengths for padding/truncation, derived from the mean number
# of tokens per row (integer division; the description length is bumped by one).
AvgDescription = 1 + desLen // num_rows  #  38
AvgTitle = Titlelen // num_rows  #  7

In [None]:
# Convert the same `num_rows` rows that were used to build the vocabularies.
# (The row count was previously hard-coded to 15000, which silently went out
# of sync whenever `num_rows` was changed.)
paddedTitle=pad(AvgTitle,Data['Title'],list(uniqTitle),num_rows)
paddedDescription=pad(AvgDescription,Data['Description'],list(uniqDescription),num_rows)

# **Build Custom Designed Attention Layer with mask**

---



---



In [None]:
class Attention(tf.keras.layers.Layer):
  """Self-attention sub-layer with dropout, a residual connection and layer norm.

  Queries, keys and values are all Dense projections of the same input `X`.

  Args:
    num_heads: number of attention heads; must divide `d_model`.
    d_model: width of the Q/K/V projections (and of the residual input).
    rate: dropout rate applied to the attention output.
  """

  def __init__(self,num_heads,d_model,rate=0.1):
    super(Attention,self).__init__()
    self.d_model=d_model
    self.num_heads=num_heads
    assert d_model%num_heads==0,"d_model must be divisible by num_heads"
    self.depth=d_model//self.num_heads  # width of one head
    self.K=tf.keras.layers.Dense(d_model)
    self.Q=tf.keras.layers.Dense(d_model)
    self.V=tf.keras.layers.Dense(d_model)
    self.norm=tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.dropout=tf.keras.layers.Dropout(rate)

  def SplitHeads(self,X,batch_size):
    """Reshape (batch, seq, d_model) into (batch, num_heads, seq, depth)."""
    X=tf.reshape(X,(batch_size,-1,self.num_heads,self.depth))
    return tf.transpose(X,[0,2,1,3])

  def ComputeAttention(self,Q,K,V,mask):
    """Scaled dot-product attention.

    Args:
      Q, K, V: query/key/value tensors whose last two axes are (seq, features).
      mask: tensor broadcastable to the score tensor with 1.0 at positions to
        hide and 0.0 elsewhere, or None for no masking.

    Returns:
      (output, soft): the attention-weighted values and the softmax weights.
    """
    scores=tf.matmul(Q,K,transpose_b=True)
    # Scale by the size of the key vectors that actually entered the dot
    # product: `depth` when heads are split, `d_model` when they are not.
    # Using d_model unconditionally over-flattened the multi-head softmax.
    dk=tf.cast(tf.shape(K)[-1],tf.float32)
    scores=scores/tf.math.sqrt(dk)
    if mask is not None:
      # Applied after scaling so the penalty is not shrunk along with the scores.
      scores+=mask*-1e9
    soft=tf.nn.softmax(scores,axis=-1)
    output=tf.matmul(soft,V)
    return output,soft

  def call(self,X,mask,training=False,split=False):
    """Apply self-attention to `X`.

    Args:
      X: input tensor; its last axis must be `d_model` wide for the residual add.
      mask: passed through to ComputeAttention.
      training: enables dropout when True.
      split: NOTE the inverted meaning - when True the projections are used
        as-is (a single head over the full width); when not True they are
        split into `num_heads` heads and merged back afterwards.

    Returns:
      (output, soft): the normalised residual output and the attention weights.
    """
    batch_size=tf.shape(X)[0]
    K=self.K(X)
    Q=self.Q(X)
    V=self.V(X)
    if split is not True:
      K=self.SplitHeads(K,batch_size)
      Q=self.SplitHeads(Q,batch_size)
      V=self.SplitHeads(V,batch_size)
    output,soft=self.ComputeAttention(Q,K,V,mask)
    if split is not True:
      # Undo SplitHeads: back to (batch, seq, d_model).
      output=tf.transpose(output,[0,2,1,3])
      output=tf.reshape(output,(batch_size,-1,self.d_model))
    output=self.dropout(output,training=training)
    output=self.norm(output+X)
    return output,soft

# **Build a Pointwise Feed Forward Layer**

---



---



In [None]:
class PointwiseFFN(tf.keras.layers.Layer):
  """Two-layer feed-forward block followed by dropout, a residual add and layer norm.

  Args:
    dff: width of the hidden (ReLU) layer.
    d_model: width of the output layer; must match the input's last axis so the
      residual add is valid.
    rate: dropout rate.
  """

  def __init__(self,dff,d_model,rate=0.1):
    super().__init__()
    self.dense1=tf.keras.layers.Dense(units=dff,activation='relu')
    self.dense2=tf.keras.layers.Dense(units=d_model)
    self.dropout=tf.keras.layers.Dropout(rate=rate)
    self.norm=tf.keras.layers.LayerNormalization(epsilon=1e-6)

  def call(self,X,training=False):
    """Return LayerNorm(X + Dropout(dense2(dense1(X))))."""
    hidden=self.dense1(X)
    projected=self.dense2(hidden)
    projected=self.dropout(projected,training=training)
    return self.norm(projected+X)

# **Build the Encoder Layers from the Attention and Pointwise FFN Layers**

---



---



In [None]:
class EncoderLayer1(tf.keras.layers.Layer):
  """One encoder block: multi-head self-attention followed by the feed-forward block."""

  def __init__(self,dff,d_model,num_heads,rate):
    super().__init__()
    self.attention=Attention(num_heads=num_heads,d_model=d_model,rate=rate)
    self.ffn=PointwiseFFN(dff=dff,d_model=d_model,rate=rate)

  def call(self,X,mask,training=False):
    """Return (block output, attention weights) for input `X`."""
    attended,weights=self.attention(X,mask,training=training)
    return self.ffn(attended,training=training),weights

In [None]:
class EncoderLayer2(tf.keras.layers.Layer):
  """Encoder block whose attention is run with split=True (heads are not split)."""

  def __init__(self,dff2,d_model2,num_heads2,rate):
    super().__init__()
    self.attention=Attention(num_heads=num_heads2,d_model=d_model2,rate=rate)
    self.ffn=PointwiseFFN(dff=dff2,d_model=d_model2,rate=rate)

  def call(self,X,mask,training=False):
    """Return (block output, attention weights) for input `X`."""
    attended,weights=self.attention(X,mask,training=training,split=True)
    return self.ffn(attended,training=training),weights

# **Create Encoder Architecture by leveraging Embedding and positional Encoding**

---



---



In [None]:
class Encoder1(tf.keras.layers.Layer):
  """Embedding + sinusoidal positions + a stack of EncoderLayer1 blocks + a ReLU Dense.

  Args:
    num_layers: number of EncoderLayer1 blocks.
    dff: hidden width of each block's feed-forward part.
    d_model: embedding width and block width.
    seq_len: number of positions pre-computed in the positional table.
    num_heads: attention heads per block.
    vocab_size: size of the embedding table.
    rate: dropout rate passed to every block.
  """

  def __init__(self,num_layers,dff,d_model,seq_len,num_heads,vocab_size,rate):
    super().__init__()
    self.num_layers=num_layers
    self.d_model=d_model
    self.embedding=tf.keras.layers.Embedding(vocab_size,d_model)
    self.PosEncoding=self.PEncoding(d_model,seq_len)
    self.layers=[EncoderLayer1(dff,d_model,num_heads,rate) for _ in range(num_layers)]
    self.dense1=tf.keras.layers.Dense(d_model,activation='relu')

  def PEncoding(self,d_model,num_seq):
    """Build the (1, num_seq, d_model) float32 table: sin on even columns, cos on odd."""
    positions=np.arange(num_seq)[:,np.newaxis]
    columns=np.arange(d_model)[np.newaxis,:]
    table=self.GetAngle(positions,columns,d_model)
    table[:,0::2]=np.sin(table[:,0::2])
    table[:,1::2]=np.cos(table[:,1::2])
    return tf.cast(table[np.newaxis,...],tf.float32)

  def GetAngle(self,pos,i,d_model):
    """Return pos / 10000^(2*(i//2)/d_model)."""
    exponent=2*(i//2)/np.float32(d_model)
    return pos*(1/np.power(10000,exponent))

  def call(self,input,mask,training=False):
    """Encode a batch of token ids.

    Returns:
      (output, Attention_weights) where Attention_weights maps
      'encoder1_layer<k>' (k starting at 1) to that block's attention weights.
    """
    length=tf.shape(input)[1]
    scale=tf.math.sqrt(tf.cast(self.d_model,tf.float32))
    hidden=self.embedding(input)
    hidden*=scale
    hidden+=self.PosEncoding[:,:length,:]
    Attention_weights={}
    for number in range(1,self.num_layers+1):
      hidden,weights=self.layers[number-1](hidden,mask,training=training)
      Attention_weights['encoder1_layer{}'.format(number)]=weights
    return self.dense1(hidden),Attention_weights

In [None]:
class Encoder2(tf.keras.layers.Layer):
  """Embedding + sinusoidal positions + a single EncoderLayer2 block + a ReLU Dense.

  Args:
    dff2: hidden width of the block's feed-forward part.
    d_model: width of the final Dense output.
    d_model2: embedding width and block width.
    seq_len2: number of positions pre-computed in the positional table.
    num_heads2: forwarded to EncoderLayer2.
    vocab_size2: size of the embedding table.
    rate: dropout rate passed to the block.
  """

  def __init__(self,dff2,d_model,d_model2,seq_len2,num_heads2,vocab_size2,rate):
    super().__init__()
    self.d_model=d_model2
    self.embedding=tf.keras.layers.Embedding(vocab_size2,d_model2)
    self.PosEncoding=self.PEncoding(d_model2,seq_len2)
    self.layers=EncoderLayer2(dff2,d_model2,num_heads2,rate)
    self.dense1=tf.keras.layers.Dense(d_model,activation='relu')

  def PEncoding(self,d_model2,seq_len2):
    """Build the (1, seq_len2, d_model2) float32 table: sin on even columns, cos on odd."""
    positions=np.arange(seq_len2)[:,np.newaxis]
    columns=np.arange(d_model2)[np.newaxis,:]
    table=self.GetAngle(positions,columns,d_model2)
    table[:,0::2]=np.sin(table[:,0::2])
    table[:,1::2]=np.cos(table[:,1::2])
    return tf.cast(table[np.newaxis,...],tf.float32)

  def GetAngle(self,pos,i,d_model2):
    """Return pos / 10000^(2*(i//2)/d_model2)."""
    exponent=2*(i//2)/np.float32(d_model2)
    return pos*(1/np.power(10000,exponent))

  def call(self,input,mask,training=False):
    """Encode a batch of token ids.

    Returns:
      (output, Attention_weights) where Attention_weights has the single key
      'encoder2_layer1'.
    """
    length=tf.shape(input)[1]
    scale=tf.math.sqrt(tf.cast(self.d_model,tf.float32))
    hidden=self.embedding(input)
    hidden*=scale
    hidden+=self.PosEncoding[:,:length,:]
    hidden,weights=self.layers(hidden,mask,training=training)
    Attention_weights={'encoder2_layer1':weights}
    return self.dense1(hidden),Attention_weights

# **Assemble the Model with Global Average Pooling (1D) and Feed-Forward Layers**

---



---



In [None]:
class Model(tf.keras.Model):
    """Two-branch classifier.

    `input1` goes through Encoder1 and `input2` through Encoder2; each branch is
    average-pooled over the sequence axis, the two results are concatenated and
    passed through Dense(ReLU) -> Dropout -> Dense(4, softmax).
    """

    def __init__(self, dff, dff2, d_model, d_model2, seq_len, seq_len2, num_heads, num_heads2, vocab_size, vocab_size2, num_layers, rate):
        super().__init__()
        self.encoder1 = Encoder1(num_layers, dff, d_model, seq_len, num_heads, vocab_size, rate)
        self.encoder2 = Encoder2(dff2, d_model, d_model2, seq_len2, num_heads2, vocab_size2, rate)
        self.pool1 = tf.keras.layers.GlobalAveragePooling1D()
        self.pool2 = tf.keras.layers.GlobalAveragePooling1D()
        self.concat = tf.keras.layers.Concatenate()
        self.dense1 = tf.keras.layers.Dense(d_model, activation='relu')
        self.dropout = tf.keras.layers.Dropout(rate)
        self.dense = tf.keras.layers.Dense(4, activation='softmax')

    def call(self, input1, input2, mask, mask2,training=False):
        """Return (class probabilities, encoder1 attention weights, encoder2 attention weights)."""
        encoded1, weights1 = self.encoder1(input1, mask, training=training)
        encoded2, weights2 = self.encoder2(input2, mask2, training=training)
        pooled1 = self.pool1(encoded1)
        pooled2 = self.pool2(encoded2)
        merged = self.concat([pooled1, pooled2])
        hidden = self.dropout(self.dense1(merged), training=training)
        return self.dense(hidden), weights1, weights2

# **Define Model**

---



---



In [None]:
# Hyper-parameters.  Names ending in "2" configure the title branch (Encoder2);
# the others configure the description branch (Encoder1).
dff=64
dff2=16
d_model=48
d_model2=12
num_heads=4
num_heads2=1
vocab_size=len(uniqDescription)
vocab_size2=len(uniqTitle)
num_layers=3
rate=0.1
# Take the sequence lengths from the padded arrays instead of hard-coding
# 38 and 7: the positional-encoding tables are sized from these values, so a
# stale number breaks the add in the encoders as soon as `num_rows` or the
# data changes the computed averages.
seq_len=paddedDescription.shape[1]
seq_len2=paddedTitle.shape[1]

model=Model(dff, dff2, d_model, d_model2, seq_len, seq_len2, num_heads, num_heads2, vocab_size, vocab_size2, num_layers, rate)

# **Define Loss function, Optimizer and Accuracy**

---



---



In [None]:
# The model's last layer already applies softmax, so the loss receives
# probabilities, not logits.  from_logits must therefore be False; with True
# Keras emits the "_get_logits" warning seen in the training output.
# reduction='none' keeps one loss value per sample; the training loop
# aggregates them itself.
loss_object=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False,reduction='none')
optimizer=tf.keras.optimizers.Adam(learning_rate=0.001)
Accuracy=tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')

# **Build Mask Function**

---



---



In [None]:
def mask(seq):
    """Flag padding for the multi-head branch.

    Returns a float32 tensor that is 1.0 where `seq` equals 0 and 0.0 elsewhere,
    with two singleton axes inserted after the first axis.
    """
    is_padding = tf.math.equal(seq, 0)
    flags = tf.cast(is_padding, tf.float32)
    return flags[:, tf.newaxis, tf.newaxis, :]

def mask2(seq):
    """Flag padding for the branch whose attention is not split into heads.

    Returns a float32 tensor that is 1.0 where `seq` equals 0 and 0.0 elsewhere,
    with one singleton axis inserted after the first axis.
    """
    is_padding = tf.math.equal(seq, 0)
    flags = tf.cast(is_padding, tf.float32)
    return flags[:, tf.newaxis, :]

# **Prepare the Title, Description and Label Tensors for Training**

---



---



In [None]:
# Batching configuration: shuffle over the whole subset, 12 samples per batch.
BATCH_SIZE=12
BUFFER_SIZE=num_rows

# Inputs as tensors.
Titl=tf.convert_to_tensor(paddedTitle)
Des=tf.convert_to_tensor(paddedDescription)
# Targets: 'Class Index' shifted down by one for the first `num_rows` rows.
feat=tf.convert_to_tensor(Data['Class Index'][:num_rows]-1)

# Each element is (label, description ids, title ids); incomplete final
# batches are dropped.
Dataset=(
    tf.data.Dataset.from_tensor_slices((feat,Des,Titl))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE,drop_remainder=True)
)

# **Define Custom Training Loop with GradientTape**

---



---



In [None]:
@tf.function
def train_step(fea,des,titl):
    """Run one optimisation step on a batch.

    Args:
        fea: target class ids for the batch.
        des: description token ids.
        titl: title token ids.

    Returns:
        The loss tensor produced by `loss_object` for this batch.
    """
    des_mask=tf.convert_to_tensor(mask(des))
    titl_mask=tf.convert_to_tensor(mask2(titl))

    with tf.GradientTape() as tape:
        outputs = model(des,titl,des_mask, titl_mask, training=True)
        predictions = outputs[0]  # the attention-weight dicts are not needed here
        loss = loss_object(fea, predictions)

    variables = model.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    # Updates the running accuracy metric as a side effect.
    Accuracy(fea, predictions)

    return loss


# **Train Model**

---



---



In [None]:
EPOCHS = 4

for epoch in range(EPOCHS):
    print(f'Starting epoch {epoch+1}/{EPOCHS}')
    total_loss = 0.0   # sum of per-batch mean losses for this epoch
    num_batches = 0

    for (batch, (fea,des,titl)) in enumerate(Dataset):
        batch_loss = train_step(fea,des,titl)
        # train_step returns one loss per sample; reduce to the batch mean.
        temp_loss = float(np.mean(batch_loss))
        total_loss += temp_loss
        num_batches += 1

        if batch % 400 == 0:
            print(f'Epoch {epoch+1} || Batch {batch} || Loss {float(temp_loss):.4f} ~')

    # total_loss is a sum of batch means, so the epoch average divides by the
    # number of batches.  Dividing by num_rows under-reported the loss by a
    # factor of BATCH_SIZE.
    epoch_loss = total_loss / num_batches if num_batches else float('nan')
    print(f'Epoch {epoch+1} Loss {float(epoch_loss):.4f} Accuracy {Accuracy.result():.4f}')
    Accuracy.reset_state()


Starting epoch 1/4


  output, from_logits = _get_logits(


Epoch 1 || Batch 0 || Loss 1.6723 ~
Epoch 1 || Batch 400 || Loss 0.1289 ~
Epoch 1 || Batch 800 || Loss 0.3035 ~
Epoch 1 || Batch 1200 || Loss 0.2458 ~
Epoch 1 Loss 0.0473 Accuracy 0.7791
Starting epoch 2/4
Epoch 2 || Batch 0 || Loss 0.1703 ~
Epoch 2 || Batch 400 || Loss 0.5128 ~
Epoch 2 || Batch 800 || Loss 0.0991 ~
Epoch 2 || Batch 1200 || Loss 0.0667 ~
Epoch 2 Loss 0.0145 Accuracy 0.9439
Starting epoch 3/4
Epoch 3 || Batch 0 || Loss 0.0122 ~
Epoch 3 || Batch 400 || Loss 0.0034 ~
Epoch 3 || Batch 800 || Loss 0.1532 ~
Epoch 3 || Batch 1200 || Loss 0.0021 ~
Epoch 3 Loss 0.0060 Accuracy 0.9775
Starting epoch 4/4
Epoch 4 || Batch 0 || Loss 0.0016 ~
Epoch 4 || Batch 400 || Loss 0.0476 ~
Epoch 4 || Batch 800 || Loss 0.0025 ~
Epoch 4 || Batch 1200 || Loss 0.0007 ~
Epoch 4 Loss 0.0035 Accuracy 0.9870
