![Transformer-architecture](https://github.com/greyhatguy007/deep-learning-specialization/blob/main/C5-sequence-models/week4/C5W4A1/transformer.png?raw=true)

![](https://www.mdpi.com/mathematics/mathematics-11-04960/article_deploy/html/images/mathematics-11-04960-g001.png)

In [44]:
import tensorflow as tf
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


from tensorflow import keras
from keras.layers import Embedding, MultiHeadAttention, Dense, Input, Dropout, LayerNormalization
from transformers import DistilBertTokenizerFast #, TFDistilBertModel
from transformers import TFDistilBertForTokenClassification

In [45]:
def get_angles(position,k,d):
    '''
    position : col vec (positions)
    k : row vec (dimention span)
    d : int (encoder size)

    Return :-
    angles : (position, d) np.array
    '''
    i = k//2
    angles = position / np.power(10000, 2 * i / d)
    
    return angles

In [46]:
def positional_encoding(positions, d): # sin => i even , cos => i odd
    '''
    positions : int (max num of positions to be encoded)
    d : int (encoder size)

    Return :-
    pos_encoding : (1, position, d_model) matrix
    '''
    angles = get_angles(np.arange(positions)[:, np.newaxis],
                        np.arange(d)[np.newaxis, :],
                        d)
    
    # even -> sin
    angles[:, 0::2] = np.sin(angles[:, 0::2])

    # odd -> cos
    angles[:, 0::1] = np.sin(angles[:, 0::1])

    pos_encoding = angles[np.newaxis, ...] # The ... is a placeholder that indicates that all the existing axes from the angles array should be retained.

    return tf.cast(pos_encoding ,dtype=tf.float32) #to convert datatype

In [47]:
def padding_mask(decoder_ids):
    '''
    decoder_ids : matrix (n,m)

    Return :-
    mask : (n, 1, m) binary tensor
    '''
    seq = 1 - tf.cast(tf.math.equal(decoder_ids, 0), tf.float32)

    seq = seq[:, np.newaxis, :] # padding 0

    # to pad -inf -> x + (1 - padding_mask(x)) * -1.0e9)
    return seq

The look-ahead mask helps your model pretend that it correctly predicted a part of the output and see if, without looking ahead, it can correctly predict the next output.

In [48]:
def look_ahead_mask(seq_len):
    '''
    seq_len : matrix size

    Return :-
    mask : (size, size) tensor => lower triangular matrix filled with ones
    '''
    mask = tf.linalg.band_part(tf.ones((1, seq_len, seq_len)), -1, 0)
    return mask

Attention(Q, K, V)= *softmax( [ (Q * K^T) / sqrt(dk) ] + M )* * *V*

*Q* is the matrix of queries

*K* is the matrix of keys

*V* is the matrix of values

*M* is the optional mask you choose to apply

*dk* is the dimension of the keys, which is used to scale everything down so the softmax doesn't explode

In [49]:
def scaled_dot_product_attention(q, k, v, mask):
    '''
    q : query shape == (..., seq_len_q, depth)
    k : key shape == (..., seq_len_k, depth)
    v : value shape == (..., seq_len_v, depth_v)
    mask: Float tensor with shape broadcastable to (..., seq_len_q, seq_len_k).

    Return :-
    attention ,attention_weights
    '''

    qk = tf.matmul(q, k, transpose_b = True) # matrix_multiplication

    dk = tf.cast(tf.shape(k)[-1], tf.float32)

    scaled_attention = qk / tf.math.sqrt(dk)

    if mask is None:
        scaled_attention += ((1 - mask) * -1.0e9)
    
    attention_weights = tf.nn.softmax(scaled_attention, axis= -1)
    attention = tf.matmul(attention_weights, v)

    return attention, attention_weights

In [50]:
def FullyConnected(embedding_dim, fully_connected_dim):
    return tf.keras.Sequential([
        tf.keras.layers.Dense(fully_connected_dim, activation='relu'),  # (batch_size, seq_len, dff)
        tf.keras.layers.Dense(embedding_dim)  # (batch_size, seq_len, d_model)
    ])

In [51]:
class EncoderLayer(tf.keras.layers.Layer):
    
    def __init__(self, embedding_dim, num_heads, fully_connected_dim, 
                 dropout_rate=0.1, layernorm_eps=1e-6):
        
        super(EncoderLayer, self).__init__()

        self.multi_head_attention = MultiHeadAttention(num_heads=num_heads,
                                      key_dim=embedding_dim,
                                      dropout=dropout_rate)

        self.ffn = FullyConnected(embedding_dim=embedding_dim,
                                  fully_connected_dim=fully_connected_dim)

        self.layernorm1 = LayerNormalization(epsilon=layernorm_eps)
        self.layernorm2 = LayerNormalization(epsilon=layernorm_eps)

        self.dropout_ffn = Dropout(dropout_rate)


    def fit(self, x, training, mask):
        '''
        x : tensor of shape (batch_size, input_seq_len, fully_connected_dim)
        training : boolean, set to true to activate the training mode for dropout layers
        mask : boolean mask to ensure that the padding is not treated as part of the input

        Return :-
        encoder_output : tensor of shape (batch_size, input_seq_len, fully_connected_dim)
        '''

        attention_output, attention_weights = self.multi_head_attention(x, x, x, mask)

        output1 = self.layernorm1(x + attention_output)

        ffn_output =self.ffn(output1)
        ffn_output = self.dropout_ffn(ffn_output, training=training)

        encoder_output = self.layernorm2(output1 + ffn_output)

        return encoder_output, attention_weights

In [52]:
class Encoder(tf.keras.layers.Layer):

    def __init__(self, num_layers, embedding_dim, num_heads, fully_connected_dim, input_vocab_size, 
                 maximum_position_encoding, dropout_rate=0.1, layernorm_eps=1e-6):
        
        super(Encoder, self).__init__()

        self.embedding_dim = embedding_dim
        self.num_layers = num_layers

        self.embedding = Embedding(input_vocab_size, self.embedding_dim)
        self.positional_encoding = positional_encoding(maximum_position_encoding, 
                                                self.embedding_dim)


        self.encoder_layers = [EncoderLayer(embedding_dim=self.embedding_dim,
                                        num_heads=num_heads,
                                        fully_connected_dim=fully_connected_dim,
                                        dropout_rate=dropout_rate,
                                        layernorm_eps=layernorm_eps) 
                           for _ in range(self.num_layers)]

        self.dropout = Dropout(dropout_rate)
        
    def fit(self, x, training, mask):
        # x : tensor of shape (batch_size, input_seq_len)
        seq_len = tf.shape(x)[-1]

        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.embedding_dim, tf.float32))

        # Add the position encoding to embedding
        x += self.positional_encoding[:, :seq_len, :]

        x = self.dropout(x, training= training)

        attention_weights = []
        for i in range(self.num_layers):
            x, w = self.encoder_layers[i](x, training, mask)
            attention_weights.append(w)

        return x, attention_weights

In [53]:
class DecoderLayer(tf.keras.layers.Layer):

    def __init__(self, embedding_dim, num_heads, fully_connected_dim, dropout_rate=0.1, layernorm_eps=1e-6):
        
        super(DecoderLayer, self).__init__()

        self.multi_head_attention1 = MultiHeadAttention(num_heads=num_heads,
                                      key_dim=embedding_dim,
                                      dropout=dropout_rate)

        self.multi_head_attention2 = MultiHeadAttention(num_heads=num_heads,
                                      key_dim=embedding_dim,
                                      dropout=dropout_rate)

        self.ffn = FullyConnected(embedding_dim=embedding_dim,
                                  fully_connected_dim=fully_connected_dim)

        self.layernorm1 = LayerNormalization(epsilon=layernorm_eps)
        self.layernorm2 = LayerNormalization(epsilon=layernorm_eps)
        self.layernorm3 = LayerNormalization(epsilon=layernorm_eps)

        self.dropout_ffn = Dropout(dropout_rate)
    
    def fit(self, x, encoder_output, training, look_ahead_mask, padding_mask):
        
        # BLOCK 1

        multi_attention_output1, attention_weights_block1 = self.multi_head_attention1(x, x, x, look_ahead_mask, return_attention_scores=True)  # (batch_size, target_seq_len, d_model)
        q1 = self.layernorm1(multi_attention_output1 + x)
        
        # BLOCK 2

        multi_attention_output2, attention_weights_block2 = self.multi_head_attention2(q1, encoder_output, encoder_output, padding_mask, return_attention_scores=True)  # (batch_size, target_seq_len, d_model)
        multi_attention_output2 = self.layernorm2(multi_attention_output2 + q1)

        # BLOCK 3

        fully_connected_output = self.ffn(multi_attention_output2)
        fully_connected_output = self.dropout_ffn(fully_connected_output, training= training)

        output3 = self.layernorm3(fully_connected_output + multi_attention_output2)

        return output3, attention_weights_block1, attention_weights_block2


In [54]:
class Decoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, embedding_dim, num_heads, fully_connected_dim, input_vocab_size,
               maximum_position_encoding, dropout_rate=0.1, layernorm_eps=1e-6):
        
        super(Decoder, self).__init__()

        self.embedding_dim = embedding_dim
        self.num_layers = num_layers

        self.embedding = Embedding(input_vocab_size, self.embedding_dim)
        self.positional_encoding = positional_encoding(maximum_position_encoding, self.embedding_dim)

        self.decoder_layers = [DecoderLayer(embedding_dim=self.embedding_dim,
                                        num_heads=num_heads,
                                        fully_connected_dim=fully_connected_dim,
                                        dropout_rate=dropout_rate,
                                        layernorm_eps=layernorm_eps) 
                           for _ in range(self.num_layers)]
        self.dropout = Dropout(dropout_rate)
    
    def fit(self, x, encoder_output, training, look_ahead_mask, padding_mask):
        # x : tensor of shape (batch_size, input_seq_len, fully_connected_dim)
        seq_len = tf.shape(x)[-1]
        attention_weights = {}

        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.embedding_dim, tf.float32))

        # Add the position encoding to embedding
        x += self.positional_encoding[:, :seq_len, :]

        x = self.dropout(x, training= training)

        for i in range(self.num_layers):
            x, attention_weights_block1, attention_weights_block2 = self.decoder_layers[i](x, 
                                                                                           encoder_output,
                                                                                           training,
                                                                                           look_ahead_mask,
                                                                                           padding_mask)
            #update attention_weights dictionary with the attention weights of block 1 and block 2
            attention_weights['decoder_layer{}_block1_self_att'.format(i+1)] = attention_weights_block1
            attention_weights['decoder_layer{}_block2_decenc_att'.format(i+1)] = attention_weights_block2

        return x, attention_weights

In [55]:
class Transformer(tf.keras.Model):
    def __init__(self, num_layers, embedding_dim, num_heads, fully_connected_dim, input_vocab_size, 
                max_positional_encoding, dropout_rate=0.1, layernorm_eps=1e-6, pad_id=0):
        
        super(Transformer, self).__init__()

        self.pad_id = pad_id
        self.positional_encoding = positional_encoding(max_positional_encoding + 1, embedding_dim)
        

        self.encoder = Encoder(num_layers=num_layers,
                               embedding_dim=embedding_dim,
                               num_heads=num_heads,
                               fully_connected_dim=fully_connected_dim,
                               input_vocab_size=input_vocab_size,
                               maximum_position_encoding=max_positional_encoding,
                               dropout_rate=dropout_rate,
                               layernorm_eps=layernorm_eps)

        # self.decoder = Decoder(num_layers=num_layers, 
        #                        embedding_dim=embedding_dim,
        #                        num_heads=num_heads,
        #                        fully_connected_dim=fully_connected_dim,
        #                        input_vocab_size=input_vocab_size, 
        #                        maximum_position_encoding=max_positional_encoding,
        #                        dropout_rate=dropout_rate,
        #                        layernorm_eps=layernorm_eps)

        # layers to classify
        self.linear = tf.nn.Linear(embedding_dim, 3)
        self.softmax = tf.nn.Softmax(dim=-1)
        #self.final_layer = Dense(input_vocab_size, activation='softmax')

    def fit(self, input_sentence, training, encoder_padding_mask, look_ahead_mask=False, decoder_padding_mask=False):
        '''
        input_sentence : tensor of shape (batch_size, input_seq_len, fully_connected_dim)
                              an array of the indexes of the words in the input sentence
        
        training : boolean, set to true to activate the training mode for dropout layers
        
        enc_padding_mask : boolean mask to ensure that the padding is not treated as part of the input
        
        look_ahead_mask : boolean mask for the target_input
        
        padding_mask : boolean mask for the second multihead attention layer

        Return :-

        final_output
        attention_weights - Dictionary of tensors containing all the attention weights for the decoder
                                each of shape Tensor of shape (batch_size, num_heads, target_seq_len, input_seq_len)
        '''
        seq_length = input_sentence.shape[1]
        batch_size = tf.shape(input_sentence)[0]

        # Repeat the positions along the batch dimension
        position_pad_mask = padding_mask(input_sentence)
        position_values = tf.range(tf.shape(input_sentence)[1]) + 1  
        positions = tf.where(position_pad_mask, tf.zeros_like(position_values), tf.cast(position_values, dtype=tf.float32))

        outputs = self.embedding(input_sentence) + self.pos_embedding(positions)

        encoder_output, attention_weights = self.encoder(input_sentence, training, encoder_padding_mask)# (batch_size, inp_seq_len, fully_connected_dim)
        
        #decoder_output, attention_weights = self.decoder(input_sentence, encoder_output, training, look_ahead_mask, decoder_padding_mask)

        #final_output = self.final_layer(decoder_output)
        max_values = tf.math.reduce_max(encoder_output, axis=1)
        indices = tf.argmax(encoder_output, axis=1)

        final_output = self.softmax(self.linear(max_values))

        return final_output, attention_weights
    

    def get_attention_padding_mask(self, q, k, pad_id):
        attn_pad_mask = k.eq(pad_id).unsqueeze(1).repeat(1, q.size(1), 1)
        # |attn_pad_mask| : (batch_size, q_len, k_len)

        return attn_pad_mask

In [56]:
data = pd.read_csv("preprocessed_train.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31246 entries, 0 to 31245
Data columns (total 5 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Unnamed: 0                  31246 non-null  int64 
 1   review_description          31246 non-null  object
 2   rating                      31246 non-null  int64 
 3   preprocessed_review         30897 non-null  object
 4   preprocessed_review_length  31246 non-null  int64 
dtypes: int64(3), object(2)
memory usage: 1.2+ MB


In [57]:
data.head()

Unnamed: 0.1,Unnamed: 0,review_description,rating,preprocessed_review,preprocessed_review_length
0,0,شركه زباله و سواقين بتبرشم و مفيش حتي رقم للشك...,-1,شرك زباله سواقين بتبرشم مفيش حت رقم للشكاوي سو...,130
1,1,خدمة الدفع عن طريق الكي نت توقفت عندي اصبح فقط...,1,خدم دفع طريق كي نت توقف عند صبح فقط دفع نقدا,44
2,2,تطبيق غبي و جاري حذفه ، عاملين اكواد خصم و لما...,-1,تطبيق غب جاري حذف عامل اكواد خصم استخدم اكترى ...,216
3,3,فعلا تطبيق ممتاز بس لو فى امكانية يتيح لمستخدم...,1,علا تطبيق ممتاز امكانيه أتاح مستخدم تطبيق ان ا...,87
4,4,سيء جدا ، اسعار رسوم التوصيل لا تمت للواقع ب ص...,-1,سيء جدا اسعار رسوم توصيل أمات واقع صل,37


In [None]:
from keras.utils import to_categorical
# Drop rows with NaN values in the 'preprocessed_review' column
data = data.dropna(subset=['preprocessed_review'])

# Convert ratings to one-hot encoded labels
labels = to_categorical(data['rating'] + 1)  # Adding 1 to convert -1, 0, 1 to 0, 1, 2

In [60]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['preprocessed_review'])
sequences_arabic = tokenizer.texts_to_sequences(data['preprocessed_review'])
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

7


In [None]:

transformer = Transformer(num_layers = 10, 
                        embedding_dim = 512, 
                        num_heads = 8, 
                        fully_connected_dim = 32, 
                        input_vocab_size = vocab_size, 
                        max_positional_encoding_input = 512)

training = True
encoder_padding_mask = 
model = transformer(sequences_arabic, training, encoder_padding_mask)

In [1]:
import pandas as pd
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
# Load the preprocessed dataset
df = pd.read_csv('preprocessed_train.csv')

# Drop rows with NaN values in the 'preprocessed_review' column
df = df.dropna(subset=['preprocessed_review'])

# Convert ratings to one-hot encoded labels
labels = to_categorical(df['rating'] + 1)  # Adding 1 to convert -1, 0, 1 to 0, 1, 2
print(labels, labels[0])
# Tokenize the Arabic text
tokenizer_arabic = Tokenizer()
tokenizer_arabic.fit_on_texts(df['preprocessed_review'])
sequences_arabic = tokenizer_arabic.texts_to_sequences(df['preprocessed_review'])
max_sequence_length = 100  # Set your desired sequence length
padded_sequences_arabic = pad_sequences(sequences_arabic, maxlen=max_sequence_length)



[[1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 ...
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]] [1. 0. 0.]


In [2]:
print(padded_sequences_arabic.shape)

(30897, 100)


In [28]:
class PositionalEncoding(tf.keras.layers.Layer):
    def __init__(self, max_sequence_length, embedding_dim):
        super(PositionalEncoding, self).__init__()
        self.max_sequence_length = max_sequence_length
        self.embedding_dim = embedding_dim
        self.pos_encoding = self.positional_encoding()

    def get_angles(self, position, i):
        angle_rates = 1 / tf.pow(10000, tf.cast(2 * (i // 2) / self.embedding_dim, dtype=tf.float32))
        return tf.cast(position, dtype=tf.float32) * angle_rates

    def positional_encoding(self):
        angle_rads = self.get_angles(tf.range(self.max_sequence_length)[:, tf.newaxis],
                                     tf.range(self.embedding_dim)[tf.newaxis, :])

        # Apply sine to even indices in the array
        pos_encoding = tf.sin(angle_rads[:, 0::2])

        # Apply cosine to odd indices in the array
        pos_encoding = tf.concat([pos_encoding, tf.cos(angle_rads[:, 1::2])], axis=-1)

        pos_encoding = pos_encoding[tf.newaxis, ...]
        return pos_encoding

    def call(self, inputs):
        inputs = tf.cast(inputs, dtype=tf.float32)
        return inputs + self.pos_encoding[:, :tf.shape(inputs)[1], :]


In [29]:
import pandas as pd
from sklearn.metrics import auc
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
import tensorflow as tf
from keras.layers import Input, MultiHeadAttention, LayerNormalization, Dense, Dropout, Embedding, GlobalAveragePooling1D
import tensorflow_addons as tfa
from keras.models import Model
from keras.optimizers import Adam
from keras.metrics import AUC
from sklearn.model_selection import KFold


# Load the preprocessed dataset
df = pd.read_csv('preprocessed_train.csv')

# Drop rows with NaN values in the 'preprocessed_review' column
df = df.dropna(subset=['preprocessed_review'])

# Convert ratings to one-hot encoded labels
labels = to_categorical(df['rating'] + 1)  # Adding 1 to convert -1, 0, 1 to 0, 1, 2
print(labels, labels[0])
# Tokenize the Arabic text
tokenizer_arabic = Tokenizer()
tokenizer_arabic.fit_on_texts(df['preprocessed_review'])
sequences_arabic = tokenizer_arabic.texts_to_sequences(df['preprocessed_review'])
max_sequence_length = 100  # Set your desired sequence length
padded_sequences_arabic = pad_sequences(sequences_arabic, maxlen=max_sequence_length)

def FullyConnected(embedding_dim, fully_connected_dim):
    return tf.keras.Sequential([
        Dense(fully_connected_dim, activation='relu'),  # (batch_size, seq_len, dff)
        Dense(embedding_dim)  # (batch_size, seq_len, d_model)
    ])

def transformer_classifier(max_sequence_length, vocab_size, num_classes):
    # Input for variable-length sequences of integers
    inputs = Input(shape=(max_sequence_length,))
    
    # Embedding layer
    embedding = Embedding(input_dim=vocab_size, output_dim=128 ,input_length=padded_sequences_arabic.shape[1])(inputs)
    positional_encoding = PositionalEncoding(max_sequence_length, 128)(embedding)

    # Transformer layers - You can use TensorFlow's MultiHeadAttention and Transformer layers
    mha_layer = MultiHeadAttention(num_heads=2, key_dim=128)(positional_encoding, positional_encoding)
    norm1_layer = LayerNormalization(epsilon=1e-6)(mha_layer + positional_encoding)
    ffn_layer = FullyConnected(embedding_dim= 128, fully_connected_dim= 32)(norm1_layer)
    ffn_layer = Dropout(0.2)(ffn_layer)
    norm2_layer = LayerNormalization(epsilon=1e-6)(norm1_layer + ffn_layer)
    transformer_layer = GlobalAveragePooling1D()(norm2_layer)
    
    # Dense layers for classification
    dense = Dense(64, activation='relu')(transformer_layer)
    dropout = Dropout(0.5)(dense)
    outputs = Dense(num_classes, activation='softmax')(dropout)
    
    # Create the model
    model = Model(inputs=inputs, outputs=outputs)
    
    return model

# Example usage:
# Assuming max_sequence_length, vocab_size, and num_classes are defined appropriately
max_sequence_length = 100  
vocab_size = len(tokenizer_arabic.word_index) + 1  
num_classes = 3  

# Assuming padded_sequences_arabic and labels are prepared as before
print("Shapes - Padded Sequences:", padded_sequences_arabic.shape, "Labels:", labels.shape)

# Get the vocabulary size
vocab_size = len(tokenizer_arabic.word_index) + 1  # Adding 1 because of reserved 0 index

kf = KFold(n_splits= 2)
fold_no = 1
for train, test in kf.split(padded_sequences_arabic, labels):
    # Create the transformer model for text classification
    model = transformer_classifier(max_sequence_length, vocab_size, num_classes)

    # Compile the model
    optimizer = Adam(learning_rate=1e-4, clipvalue=0.5)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=[AUC(curve='PR'), 'accuracy'])

# Display the model summary
model.summary()

# Train the model
model.fit(padded_sequences_arabic, labels, epochs=10, batch_size=8, validation_split=0.2)


[[1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 ...
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]] [1. 0. 0.]
Shapes - Padded Sequences: (30897, 100) Labels: (30897, 3)
Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_13 (InputLayer)       [(None, 100)]                0         []                            
                                                                                                  
 embedding_12 (Embedding)    (None, 100, 128)             3003520   ['input_13[0][0]']            
                                                                                                  
 positional_encoding_7 (Pos  (None, 100, 128)             0         ['embedding_12[0][0]']        
 itionalEncoding)                                                                                 
                                           

<keras.src.callbacks.History at 0x21b0e0fd610>

In [30]:
# show the accuracy of the model
loss, accuracy, _ = model.evaluate(padded_sequences_arabic, labels, verbose=False)

print("Training Accuracy: ", accuracy)

Training Accuracy:  0.9590290784835815


In [31]:
import numpy as np
# Load the preprocessed test data
test_df = pd.read_csv('preprocessed_test.csv')

# Drop rows with NaN values in the 'preprocessed_review' column
test_df = test_df.dropna(subset=['preprocessed_review'])

# Tokenize the preprocessed reviews in the test data using the same tokenizer
sequences_test = tokenizer_arabic.texts_to_sequences(test_df['preprocessed_review'])
padded_sequences_test = pad_sequences(sequences_test, maxlen=max_sequence_length)

# Predict using the model on the test data
predictions = model.predict(padded_sequences_test)
predicted_ratings = np.argmax(predictions, axis=1) - 1

# Add predicted ratings as a new column in the test data
test_df['rating'] = predicted_ratings

# Save the test data with predicted ratings as a CSV file
test_df.to_csv('predicted_test_transformer.csv', index=False)




In [32]:
df_lstm = pd.read_csv('predicted_test.csv')
df_transformer = pd.read_csv('predicted_test_transformer.csv')

df = (df_lstm['rating'] == df_transformer['rating'])
print(df.sum())

877
