In [2]:
import pandas as pd
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
import tensorflow as tf
from keras.layers import Input, MultiHeadAttention, LayerNormalization, Dense, Dropout, Embedding, GlobalAveragePooling1D
from keras.models import Model
from keras.optimizers import Adam

# Read the preprocessed reviews, discarding rows whose review text is missing
df = pd.read_csv('preprocessed_train.csv').dropna(subset=['preprocessed_review'])
review_texts = df['preprocessed_review']

# One-hot encode the ratings; the +1 shift moves -1, 0, 1 onto class indices 0, 1, 2
labels = to_categorical(df['rating'] + 1)
print(labels, labels[0])

# Fit the tokenizer on the Arabic reviews, then turn each review into integer ids
tokenizer_arabic = Tokenizer()
tokenizer_arabic.fit_on_texts(review_texts)
sequences_arabic = tokenizer_arabic.texts_to_sequences(review_texts)

# Bring every sequence to one common length so they stack into a 2-D array
max_sequence_length = 100
padded_sequences_arabic = pad_sequences(sequences_arabic, maxlen=max_sequence_length)


def transformer_classifier(
    max_sequence_length: int,
    vocab_size: int,
    num_classes: int,
    *,
    embed_dim: int = 128,
    num_heads: int = 2,
    key_dim: int = 128,
    attention_dropout: float = 0.2,
    hidden_units: int = 64,
    classifier_dropout: float = 0.5,
) -> Model:
    """Build a single-layer self-attention text classifier.

    The network embeds integer token ids, applies one multi-head
    self-attention layer followed by dropout and layer normalization,
    averages over the sequence axis, and classifies with a small dense head.

    Args:
        max_sequence_length: Fixed number of token ids per input sequence.
        vocab_size: Size of the embedding table (number of distinct ids).
        num_classes: Number of output classes (width of the softmax layer).
        embed_dim: Dimensionality of the token embeddings.
        num_heads: Number of attention heads.
        key_dim: Size of each attention head for query and key.
        attention_dropout: Dropout rate applied to the attention output.
        hidden_units: Width of the hidden dense layer in the classifier head.
        classifier_dropout: Dropout rate applied before the output layer.

    Returns:
        An uncompiled Keras ``Model`` mapping ``(batch, max_sequence_length)``
        inputs to ``(batch, num_classes)`` softmax probabilities.
    """
    # Fixed-length sequences of integer token ids
    inputs = Input(shape=(max_sequence_length,))

    # Token embeddings. mask_zero is not set here, so padded positions are
    # embedded like any other id.
    x = Embedding(input_dim=vocab_size, output_dim=embed_dim)(inputs)

    # Self-attention: the embeddings serve as both query and value.
    # NOTE: there is no residual connection or feed-forward sublayer, so this
    # is a lone attention layer rather than a full Transformer encoder block.
    x = MultiHeadAttention(num_heads=num_heads, key_dim=key_dim)(x, x)
    x = Dropout(attention_dropout)(x)
    x = LayerNormalization(epsilon=1e-6)(x)

    # Collapse the sequence axis into one vector per example
    x = GlobalAveragePooling1D()(x)

    # Classification head
    x = Dense(hidden_units, activation='relu')(x)
    x = Dropout(classifier_dropout)(x)
    outputs = Dense(num_classes, activation='softmax')(x)

    return Model(inputs=inputs, outputs=outputs)

# Build, compile and train the classifier on the tokenized reviews prepared above
print("Shapes - Padded Sequences:", padded_sequences_arabic.shape, "Labels:", labels.shape)

# Derive the model dimensions from the prepared data instead of hard-coded
# placeholders, so the network always matches what it is trained on.
max_sequence_length = padded_sequences_arabic.shape[1]
num_classes = labels.shape[1]
vocab_size = len(tokenizer_arabic.word_index) + 1  # Adding 1 because of reserved 0 index

# Create the transformer model for text classification
model = transformer_classifier(max_sequence_length, vocab_size, num_classes)

# One-hot labels pair with categorical cross-entropy
model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])

# Display the model summary
model.summary()

# Train the model.
# NOTE(review): per the Keras docs, validation_split holds out the *last* 20%
# of the rows without shuffling first -- confirm the CSV is not ordered by
# rating, otherwise the validation set will be skewed.
model.fit(padded_sequences_arabic, labels, epochs=5, batch_size=32, validation_split=0.2)

[[1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 ...
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]] [1. 0. 0.]
Shapes - Padded Sequences: (30897, 100) Labels: (30897, 3)
Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, 100)]                0         []                            
                                                                                                  
 embedding_1 (Embedding)     (None, 100, 128)             3003520   ['input_2[0][0]']             
                                                                                                  
 multi_head_attention_1 (Mu  (None, 100, 128)             131968    ['embedding_1[0][0]',         
 ltiHeadAttention)                                                   'embedding_1[0][0]']         
                                           

<keras.src.callbacks.History at 0x7fc5b9fa95a0>