In [None]:
import numpy as np 
import pickle
import pandas as pd 
import tensorflow as tf

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
# from keras.models import Sequential
# from keras.layers import *
from keras.utils.np_utils import to_categorical
from keras.initializers import Constant
import re
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Restrict TensorFlow to the first physical GPU and let it allocate memory on demand.
gpus = tf.config.experimental.list_physical_devices(device_type='GPU')
if gpus:
    tf.config.experimental.set_visible_devices(devices=gpus[0], device_type='GPU')
    tf.config.experimental.set_memory_growth(device=gpus[0], enable=True)
else:
    # Indexing gpus[0] on an empty list would raise IndexError; fall back to CPU instead.
    print('No GPU detected; TensorFlow will run on CPU.')

In [None]:
from huggingface_hub import login

! git config --global credential.helper store
hf_token='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'
login(token=hf_token, add_to_git_credential=True)

In [None]:
# Read 'clean-HARD.csv' into a DataFrame; the bare expression on the last line
# renders the frame as this cell's output.
data = pd.read_csv(filepath_or_buffer='clean-HARD.csv')
data

In [None]:
# Force the text column to str so every entry handed to the tokenizer is a string.
# NOTE(review): missing values presumably become the literal text 'nan' here - confirm
# the CSV has no empty sentences.
data['sentences'] = data['sentences'].astype(str)

In [None]:
sequence_length = 512  # every sequence is padded/truncated to this many tokens
max_features = 20000   # vocabulary cap passed to the tokenizer as num_words
RANDOM_SEED = 42       # fixed so the train/test split is identical on every run

# Fit the tokenizer on the text. Only the space character is used as a separator/filter,
# and words outside the vocabulary are mapped to the '<unw>' token.
tokenizer = Tokenizer(num_words=max_features, split=' ', oov_token='<unw>', filters=' ')
tokenizer.fit_on_texts(data['sentences'].values)

# Replace each word of every sentence with its integer index.
X = tokenizer.texts_to_sequences(data['sentences'].values)

# Pad the sequences so they all have the same length (sequence_length).
X = pad_sequences(X, maxlen=sequence_length)

# One-hot encode the rating column. The dtype is pinned to float32 because the default
# dtype of get_dummies differs between pandas versions (uint8 vs bool).
y = pd.get_dummies(data['rating'], dtype='float32').values

# Hold out 10% of the data for testing (seeded for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=RANDOM_SEED
)

print("test set size " + str(len(X_test)))


In [None]:
# Parse the GloVe text file into a dict: word -> float32 coefficient vector.
# Each non-empty line is split on whitespace; the first field is the word and the
# remaining fields are its coefficients.
embeddings_index = {}
# The encoding is given explicitly so reading Arabic text does not depend on the
# platform default (assumes the vector file is UTF-8 - TODO confirm). The `with`
# block closes the file even if parsing raises.
with open('GloVe-Arabic/vectorsHARD.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line: values[0] would raise IndexError.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# Vocabulary learned by the tokenizer (word -> integer index), plus a quick size report.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % (len(word_index),))

In [None]:
# Number of rows in the embedding matrix: the capped vocabulary size plus one, so that
# index 0 (never assigned by the loop below) keeps an all-zero row.
num_words = min(max_features, len(word_index)) + 1
print(num_words)

embedding_dim = 200

# Dedicated, seeded generator: the random vectors below end up in a frozen
# (trainable=False) embedding layer, so without a seed every run trains a different model.
embedding_rng = np.random.default_rng(42)

# Start from a matrix of zeros; this is our embedding matrix.
embedding_matrix = np.zeros((num_words, embedding_dim))

# For each word in our tokenizer, try to find that word in the GloVe vectors.
for word, i in word_index.items():
    if i > max_features:
        # Index falls outside the matrix; skip it.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # We found the word - copy its pre-trained vector into the matrix.
        embedding_matrix[i] = embedding_vector
    else:
        # No pre-trained vector exists: assign a (reproducible) random vector.
        embedding_matrix[i] = embedding_rng.standard_normal(embedding_dim)

In [None]:
# Bidirectional-LSTM classifier stacked on a frozen, pre-initialised embedding layer.
model = tf.keras.Sequential()

# Embedding weights start from embedding_matrix and are not updated (trainable=False).
model.add(
    tf.keras.layers.Embedding(
        input_dim=num_words,
        output_dim=embedding_dim,
        embeddings_initializer=Constant(embedding_matrix),
        input_length=sequence_length,
        trainable=False,
    )
)
model.add(tf.keras.layers.Dropout(0.3))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(150)))
model.add(tf.keras.layers.Dropout(0.3))
# Output layer: 4 units with a softmax activation.
model.add(tf.keras.layers.Dense(units=4, activation='softmax'))

model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()

In [None]:
# Render the architecture diagram to 'biLSTMhard.png'. The options are collected in a
# dict first; the call stays the last expression so its return value is this cell's output.
plot_model_options = dict(
    to_file="biLSTMhard.png",
    show_shapes=True,
    show_dtype=True,
    show_layer_names=True,
    rankdir="TB",
    expand_nested=True,
    dpi=90,
    layer_range=None,
    show_layer_activations=True,
)
tf.keras.utils.plot_model(model, **plot_model_options)

In [None]:
# Train for 15 epochs; validation_split=0.1 asks fit() to set aside a tenth of the
# training arrays for validation. The returned History object is kept for plotting.
batch_size = 128
history = model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=15,
    validation_split=0.1,
    verbose=1,
)

In [None]:
def plot_history_metric(history, metric):
    """Plot the training and validation curves of one metric.

    Args:
        history: the object returned by ``model.fit``; its ``history`` dict is read
            for the keys ``metric`` and ``'val_' + metric``.
        metric: name of the recorded metric, e.g. ``'loss'`` or ``'accuracy'``.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(history.history[metric])
    ax.plot(history.history['val_' + metric])
    ax.set_title('model ' + metric)
    ax.set_ylabel(metric)
    ax.set_xlabel('epoch')
    ax.legend(['train', 'validation'], loc='upper left')
    plt.show()


# One figure per metric, replacing two copy-pasted plotting stanzas.
for metric in ('loss', 'accuracy'):
    plot_history_metric(history, metric)

In [None]:
# Score the trained model on the held-out test split; the value returned by
# evaluate() is shown as this cell's output.
model.evaluate(
    x=X_test,
    y=y_test,
    batch_size=batch_size,
    verbose=1,
)

In [None]:
# Write the trained model to a local file named "biLSTMhard.h5".
model.save(filepath="biLSTMhard.h5")

In [None]:
# save_pretrained_keras is a module-level function of huggingface_hub, not a method of
# tf.keras models; calling it as model.save_pretrained_keras(...) raises AttributeError.
from huggingface_hub import save_pretrained_keras

# Write the model into the local directory 'biLSTMhard' in the Hub's Keras layout.
save_pretrained_keras(model, 'biLSTMhard')

In [None]:
# Persist the fitted tokenizer so inference can reuse the same word -> index mapping.
with open('tokenizerbiLSTMhard.pickle', mode='wb') as tokenizer_file:
    pickle.dump(
        tokenizer,
        tokenizer_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

In [None]:
from huggingface_hub import push_to_hub_keras

# Publish the trained model to the Hugging Face Hub repository named 'biLSTMhard'.
# Requires the login performed earlier in the notebook.
push_to_hub_keras(
    model,
    'biLSTMhard',
)

In [None]:
from huggingface_hub import from_pretrained_keras

# Download the published model from the Hub and print its layer summary as a sanity check.
hub_repo_id = 'NorahAlshahrani/biLSTMhard'
model_hf = from_pretrained_keras(hub_repo_id)
model_hf.summary()

In [None]:
import pickle
import torch  # no longer used by this cell; kept in case later cells rely on it
import numpy as np
import tensorflow as tf
from keras_preprocessing.sequence import pad_sequences

# loaded_model = tf.keras.models.load_model("biLSTMhard.h5")
# NOTE(security): pickle.load can execute arbitrary code - only load a tokenizer file
# that you produced yourself. The `with` block ensures the file handle is closed.
with open('tokenizerbiLSTMhard.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

# Encode a single example sentence the same way the training data was encoded.
text = "أنا لا أحب قراءة الكتب"
token = tokenizer.texts_to_sequences([text])
token = pad_sequences(token, maxlen=512)

# The model built in this notebook ends in a softmax layer, so predict() is presumed to
# return class probabilities already (verify if model_hf comes from a different
# checkpoint). Applying softmax again, as the previous version did, distorts the
# reported confidence.
outputs = model_hf.predict(token)
probs = np.asarray(outputs)[0]  # single input sentence -> take the first (only) row

# Output index -> sentiment label; indices 0-1 are reported as Negative, 2-3 as Positive.
id2label = {
 0: 'Negative',
 1: 'Negative',
 2: 'Positive',
 3: 'Positive'
}

predClassID = int(np.argmax(probs))
pred = round(float(np.max(probs)) * 100, 2)

print(f"Text: '{text}' \nLabel: {id2label[predClassID]} \nPredication: {pred}%")