In [1]:
from keras.layers import Dropout, Dense, GRU, Embedding,LSTM
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import GRU
from keras.layers import Conv1D, MaxPooling1D
from keras.datasets import imdb
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import model_selection, naive_bayes, svm
from sklearn import metrics
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


# Convert text to word embeddings (using the pre-trained 50-dimensional vectors in `cc.ar.50.vec`):

In [2]:
def loadData_Tokenizer(X_train, X_test, MAX_NB_WORDS=75000, MAX_SEQUENCE_LENGTH=500,
                       embedding_path="cc.ar.50.vec"):
    """Tokenize and pad the train/test texts and load pre-trained word vectors.

    The tokenizer is fitted on the concatenation of ``X_train`` and ``X_test``
    (so the vocabulary also contains words that only occur in the test split),
    every text is converted to a sequence of word indices and padded/truncated
    to ``MAX_SEQUENCE_LENGTH``.

    Parameters
    ----------
    X_train, X_test : sequence of str
        Raw texts of the training and test split.
    MAX_NB_WORDS : int
        Passed to ``Tokenizer(num_words=...)``.
    MAX_SEQUENCE_LENGTH : int
        Passed to ``pad_sequences(maxlen=...)``.
    embedding_path : str
        Text file with one ``word v1 v2 ... vN`` entry per line. Defaults to the
        file the notebook originally hard-coded.

    Returns
    -------
    tuple
        ``(X_train, X_test, word_index, embeddings_index)`` where the first two
        items are the padded index matrices of each split, ``word_index`` is the
        tokenizer's word -> index mapping and ``embeddings_index`` maps each word
        of the embedding file to its ``float32`` vector.
    """
    # Kept from the original: seeds NumPy's global RNG for code that runs afterwards.
    np.random.seed(7)

    # Remember the split point before the inputs are replaced by padded matrices.
    n_train = len(X_train)
    text = np.concatenate((X_train, X_test), axis=0)

    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)
    word_index = tokenizer.word_index
    text = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    print('Found %s unique tokens.' % len(word_index))
    print(text.shape)

    X_train = text[:n_train]
    X_test = text[n_train:]

    embeddings_index = {}
    # `with` guarantees the file is closed even if parsing raises.
    with open(embedding_path, encoding="utf8") as f:
        for line_no, line in enumerate(f):
            values = line.split()
            if len(values) < 2:
                # Blank line or a word without any components: nothing to store.
                continue
            if (line_no == 0 and len(values) == 2
                    and values[0].isdigit() and values[1].isdigit()):
                # "<vocab size> <dim>" header of the word2vec/fastText text format;
                # it is metadata, not a word vector.
                continue
            try:
                coefs = np.asarray(values[1:], dtype='float32')
            except ValueError:
                # Malformed entry (e.g. a token containing whitespace). Skip it
                # instead of silently reusing the previous word's vector.
                continue
            embeddings_index[values[0]] = coefs
    print('Total %s word vectors.' % len(embeddings_index))
    return (X_train, X_test, word_index, embeddings_index)

# Arabic text classification with an RCNN (Conv1D + LSTM) model

In [3]:
def _build_embedding_matrix(word_index, embeddings_index, embedding_dim):
    """Return the initial embedding weights, one row per word index (row 0 unused by words)."""
    # Rows start as uniform random values; rows of words that have a pre-trained
    # vector are overwritten below, all other rows keep their random values.
    embedding_matrix = np.random.random((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is None:
            continue
        if len(embedding_vector) != embedding_dim:
            # Raise instead of calling exit(): exit() would terminate the whole
            # kernel/process from inside a helper function.
            raise ValueError(
                "Embedding vector for word %r has length %d but EMBEDDING_DIM is %d. "
                "Please make sure your EMBEDDING_DIM is equal to the dimension of "
                "the embedding vector file."
                % (word, len(embedding_vector), embedding_dim))
        embedding_matrix[i] = embedding_vector
    return embedding_matrix


def Build_Model_RCNN_Text(word_index, embeddings_index, nclasses, MAX_SEQUENCE_LENGTH=500, EMBEDDING_DIM=50):
    """Build and compile the recurrent-convolutional text classifier.

    Architecture: trainable Embedding (initialised from ``embeddings_index``)
    -> Dropout(0.25) -> 4 x [Conv1D(256, 2, relu) + MaxPooling1D(2)]
    -> 4 x LSTM(256, recurrent_dropout=0.2) -> Dense(512, relu)
    -> Dense(nclasses) + softmax.

    Parameters
    ----------
    word_index : dict
        Word -> integer index mapping; the embedding has ``len(word_index) + 1`` rows.
    embeddings_index : dict
        Word -> pre-trained vector mapping.
    nclasses : int
        Number of output classes.
    MAX_SEQUENCE_LENGTH : int
        Length of the padded input sequences.
    EMBEDDING_DIM : int
        Dimension of the embedding vectors.

    Returns
    -------
    The compiled ``Sequential`` model (sparse categorical cross-entropy loss,
    Adam optimizer, accuracy metric).

    Raises
    ------
    ValueError
        If a pre-trained vector's length differs from ``EMBEDDING_DIM``.
    """
    kernel_size = 2
    filters = 256
    pool_size = 2
    lstm_units = 256
    n_conv_blocks = 4
    n_stacked_lstm = 3  # LSTM layers that return full sequences, before the final one

    embedding_matrix = _build_embedding_matrix(word_index, embeddings_index, EMBEDDING_DIM)

    model = Sequential()
    model.add(Embedding(len(word_index) + 1,
                        EMBEDDING_DIM,
                        weights=[embedding_matrix],
                        input_length=MAX_SEQUENCE_LENGTH,
                        trainable=True))
    model.add(Dropout(0.25))

    # Convolution + pooling blocks: each one roughly halves the sequence length.
    for _ in range(n_conv_blocks):
        model.add(Conv1D(filters, kernel_size, activation='relu'))
        model.add(MaxPooling1D(pool_size=pool_size))

    # Stacked LSTMs; only the last one collapses the sequence to a single vector.
    for _ in range(n_stacked_lstm):
        model.add(LSTM(lstm_units, return_sequences=True, recurrent_dropout=0.2))
    model.add(LSTM(lstm_units, recurrent_dropout=0.2))

    model.add(Dense(512, activation='relu'))
    model.add(Dense(nclasses))
    model.add(Activation('softmax'))

    # Sparse loss: labels are expected as integer class ids, not one-hot vectors.
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [4]:
# Read the corpus CSV (expected in the current working directory) into a DataFrame.
corpus_path = r"aji-Arabic_corpus.csv"
Corpus = pd.read_csv(corpus_path)





In [5]:
# Hold out 20% of the corpus for evaluation. A fixed random_state makes the split,
# and therefore every metric reported below, reproducible across kernel restarts.
# NOTE(review): the label column is spelled 'targe' here -- confirm it matches the CSV header.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    Corpus['text'], Corpus['targe'], test_size=0.2, random_state=7)


In [6]:
# Turn both splits into padded index matrices and load the pre-trained word vectors.
(X_train_Glove,
 X_test_Glove,
 word_index,
 embeddings_index) = loadData_Tokenizer(X_train, X_test)



Found 54193 unique tokens.
(1500, 500)
Total 1999990 word vectors.


In [8]:
# Build the 5-class model and show its layer summary.
model_RCNN = Build_Model_RCNN_Text(word_index, embeddings_index, 5)
model_RCNN.summary()

# Train for 15 epochs; the held-out split doubles as the validation set.
model_RCNN.fit(
    X_train_Glove,
    y_train,
    validation_data=(X_test_Glove, y_test),
    epochs=15,
    batch_size=128,
    verbose=2,
)

# Pick the most probable class per sample and report per-class metrics.
predicted = np.argmax(model_RCNN.predict(X_test_Glove), axis=1)
print(metrics.classification_report(y_test, predicted))

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 500, 50)           2709700   
_________________________________________________________________
dropout_2 (Dropout)          (None, 500, 50)           0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 499, 256)          25856     
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 249, 256)          0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 248, 256)          131328    
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 124, 256)          0         
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 123, 256)         

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1200 samples, validate on 300 samples
Epoch 1/15
 - 111s - loss: 1.6106 - accuracy: 0.2058 - val_loss: 1.6136 - val_accuracy: 0.2333
Epoch 2/15
 - 59s - loss: 1.5923 - accuracy: 0.2475 - val_loss: 1.5221 - val_accuracy: 0.2867
Epoch 3/15
 - 65s - loss: 1.3526 - accuracy: 0.3625 - val_loss: 1.1222 - val_accuracy: 0.4300
Epoch 4/15
 - 69s - loss: 1.2235 - accuracy: 0.4375 - val_loss: 1.1205 - val_accuracy: 0.4633
Epoch 5/15
 - 60s - loss: 0.9173 - accuracy: 0.5483 - val_loss: 0.8999 - val_accuracy: 0.5433
Epoch 6/15
 - 61s - loss: 0.8031 - accuracy: 0.5600 - val_loss: 0.9647 - val_accuracy: 0.5100
Epoch 7/15
 - 56s - loss: 0.6423 - accuracy: 0.5683 - val_loss: 0.6944 - val_accuracy: 0.6067
Epoch 8/15
 - 56s - loss: 0.5825 - accuracy: 0.6425 - val_loss: 0.7817 - val_accuracy: 0.6033
Epoch 9/15
 - 72s - loss: 0.5530 - accuracy: 0.6708 - val_loss: 0.7706 - val_accuracy: 0.6600
Epoch 10/15
 - 62s - loss: 0.4527 - accuracy: 0.7542 - val_loss: 1.0140 - val_accuracy: 0.6300
Epoch 11/15

# CoreNLP, spaCy, Flair, huggingface

In [None]:
# Rebuild a fresh 5-class model (new weights) and show its layer summary.
model_RCNN = Build_Model_RCNN_Text(word_index, embeddings_index, 5)
model_RCNN.summary()

# Same training setup as before, but for 30 epochs.
model_RCNN.fit(
    X_train_Glove,
    y_train,
    validation_data=(X_test_Glove, y_test),
    epochs=30,
    batch_size=128,
    verbose=2,
)

# Pick the most probable class per sample and report per-class metrics.
predicted = np.argmax(model_RCNN.predict(X_test_Glove), axis=1)
print(metrics.classification_report(y_test, predicted))

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 500, 50)           2709700   
_________________________________________________________________
dropout_3 (Dropout)          (None, 500, 50)           0         
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 499, 256)          25856     
_________________________________________________________________
max_pooling1d_9 (MaxPooling1 (None, 249, 256)          0         
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 248, 256)          131328    
_________________________________________________________________
max_pooling1d_10 (MaxPooling (None, 124, 256)          0         
_________________________________________________________________
conv1d_11 (Conv1D)           (None, 123, 256)         

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1200 samples, validate on 300 samples
Epoch 1/30
 - 81s - loss: 1.6157 - accuracy: 0.1950 - val_loss: 1.6107 - val_accuracy: 0.1800
Epoch 2/30
 - 62s - loss: 1.6074 - accuracy: 0.2092 - val_loss: 1.5848 - val_accuracy: 0.3067
Epoch 3/30
 - 56s - loss: 1.4853 - accuracy: 0.3092 - val_loss: 1.2407 - val_accuracy: 0.4000
Epoch 4/30
 - 66s - loss: 1.2244 - accuracy: 0.4092 - val_loss: 1.1685 - val_accuracy: 0.4167
Epoch 5/30
 - 57s - loss: 1.0457 - accuracy: 0.4892 - val_loss: 1.3372 - val_accuracy: 0.4167
Epoch 6/30
 - 58s - loss: 0.9818 - accuracy: 0.5533 - val_loss: 0.7348 - val_accuracy: 0.6233
Epoch 7/30
 - 59s - loss: 0.6671 - accuracy: 0.6283 - val_loss: 0.6426 - val_accuracy: 0.6900
Epoch 8/30
 - 57s - loss: 0.4231 - accuracy: 0.8125 - val_loss: 0.6118 - val_accuracy: 0.7567
Epoch 9/30
 - 507s - loss: 0.2280 - accuracy: 0.9125 - val_loss: 0.7626 - val_accuracy: 0.7933
Epoch 10/30
 - 69s - loss: 0.2247 - accuracy: 0.9125 - val_loss: 0.7276 - val_accuracy: 0.7833
Epoch 11/30