In [1]:
from keras.layers import Dropout, Dense, GRU, Embedding,LSTM
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import GRU
from keras.layers import Conv1D, MaxPooling1D
from keras.datasets import imdb
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import model_selection, naive_bayes, svm
from sklearn import metrics
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


# Convert text to word embeddings (using the pre-trained 50-dimensional vectors in `cc.ar.50.vec`):

In [2]:
def _load_embeddings(path):
    """Read a whitespace-separated text embedding file into a dict.

    Each usable line has the form ``word v1 v2 ... vN``.  Lines that cannot
    be parsed are skipped instead of being stored with a stale vector.

    Args:
        path: Path of the UTF-8 encoded vector file.

    Returns:
        dict mapping word (str) -> 1-D float32 numpy array.
    """
    embeddings_index = {}
    # The context manager guarantees the file is closed even if parsing fails.
    with open(path, encoding="utf8") as f:
        for line_number, line in enumerate(f):
            values = line.split()
            if len(values) < 2:
                # Blank line or a word without any coefficients.
                continue
            if (line_number == 0 and len(values) == 2
                    and values[0].isdigit() and values[1].isdigit()):
                # Looks like a "<vocab_size> <dim>" header (word2vec/fastText
                # text format); it is not a word vector.
                continue
            try:
                coefs = np.asarray(values[1:], dtype='float32')
            except ValueError:
                # str.split() also splits on whitespace embedded in a token,
                # which leaves non-numeric fields; drop such lines.
                continue
            embeddings_index[values[0]] = coefs
    return embeddings_index


def loadData_Tokenizer(X_train, X_test, MAX_NB_WORDS=75000, MAX_SEQUENCE_LENGTH=500,
                       embeddings_path="cc.ar.50.vec"):
    """Tokenize and pad the train/test texts and load pre-trained word vectors.

    The tokenizer is fitted on the concatenation of ``X_train`` and ``X_test``
    so both splits share one vocabulary.

    Args:
        X_train: Sequence of training texts.
        X_test: Sequence of test texts.
        MAX_NB_WORDS: ``num_words`` passed to the Keras ``Tokenizer``.
        MAX_SEQUENCE_LENGTH: ``maxlen`` passed to ``pad_sequences``.
        embeddings_path: Path of the text embedding file to load.

    Returns:
        Tuple ``(X_train, X_test, word_index, embeddings_index)`` where the
        first two items are the padded integer sequences of each split,
        ``word_index`` is the tokenizer's word -> index mapping and
        ``embeddings_index`` maps word -> float32 vector.
    """
    # Seeds numpy's global RNG (kept from the original; later numpy random
    # calls in the same process are affected by it).
    np.random.seed(7)

    n_train = len(X_train)
    text = np.concatenate((X_train, X_test), axis=0)

    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)
    word_index = tokenizer.word_index
    text = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    print('Found %s unique tokens.' % len(word_index))
    print(text.shape)

    # Rows keep their original order, so the first n_train rows are the
    # training texts and the remainder are the test texts.
    X_train = text[:n_train]
    X_test = text[n_train:]

    embeddings_index = _load_embeddings(embeddings_path)
    print('Total %s word vectors.' % len(embeddings_index))
    return (X_train, X_test, word_index, embeddings_index)

# RCNN-based Arabic text classification

In [3]:
def Build_Model_RCNN_Text(word_index, embeddings_index, nclasses, MAX_SEQUENCE_LENGTH=500, EMBEDDING_DIM=50):
    """Build and compile the convolutional + recurrent text classifier.

    Architecture (in order): trainable Embedding initialised from the
    pre-trained vectors, Dropout(0.25), four Conv1D(256, 2, relu) +
    MaxPooling1D(2) stages, four LSTM(256, recurrent_dropout=0.2) layers
    (only the last one collapses the sequence), Dense(512, relu),
    Dense(nclasses) and a softmax activation.

    Args:
        word_index: Mapping word -> integer index (indices used as rows of
            the embedding matrix).
        embeddings_index: Mapping word -> pre-trained vector.
        nclasses: Number of output classes.
        MAX_SEQUENCE_LENGTH: Length of the padded input sequences.
        EMBEDDING_DIM: Dimensionality of the embedding vectors.

    Returns:
        The compiled Keras ``Sequential`` model (sparse categorical
        cross-entropy loss, Adam optimizer, accuracy metric).

    Raises:
        ValueError: If a pre-trained vector's length differs from
            ``EMBEDDING_DIM``.
    """
    kernel_size = 2
    filters = 256
    pool_size = 2
    recurrent_units = 256
    n_conv_blocks = 4
    n_sequence_lstm = 3

    vocab_size = len(word_index) + 1
    # Rows start random; words missing from embeddings_index keep their
    # random initialisation (they are NOT zeroed).
    embedding_matrix = np.random.random((vocab_size, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is None:
            continue
        if len(embedding_vector) != EMBEDDING_DIM:
            # Raise instead of exit(1): terminating the interpreter/kernel
            # from library code is not recoverable by the caller.
            raise ValueError(
                "Embedding vector for word %r has length %d but EMBEDDING_DIM is %d; "
                "make sure EMBEDDING_DIM matches the embedding file."
                % (word, len(embedding_vector), EMBEDDING_DIM))
        embedding_matrix[i] = embedding_vector

    model = Sequential()
    model.add(Embedding(vocab_size,
                        EMBEDDING_DIM,
                        weights=[embedding_matrix],
                        input_length=MAX_SEQUENCE_LENGTH,
                        trainable=True))
    model.add(Dropout(0.25))

    # Each stage is a width-2 convolution followed by pooling over 2 steps.
    for _ in range(n_conv_blocks):
        model.add(Conv1D(filters, kernel_size, activation='relu'))
        model.add(MaxPooling1D(pool_size=pool_size))

    # Stacked LSTMs: all but the last must return full sequences so the next
    # recurrent layer receives a 3-D input.
    for _ in range(n_sequence_lstm):
        model.add(LSTM(recurrent_units, return_sequences=True, recurrent_dropout=0.2))
    model.add(LSTM(recurrent_units, recurrent_dropout=0.2))

    model.add(Dense(512, activation='relu'))
    model.add(Dense(nclasses))
    model.add(Activation('softmax'))

    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [4]:
# Load the labelled corpus; the split cell below reads its 'text' and 'targe' columns.
corpus_csv = "aji-Arabic_corpus.csv"
Corpus = pd.read_csv(corpus_csv)





In [5]:
# Hold out 20% of the corpus for evaluation. random_state is fixed so the
# split (and everything downstream) is reproducible on Restart & Run All.
# NOTE(review): the label column is spelled 'targe' — presumably this matches
# the CSV header; confirm it is not a typo for 'target'.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    Corpus['text'], Corpus['targe'], test_size=0.2, random_state=7)


In [6]:
# Tokenize/pad both splits with a shared vocabulary and load the pre-trained vectors.
tokenized = loadData_Tokenizer(X_train, X_test)
X_train_Glove, X_test_Glove, word_index, embeddings_index = tokenized



Found 54193 unique tokens.
(1500, 500)
Total 1999990 word vectors.


In [9]:
# Display names for the report.
# assumes the integer labels 0..4 follow this order — TODO confirm against the corpus
target_names = ['Art', 'Economic', 'Politics', 'Science', 'Sport']

# One output unit per class name.
model_RCNN = Build_Model_RCNN_Text(word_index, embeddings_index, len(target_names))
model_RCNN.summary()

# Train; the held-out split is used for per-epoch validation metrics.
model_RCNN.fit(
    X_train_Glove, y_train,
    validation_data=(X_test_Glove, y_test),
    epochs=30,
    batch_size=128,
    verbose=2,
)

# Turn per-class probabilities into the index of the most likely class.
predicted = np.argmax(model_RCNN.predict(X_test_Glove), axis=1)

report = metrics.classification_report(
    y_test, predicted, target_names=target_names, digits=4)
print(report)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 500, 50)           2709700   
_________________________________________________________________
dropout_2 (Dropout)          (None, 500, 50)           0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 499, 256)          25856     
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 249, 256)          0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 248, 256)          131328    
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 124, 256)          0         
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 123, 256)         

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1200 samples, validate on 300 samples
Epoch 1/30
 - 69s - loss: 1.6120 - accuracy: 0.2042 - val_loss: 1.6036 - val_accuracy: 0.2200
Epoch 2/30
 - 50s - loss: 1.5878 - accuracy: 0.2567 - val_loss: 1.5787 - val_accuracy: 0.2400
Epoch 3/30
 - 49s - loss: 1.4506 - accuracy: 0.3467 - val_loss: 1.2496 - val_accuracy: 0.3667
Epoch 4/30
 - 50s - loss: 1.1001 - accuracy: 0.4625 - val_loss: 1.1508 - val_accuracy: 0.4167
Epoch 5/30
 - 50s - loss: 0.9199 - accuracy: 0.5442 - val_loss: 0.8814 - val_accuracy: 0.5600
Epoch 6/30
 - 50s - loss: 0.8560 - accuracy: 0.6125 - val_loss: 0.8869 - val_accuracy: 0.5667
Epoch 7/30
 - 49s - loss: 0.7412 - accuracy: 0.6417 - val_loss: 0.8345 - val_accuracy: 0.5467
Epoch 8/30
 - 49s - loss: 0.6064 - accuracy: 0.7000 - val_loss: 0.7320 - val_accuracy: 0.6033
Epoch 9/30
 - 49s - loss: 0.5393 - accuracy: 0.7375 - val_loss: 0.6703 - val_accuracy: 0.6900
Epoch 10/30
 - 48s - loss: 0.3958 - accuracy: 0.8033 - val_loss: 0.5519 - val_accuracy: 0.7867
Epoch 11/30
