In [1]:
from sklearn.datasets import fetch_20newsgroups
from keras.layers import  Dropout, Dense
from keras.models import Sequential
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn import model_selection, naive_bayes, svm




Using TensorFlow backend.


In [2]:
def TFIDF(X_train, X_test,MAX_NB_WORDS=75000):
    """
    TFIDF(X_train, X_test, MAX_NB_WORDS)
    Convert raw documents into dense TF-IDF feature matrices.
    The vectorizer is fitted on X_train only and then applied to X_test,
    so both matrices share the same columns.
    X_train is the collection of training documents
    X_test is the collection of test documents
    MAX_NB_WORDS is passed to the vectorizer as max_features
    Returns the tuple (train_matrix, test_matrix) of dense arrays
    """
    vectorizer_x = TfidfVectorizer(max_features=MAX_NB_WORDS)
    train_matrix = vectorizer_x.fit_transform(X_train).toarray()
    test_matrix = vectorizer_x.transform(X_test).toarray()
    # Read the width straight from the matrix: wrapping it in np.array()
    # would duplicate the whole dense array just to report its shape.
    print("tf-idf with",str(train_matrix.shape[1]),"features")
    return (train_matrix,test_matrix)

# Arabic text classification based on a Deep Neural Network

In [3]:
def Build_Model_DNN_Text(shape, nClasses, dropout=0.5, nNodes=512, nLayers=4):
    """
    Build_Model_DNN_Text(shape, nClasses, dropout, nNodes, nLayers)
    Build and compile a fully connected network for text classification
    shape is the input feature space (number of input columns)
    nClasses is number of classes (width of the softmax output layer)
    dropout is the rate passed to every Dropout layer
    nNodes is the number of units in each Dense layer before the output
    nLayers is the number of Dense+Dropout pairs added after the input pair
    Returns the compiled Sequential model
    """
    model = Sequential()

    # Input block: the only layer that has to be told its input width.
    model.add(Dense(nNodes,input_dim=shape,activation='relu'))
    model.add(Dropout(dropout))

    # Hidden blocks: inside a Sequential model each layer takes its input
    # width from the layer before it, so no input_dim is passed here.
    for _ in range(nLayers):
        model.add(Dense(nNodes,activation='relu'))
        model.add(Dropout(dropout))

    model.add(Dense(nClasses, activation='softmax'))

    # The sparse variant of the loss takes integer class ids as targets,
    # so the labels do not need to be one-hot encoded.
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

# Aljazeera Arabic corpus with 5 classes: 'Art', 'Economic', 'Politics', 'Science', 'Sport'

In [5]:
# Load the corpus; the path is resolved relative to the working directory.
Corpus = pd.read_csv(r"aji-Arabic_corpus.csv")

# Fix the shuffle seed so the 80/20 split -- and therefore the fitted
# vocabulary and every score reported below -- is repeatable across runs.
RANDOM_STATE = 42

# NOTE(review): 'targe' is the label column exactly as this code reads it from
# the CSV -- confirm it is the real header and not a typo for 'target'.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    Corpus['text'], Corpus['targe'], test_size=0.2, random_state=RANDOM_STATE)




In [6]:
# Vectorise both splits; TFIDF fits its vocabulary on the training texts only.
X_train_tfidf,X_test_tfidf = TFIDF(X_train,X_test)

# One network input per TF-IDF column, five output classes.
n_features = X_train_tfidf.shape[1]
model_DNN = Build_Model_DNN_Text(n_features, 5)

# Training settings gathered in one place; the held-out split is passed
# as validation data so its loss/accuracy are logged after every epoch.
fit_options = dict(
    validation_data=(X_test_tfidf, y_test),
    epochs=14,
    batch_size=128,
    verbose=2,
)
model_DNN.fit(X_train_tfidf, y_train, **fit_options)

tf-idf with 45917 features
Train on 1200 samples, validate on 300 samples
Epoch 1/14
 - 19s - loss: 1.6084 - accuracy: 0.2167 - val_loss: 1.5998 - val_accuracy: 0.1933
Epoch 2/14
 - 15s - loss: 1.5774 - accuracy: 0.2225 - val_loss: 1.4649 - val_accuracy: 0.3133
Epoch 3/14
 - 15s - loss: 1.3232 - accuracy: 0.4333 - val_loss: 0.8744 - val_accuracy: 0.7767
Epoch 4/14
 - 15s - loss: 0.5515 - accuracy: 0.8292 - val_loss: 0.2701 - val_accuracy: 0.9100
Epoch 5/14
 - 13s - loss: 0.1694 - accuracy: 0.9467 - val_loss: 0.2870 - val_accuracy: 0.9433
Epoch 6/14
 - 14s - loss: 0.0497 - accuracy: 0.9883 - val_loss: 0.1794 - val_accuracy: 0.9700
Epoch 7/14
 - 32s - loss: 0.0154 - accuracy: 0.9967 - val_loss: 0.2293 - val_accuracy: 0.9567
Epoch 8/14
 - 19s - loss: 0.0140 - accuracy: 0.9967 - val_loss: 0.2060 - val_accuracy: 0.9667
Epoch 9/14
 - 20s - loss: 0.0036 - accuracy: 0.9992 - val_loss: 0.1944 - val_accuracy: 0.9767
Epoch 10/14
 - 13s - loss: 4.3540e-04 - accuracy: 1.0000 - val_loss: 0.2033 - va

<keras.callbacks.callbacks.History at 0x7f80bd422610>

In [7]:
# The network was built with one input per TF-IDF column, so it must be fed
# the full test matrix. The previous hard-coded cut-off (46004) did not match
# the actual feature count and would have truncated the columns -- breaking
# predict() -- whenever the fitted vocabulary came out larger than that.
# The name is kept, now derived from the data, in case later cells read it.
MAX_TEXT_LENGTH = X_test_tfidf.shape[1]

# predict() yields one score per class for each row; argmax picks the class id.
Prediction = np.argmax(model_DNN.predict(X_test_tfidf),axis=1)

print(metrics.classification_report(y_test, Prediction))

              precision    recall  f1-score   support

           0       1.00      0.95      0.98        62
           1       0.98      0.98      0.98        58
           2       0.94      0.97      0.95        65
           3       0.96      0.98      0.97        56
           4       1.00      1.00      1.00        59

    accuracy                           0.98       300
   macro avg       0.98      0.98      0.98       300
weighted avg       0.98      0.98      0.98       300

