In [4]:
from sklearn.datasets import fetch_20newsgroups
from keras.layers import  Dropout, Dense
from keras.models import Sequential
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn import metrics

In [5]:
# Fetch the predefined train/test splits of the 20 Newsgroups corpus.
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

# Raw documents (.data) and their labels (.target), paired per split.
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target

In [6]:
def TFIDF(X_train, X_test,MAX_NB_WORDS=75000):
    """
    TFIDF(X_train, X_test, MAX_NB_WORDS)
    Turn raw documents into dense TF-IDF feature matrices.

    The vectorizer is fitted on X_train only; X_test is transformed with
    that same fitted vectorizer, so both results share one feature space.

    X_train, X_test are collections of raw text documents
    MAX_NB_WORDS is passed to TfidfVectorizer as max_features

    Returns the tuple (X_train, X_test) of dense arrays. Both matrices are
    densified with .toarray(), so memory use grows with
    number_of_documents * number_of_features.
    """
    vectorizer_x = TfidfVectorizer(max_features=MAX_NB_WORDS)
    X_train = vectorizer_x.fit_transform(X_train).toarray()
    X_test = vectorizer_x.transform(X_test).toarray()
    # Read the width straight from the array: wrapping it in np.array()
    # would copy the whole dense matrix just to look at its shape.
    print("tf-idf with",X_train.shape[1],"features")
    return (X_train,X_test)


# Vectorize both splits, leaving MAX_NB_WORDS at its default.
X_train_tfidf, X_test_tfidf = TFIDF(X_train=X_train, X_test=X_test)

tf-idf with 75000 features


In [7]:
def Build_Model_DNN_Text(shape, nClasses, dropout=0.5, node=512, nLayers=4):
    """
    Build_Model_DNN_Text(shape, nClasses, dropout, node, nLayers)
    Build a deep neural network model for text classification.
    shape is the input feature space (number of input features)
    nClasses is the number of classes (units of the softmax output layer)
    dropout is the rate given to the Dropout layer after every hidden Dense layer
    node is the number of units in every hidden Dense layer
    nLayers is the number of hidden Dense+Dropout blocks stacked after the
    input block, so the network has nLayers + 1 hidden Dense layers in total

    Returns the Sequential model, compiled with the adam optimizer,
    sparse_categorical_crossentropy loss and the accuracy metric.
    """
    model = Sequential()
    # Input block: the only layer that has to be told the input width.
    model.add(Dense(node,input_dim=shape,activation='relu'))
    model.add(Dropout(dropout))
    # Every later layer takes its input size from the previous layer's output,
    # so no input_dim is passed here.
    for _ in range(nLayers):
        model.add(Dense(node,activation='relu'))
        model.add(Dropout(dropout))
    model.add(Dense(nClasses, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model


# Size the output layer from the dataset's own label names instead of a
# hard-coded class count, so the model follows whatever labels were loaded.
model_DNN = Build_Model_DNN_Text(X_train_tfidf.shape[1], len(newsgroups_train.target_names))

In [8]:
# Print the layer-by-layer table (layer type, output shape, parameter count).
model_DNN.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 512)               38400512  
                                                                 
 dropout (Dropout)           (None, 512)               0         
                                                                 
 dense_1 (Dense)             (None, 512)               262656    
                                                                 
 dropout_1 (Dropout)         (None, 512)               0         
                                                                 
 dense_2 (Dense)             (None, 512)               262656    
                                                                 
 dropout_2 (Dropout)         (None, 512)               0         
                                                                 
 dense_3 (Dense)             (None, 512)               2

In [9]:
# Train for 10 epochs in mini-batches of 128; verbose=2 logs one line per epoch.
# The test split is passed as validation data, so val_* figures are test-set metrics.
model_DNN.fit(
    x=X_train_tfidf,
    y=y_train,
    batch_size=128,
    epochs=10,
    verbose=2,
    validation_data=(X_test_tfidf, y_test),
)

Epoch 1/10
89/89 - 39s - loss: 2.7524 - accuracy: 0.1137 - val_loss: 1.8437 - val_accuracy: 0.3614 - 39s/epoch - 443ms/step
Epoch 2/10
89/89 - 35s - loss: 1.4153 - accuracy: 0.4815 - val_loss: 1.0858 - val_accuracy: 0.6322 - 35s/epoch - 388ms/step
Epoch 3/10
89/89 - 35s - loss: 0.6719 - accuracy: 0.7561 - val_loss: 0.8082 - val_accuracy: 0.7681 - 35s/epoch - 390ms/step
Epoch 4/10
89/89 - 35s - loss: 0.3183 - accuracy: 0.8958 - val_loss: 0.8342 - val_accuracy: 0.7858 - 35s/epoch - 392ms/step
Epoch 5/10
89/89 - 35s - loss: 0.1588 - accuracy: 0.9495 - val_loss: 0.8589 - val_accuracy: 0.7995 - 35s/epoch - 390ms/step
Epoch 6/10
89/89 - 35s - loss: 0.1225 - accuracy: 0.9648 - val_loss: 0.9616 - val_accuracy: 0.7851 - 35s/epoch - 390ms/step
Epoch 7/10
89/89 - 35s - loss: 0.0888 - accuracy: 0.9739 - val_loss: 0.9427 - val_accuracy: 0.8008 - 35s/epoch - 391ms/step
Epoch 8/10
89/89 - 35s - loss: 0.0616 - accuracy: 0.9831 - val_loss: 1.0256 - val_accuracy: 0.7953 - 35s/epoch - 393ms/step
Epoch 9/

<keras.callbacks.History at 0x7f467d508910>

In [10]:
# Take the highest-scoring class per test document, then print the
# per-class precision / recall / f1 report against the true labels.
predicted = np.argmax(model_DNN.predict(X_test_tfidf), axis=1)
print(metrics.classification_report(y_true=y_test, y_pred=predicted))

              precision    recall  f1-score   support

           0       0.79      0.76      0.77       319
           1       0.61      0.74      0.67       389
           2       0.70      0.68      0.69       394
           3       0.53      0.83      0.64       392
           4       0.83      0.70      0.76       385
           5       0.82      0.74      0.78       395
           6       0.75      0.85      0.80       390
           7       0.89      0.84      0.86       396
           8       0.95      0.93      0.94       398
           9       0.98      0.82      0.90       397
          10       0.98      0.95      0.96       399
          11       0.94      0.89      0.91       396
          12       0.71      0.67      0.69       393
          13       0.88      0.73      0.80       396
          14       0.90      0.88      0.89       394
          15       0.78      0.91      0.84       398
          16       0.73      0.88      0.80       364
          17       0.99    