In [2]:
from keras.layers import Dropout, Dense, GRU, Embedding
from keras.models import Sequential
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn import metrics
from keras.preprocessing.text import Tokenizer
from keras.utils.data_utils import pad_sequences
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import model_selection, naive_bayes, svm

In [3]:
def loadData_Tokenizer(X_train, X_test, MAX_NB_WORDS=75000, MAX_SEQUENCE_LENGTH=500,
                       glove_path="glove.6B.50d.txt"):
    """Tokenize and pad the train/test texts and load GloVe word vectors.

    Parameters
    ----------
    X_train, X_test : sequence of str
        Raw training and test texts. They are concatenated and tokenized
        together, so both splits share one word index.
    MAX_NB_WORDS : int
        Passed to ``Tokenizer(num_words=...)``.
    MAX_SEQUENCE_LENGTH : int
        Passed to ``pad_sequences(maxlen=...)``; every returned row has this length.
    glove_path : str
        Path of the GloVe text file (one ``word v1 v2 ...`` entry per line).

    Returns
    -------
    tuple
        ``(X_train, X_test, word_index, embeddings_index)`` where the first two
        are the padded integer sequences of each split, ``word_index`` is the
        tokenizer's word -> index mapping and ``embeddings_index`` maps each
        word of the GloVe file to its float32 vector.
    """
    # Seeds NumPy's global RNG; code that runs afterwards and draws from
    # np.random is affected by this call.
    np.random.seed(7)
    n_train = len(X_train)

    # NOTE: the tokenizer is fitted on train + test together.
    text = np.array(np.concatenate((X_train, X_test), axis=0))
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)
    word_index = tokenizer.word_index
    text = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    print('Found %s unique tokens.' % len(word_index))
    print(text.shape)

    # Split the padded matrix back into the original train/test rows.
    X_train = text[:n_train]
    X_test = text[n_train:]

    embeddings_index = {}
    skipped = 0
    with open(glove_path, encoding="utf8") as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line: nothing to parse.
                continue
            try:
                coefs = np.asarray(values[1:], dtype='float32')
            except ValueError:
                # Malformed entry: skip it instead of silently storing the
                # previous word's vector under this word.
                skipped += 1
                continue
            embeddings_index[values[0]] = coefs
    if skipped:
        print('Skipped %s malformed embedding lines.' % skipped)
    print('Total %s word vectors.' % len(embeddings_index))
    return (X_train, X_test, word_index, embeddings_index)

# Violence Classification based on RNN Algorithm

In [10]:
def Build_Model_RNN_Text(word_index, embeddings_index, nclasses,  MAX_SEQUENCE_LENGTH=500, EMBEDDING_DIM=50, dropout=0.5):
    """Build and compile a stacked-GRU text classifier.

    Parameters
    ----------
    word_index : dict
        Word -> integer index mapping; indices are used as rows of the
        embedding matrix, which has ``len(word_index) + 1`` rows.
    embeddings_index : dict
        Word -> pretrained embedding vector. Each vector must have length
        ``EMBEDDING_DIM``.
    nclasses : int
        Number of classes; size of the final softmax layer.
    MAX_SEQUENCE_LENGTH : int
        Length of the input sequences (``input_length`` of the Embedding layer).
    EMBEDDING_DIM : int
        Dimensionality of the embedding vectors.
    dropout : float
        Rate of the Dropout layer placed after every GRU layer.

    Returns
    -------
    The compiled ``Sequential`` model (sparse categorical cross-entropy loss,
    Adam optimizer, accuracy metric).

    Raises
    ------
    ValueError
        If a pretrained vector's length differs from ``EMBEDDING_DIM``.
    """
    hidden_layer = 3  # number of GRU layers that return full sequences
    gru_node = 32     # units per GRU layer

    vocab_size = len(word_index) + 1
    # Rows of words that have no pretrained vector keep this random
    # initialisation (they are NOT zeroed).
    embedding_matrix = np.random.random((vocab_size, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is None:
            continue
        if len(embedding_vector) != EMBEDDING_DIM:
            # Raise instead of calling exit(1), which would terminate the
            # interpreter/kernel rather than report a recoverable error.
            raise ValueError(
                "could not broadcast input array from shape " + str(EMBEDDING_DIM)
                + " into shape " + str(len(embedding_vector))
                + " Please make sure your EMBEDDING_DIM is equal to embedding_vector file ,GloVe,"
            )
        embedding_matrix[i] = embedding_vector

    model = Sequential()
    model.add(Embedding(vocab_size,
                        EMBEDDING_DIM,
                        weights=[embedding_matrix],
                        input_length=MAX_SEQUENCE_LENGTH,
                        trainable=True))

    print(gru_node)
    for _ in range(hidden_layer):
        model.add(GRU(gru_node, return_sequences=True, recurrent_dropout=0.2))
        model.add(Dropout(dropout))
    # Last recurrent layer returns only its final output.
    model.add(GRU(gru_node, recurrent_dropout=0.2))
    model.add(Dropout(dropout))
    model.add(Dense(256, activation='relu'))
    # Output size follows nclasses (previously hard-coded to 5).
    model.add(Dense(nclasses, activation='softmax'))

    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

# Dataset for Violence in Yemen's Social Media

In [5]:
import nltk

# Vocabulary taken from nltk's `words` corpus, used as a membership filter below.
words = set(nltk.corpus.words.words())
Corpus = pd.read_csv(r"Violence in Yemens Social Media.csv", sep=';')

# Rebuild every text from its tokens, keeping a token when it is not purely
# alphabetic or when its lowercase form is in `words`.
list_term = [
    " ".join(
        token
        for token in nltk.wordpunct_tokenize(document)
        if not token.isalpha() or token.lower() in words
    )
    for document in Corpus.Text
]

# Replace the corpus with a two-column frame: filtered text + original topic label.
dic = {'Text': list_term, 'Dominant_Topic': Corpus.Dominant_Topic}
Corpus = pd.DataFrame(dic)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [7]:

# Hold out 20% of the corpus for evaluation. random_state is fixed so that the
# split (and every metric computed from it) is reproducible across re-runs.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    Corpus['Text'],
    Corpus['Dominant_Topic'],
    test_size=0.2,
    random_state=7,
)

In [8]:
# Convert both splits to padded index sequences and load the GloVe vectors.
(
    X_train_Glove,
    X_test_Glove,
    word_index,
    embeddings_index,
) = loadData_Tokenizer(X_train, X_test)

Found 7163 unique tokens.
(77062, 500)
Total 400000 word vectors.


In [12]:
# Build the GRU classifier for 5 classes and sequences of length 500, then
# train it while monitoring loss/accuracy on the held-out split.
model_RNN = Build_Model_RNN_Text(
    word_index,
    embeddings_index,
    nclasses=5,
    MAX_SEQUENCE_LENGTH=500,
)
model_RNN.fit(
    X_train_Glove,
    y_train,
    validation_data=(X_test_Glove, y_test),
    epochs=15,
    batch_size=128,
    verbose=2,
)

32
Epoch 1/15
482/482 - 823s - loss: 0.7313 - accuracy: 0.7500 - val_loss: 0.3003 - val_accuracy: 0.9107 - 823s/epoch - 2s/step
Epoch 2/15
482/482 - 767s - loss: 0.2833 - accuracy: 0.9217 - val_loss: 0.2169 - val_accuracy: 0.9391 - 767s/epoch - 2s/step
Epoch 3/15
482/482 - 840s - loss: 0.2040 - accuracy: 0.9446 - val_loss: 0.1989 - val_accuracy: 0.9485 - 840s/epoch - 2s/step
Epoch 4/15
482/482 - 1647s - loss: 0.1640 - accuracy: 0.9558 - val_loss: 0.1685 - val_accuracy: 0.9579 - 1647s/epoch - 3s/step
Epoch 5/15
482/482 - 1012s - loss: 0.1404 - accuracy: 0.9635 - val_loss: 0.1658 - val_accuracy: 0.9613 - 1012s/epoch - 2s/step
Epoch 6/15
482/482 - 736s - loss: 0.1234 - accuracy: 0.9676 - val_loss: 0.1532 - val_accuracy: 0.9636 - 736s/epoch - 2s/step
Epoch 7/15
482/482 - 1071s - loss: 0.1106 - accuracy: 0.9709 - val_loss: 0.1697 - val_accuracy: 0.9632 - 1071s/epoch - 2s/step
Epoch 8/15
482/482 - 1206s - loss: 0.1011 - accuracy: 0.9740 - val_loss: 0.1640 - val_accuracy: 0.9661 - 1206s/epoch

<keras.callbacks.History at 0x7f88bc469c10>

In [13]:
# Ten terms listed for each topic.
# NOTE(review): the numbering skips 2 (no term_topic2) — confirm this is intentional.
term_topic0 = [
    'violence', 'organization', 'government', 'practice', 'objective',
    'social', 'religious', 'state', 'hundred', 'unlawful',
]
term_topic1 = [
    'terrorist', 'militia', 'people', 'yemeni', 'houthi',
    'year', 'eman', 'girl', 'village', 'support',
]
term_topic3 = [
    'mine', 'international', 'home', 'plant', 'city',
    'displace', 'terror', 'blow', 'threat', 'face',
]
term_topic4 = [
    'group', 'crime', 'life', 'take', 'racist',
    'believe', 'call', 'control', 'send', 'carry',
]
term_topic5 = [
    'child', 'kill', 'recruit', 'political', 'houthis',
    'civilian', 'war', 'woman', 'force', 'taiz',
]

In [14]:
# Human-readable label for each topic (used as class names in the report).
# NOTE(review): the numbering skips 2 (no Topic2) — confirm this is intentional.
Topic0 = 'Religious organization violence'
Topic1 = 'Houthi militias terrorize Yemeni people'
Topic3 = 'international peace threats'
Topic4 = 'Racist Tendencies'
Topic5 = 'Recruitment and killing of children'

In [15]:
# Take, for each row of the model's output, the index of the largest score
# as the predicted class.
predicted = model_RNN.predict(X_test_Glove).argmax(axis=1)



In [16]:
from sklearn.metrics import classification_report

# Display names for the classes, in the order passed to the report.
target_names = [Topic0, Topic1, Topic3, Topic4, Topic5]

# Per-class precision / recall / F1 on the held-out split, two decimals.
report = classification_report(
    y_test,
    predicted,
    target_names=target_names,
    digits=2,
)
print(report)

                                         precision    recall  f1-score   support

        Religious organization violence       0.97      0.96      0.97      1288
Houthi militias terrorize Yemeni people       0.98      0.98      0.98      8455
            international peace threats       0.94      0.96      0.95      1110
                      Racist Tendencies       0.97      0.96      0.97      2028
    Recruitment and killing of children       0.95      0.95      0.95      2532

                               accuracy                           0.97     15413
                              macro avg       0.96      0.96      0.96     15413
                           weighted avg       0.97      0.97      0.97     15413

