In [37]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas as pd, xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

In [38]:
import pandas as pd
import numpy as np
import pickle
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

In [91]:
def relabel(row):
    if row['event_type'].strip() == "Riots" :
        return 0
    if row['event_type'].strip() == "Protests" :
        return 1
    if row['event_type'].strip() == "Violence against civilians":
        return 2
    return 3

In [92]:
prot = pd.read_csv("dataset/protest.csv")
riot = pd.read_csv("dataset/riots.csv")
others = pd.read_csv("dataset/others.csv")
df = pd.read_csv("dataset/violence.csv")
df = df.append(prot, ignore_index = True)
df = df.append(riot, ignore_index = True)
df = df.append(others, ignore_index = True)
df['label'] = df.apply (lambda row: relabel(row), axis=1)
columns = ['notes', 'label']
df = df[columns]
df.columns = ['notes', 'label']
df = df.reindex(np.random.permutation(df.index))
len(df)
df.head()

Unnamed: 0,notes,label
5496,"On 15 Dec, BNP candidate Jalal Uddin and nine ...",0
419,"On January 18, JNIM and/or ISGS militants atta...",2
1135,A soldier of the 17th Infantry Brigade was rep...,2
1219,December 23. English speaking separatists atta...,2
1489,December 15. Woman assaulted at the Mahamasina...,2


In [93]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(df['notes'], df['label'])
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)
hm = {'0':0,'1':0,'2':0,'3':0}
for d in valid_y:
    if d == 0:
        hm['0'] += 1
    elif d == 1:
        hm['1'] += 1
    elif d == 2:
        hm['2'] += 1
    else:
        hm['3'] += 1
print(hm)

{'0': 505, '1': 490, '2': 499, '3': 506}


In [94]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(df['notes'])

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

In [95]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', max_features=5000, token_pattern=r'\w{1,}')#'
tfidf_vect.fit(df['notes'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', ngram_range=(2,3), max_features=5000, token_pattern=r'\w{1,}')
tfidf_vect_ngram.fit(df['notes'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', ngram_range=(2,3), max_features=5000, token_pattern=r'\w{1,}')
tfidf_vect_ngram_chars.fit(df['notes'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 

In [44]:
df['char_count'] = df['notes'].apply(len)
df['word_count'] = df['notes'].apply(lambda x: len(x.split()))
df['word_density'] = df['char_count'] / (df['word_count']+1)
df['punctuation_count'] = df['notes'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 
df['title_word_count'] = df['notes'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
df['upper_case_word_count'] = df['notes'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))

In [45]:
pos_family = {
    'noun' : ['NN','NNS','NNP','NNPS'],
    'pron' : ['PRP','PRP$','WP','WP$'],
    'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'adj' :  ['JJ','JJR','JJS'],
    'adv' : ['RB','RBR','RBS','WRB']
}

# function to check and get the part of speech tag count of a words in a given sentence
def check_pos_tag(x, flag):
    cnt = 0
    try:
        wiki = textblob.TextBlob(x)
        for tup in wiki.tags:
            ppo = list(tup)[1]
            if ppo in pos_family[flag]:
                cnt += 1
    except:
        pass
    return cnt

df['noun_count'] = df['notes'].apply(lambda x: check_pos_tag(x, 'noun'))
df['verb_count'] = df['notes'].apply(lambda x: check_pos_tag(x, 'verb'))
df['adj_count'] = df['notes'].apply(lambda x: check_pos_tag(x, 'adj'))
df['adv_count'] = df['notes'].apply(lambda x: check_pos_tag(x, 'adv'))
df['pron_count'] = df['notes'].apply(lambda x: check_pos_tag(x, 'pron'))

In [46]:
# train a LDA Model
lda_model = decomposition.LatentDirichletAllocation(n_components=20, learning_method='online', max_iter=20)
X_topics = lda_model.fit_transform(xtrain_count)
topic_word = lda_model.components_ 
vocab = count_vect.get_feature_names()

# view the topic models
n_top_words = 10
topic_summaries = []
for i, topic_dist in enumerate(topic_word):
    topic_words = numpy.array(vocab)[numpy.argsort(topic_dist)][:-(n_top_words+1):-1]
    topic_summaries.append(' '.join(topic_words))

In [96]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    print(predictions)
    predictions[0] = 0
    return metrics.accuracy_score(predictions, valid_y), 3#metrics.log_loss(predictions, valid_y)

In [97]:
# Naive Bayes on Count Vectors
accuracy, loss = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xvalid_count)
print("NB, Count Vectors: ", accuracy, loss)

# Naive Bayes on Word Level TF IDF Vectors
accuracy, loss = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf)
print ("NB, WordLevel TF-IDF: ", accuracy, loss)

# Naive Bayes on Ngram Level TF IDF Vectors
accuracy, loss = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print ("NB, N-Gram Vectors: ", accuracy, loss)

# Naive Bayes on Character Level TF IDF Vectors
accuracy, loss = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print ("NB, CharLevel Vectors: ", accuracy, loss)

[3 3 3 ... 3 3 2]
NB, Count Vectors:  0.8865 3
[3 3 3 ... 3 3 2]
NB, WordLevel TF-IDF:  0.884 3
[3 3 3 ... 3 3 2]
NB, N-Gram Vectors:  0.873 3
[3 3 3 ... 3 3 2]
NB, CharLevel Vectors:  0.8485 3


In [26]:
# Linear Classifier on Count Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_count, train_y, xvalid_count)
print("LR, Count Vectors: ", accuracy)

# Linear Classifier on Word Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xvalid_tfidf)
print("LR, WordLevel TF-IDF: ", accuracy)

# Linear Classifier on Ngram Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("LR, N-Gram Vectors: ", accuracy)

# Linear Classifier on Character Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("LR, CharLevel Vectors: ", accuracy)



LR, Count Vectors:  0.9366666666666666
LR, WordLevel TF-IDF:  0.9206666666666666
LR, N-Gram Vectors:  0.9053333333333333
LR, CharLevel Vectors:  0.914


In [19]:
# SVM on Ngram Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf, train_y, xvalid_tfidf)
print ("SVM, Vectors: ", accuracy)
# SVM on Ngram Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print ("SVM, N-Gram Vectors: ", accuracy)

SVM, Vectors:  0.314
SVM, N-Gram Vectors:  0.314


In [21]:
# RF on Count Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_count, train_y, xvalid_count)
print ("RF, Count Vectors: ", accuracy)

# RF on Word Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf, train_y, xvalid_tfidf)
print ("RF, WordLevel TF-IDF: ", accuracy)



RF, Count Vectors:  0.886




RF, WordLevel TF-IDF:  0.876


In [22]:
# Extereme Gradient Boosting on Count Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_count.tocsc(), train_y, xvalid_count.tocsc())
print ("Xgb, Count Vectors: ", accuracy)

# Extereme Gradient Boosting on Word Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf.tocsc(), train_y, xvalid_tfidf.tocsc())
print ("Xgb, WordLevel TF-IDF: ", accuracy)


Xgb, Count Vectors:  0.8886666666666667
Xgb, WordLevel TF-IDF:  0.8866666666666667
