In [1]:
import pandas as pd

In [2]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

Using TensorFlow backend.


In [98]:
hm = {'r':0,'p':0,'v':0}
def label_race(row):
    if row['event_type'].strip() == "Riots" :
        hm['r'] += 1
        return 0
    if row['event_type'].strip() == "Protests" :
        hm['p'] += 1
        return 1
    if row['event_type'].strip() == "Violence against civilians":
        hm['v'] += 1
        return 2
    return 3

In [99]:
pos_family = {
    'noun' : ['NN','NNS','NNP','NNPS'],
    'pron' : ['PRP','PRP$','WP','WP$'],
    'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'adj' :  ['JJ','JJR','JJS'],
    'adv' : ['RB','RBR','RBS','WRB']
}

def check_pos_tag(x, flag):
    cnt = 0
    try:
        wiki = textblob.TextBlob(x)
        for tup in wiki.tags:
            ppo = list(tup)[1]
            if ppo in pos_family[flag]:
                cnt += 1
    except:
        pass
    return cnt

In [100]:
df = pd.read_csv("dataset/acled.csv")

In [101]:
df['label'] = df.apply (lambda row: label_race(row), axis=1)
#df['label'] = df['event_type'].factorize()[0]
print(hm)

{'r': 546, 'p': 2652, 'v': 139}


In [93]:
columns = ['notes', 'label']
df = df[columns]
df.columns = ['notes', 'label']
t = {'notes':["On 4 Mar, in a coastal area of Tanur (Malappuram, Kerala), a former secretary and workers of DYFI's area committee (CPM) were seriously injured in an attack by unidentified assailants. The incident is believed to be a continuation of political clash between the CPM and the IUML workers in the region."],'label':[2]}
test = pd.DataFrame(data=t)
df = df[:-1]

In [94]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(df['notes'], df['label'])
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

In [95]:
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
test_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(df['notes'])
test_vect.fit(test['notes'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)
test_valid = test_vect.transform(test['notes'])

In [65]:
df['noun_count'] = df['notes'].apply(lambda x: check_pos_tag(x, 'noun'))
df['verb_count'] = df['notes'].apply(lambda x: check_pos_tag(x, 'verb'))
df['adj_count'] = df['notes'].apply(lambda x: check_pos_tag(x, 'adj'))
df['adv_count'] = df['notes'].apply(lambda x: check_pos_tag(x, 'adv'))
df['pron_count'] = df['notes'].apply(lambda x: check_pos_tag(x, 'pron'))

In [66]:
df.head()

Unnamed: 0,notes,label,noun_count,verb_count,adj_count,adv_count,pron_count
0,"On 23 March, nearly 3,000 Delhi-Okhla resident...",1,13,3,2,3,0
1,"On 23 March, to groups of villagers - from Bar...",0,22,6,4,2,3
2,"Assumed on 23 March, two groups of Congress wo...",0,17,2,2,0,0
3,"Between 22-23 Mar, in Bengaluru (Bengaluru Urb...",0,17,5,4,0,3
4,"On March 23, villagers vandalized the house of...",0,21,3,3,1,2


In [96]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, t):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    print(feature_vector_train.shape)
    # predict the labels on validation dataset
    print(feature_vector_valid.shape)
    print(t.shape)
    predictions = classifier.predict(feature_vector_valid)
    #p = classifier.predict(t)
    print(predictions)
    return metrics.accuracy_score(predictions, valid_y)

In [97]:
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf, test_valid)
print ("NB, WordLevel TF-IDF: ", accuracy)

(2498, 5000)
(833, 5000)
(1, 39)
[1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1
 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 0 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 1 1 1 0 1 1 0
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 