In [116]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
import seaborn as sns

In [117]:
def relabel(row):
    if row['event_type'].strip() == "Riots" :
        return 0
    if row['event_type'].strip() == "Protests" :
        return 1
    if row['event_type'].strip() == "Violence against civilians":
        return 2
    return 3

In [118]:
def stemSentence(row):
    sentence = row['notes']
    words = word_tokenize(sentence)
    final = ""
    sb = ss("english")
    for w in words:
        if w not in stopwords.words('english'):
            root = sb.stem(w)
            final += root
            final += " "
    return final

In [119]:
def stem(sentence):
    words = word_tokenize(sentence)
    final = ""
    sb = ss("english")
    for w in words:
        if w not in stopwords.words('english'):
            root = sb.stem(w)
            final += root
            final += " "
    return final

In [120]:
prot = pd.read_csv("dataset/protest.csv")
riot = pd.read_csv("dataset/riots.csv")
others = pd.read_csv("dataset/others.csv")
df = pd.read_csv("dataset/violence.csv")
df = df.append(prot, ignore_index = True)
df = df.append(riot, ignore_index = True)
df = df.append(others, ignore_index = True)
df['label'] = df.apply (lambda row: relabel(row), axis=1)
df['notes'] = df.apply (lambda row: stemSentence(row), axis=1)
columns = ['notes', 'label']
df = df[columns]
df.columns = ['notes', 'label']
# df = df.reindex(np.random.permutation(df.index))
# len(df)
df.head()

Unnamed: 0,notes,label
0,31 januari . the nyatura fdp ( niyonzimana fac...,2
1,"on 31 januari , 2019 , algerian naval forc sho...",2
2,"on januari 31 , presum jnim and/or isg milit a...",2
3,"on januari 31 , armi camp sentinel gorom-gorom...",2
4,"on januari 31 , presum jnim and/or isg milit w...",2


In [54]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer as ss
ek = ['elect', 'poll', 'ballot', 'by-elect', 'vote', 'soapbox', 'teller']
elects = 0
for d in df['notes'].iteritems():
    stems = stemSentence(d[1])
    words = word_tokenize(stems)
    for e in ek:
        if e in words:
            elects += 1
            break
print(elects, len(df['notes']))

392 7997


In [142]:
df.to_csv("train_data.csv", sep=',', encoding='utf-8')
print(df.notes.shape)

(8178,)


In [143]:
import pickle
from sklearn.feature_extraction.text import CountVectorizer


#GET VECTOR COUNT
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(df.notes)

#SAVE WORD VECTOR
pickle.dump(count_vect.vocabulary_, open("count_vector.pkl","wb"), protocol=2)

In [144]:
from sklearn.feature_extraction.text import TfidfTransformer

#TRANSFORM WORD VECTOR TO TF IDF
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

#SAVE TF-IDF
pickle.dump(tfidf_transformer, open("tfidf.pkl","wb"), protocol=2)

In [145]:
# Multinomial Naive Bayes

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

#clf = MultinomialNB().fit(X_train_tfidf, training_data.flag)
X_train, X_test, y_train, y_test = train_test_split(X_train_tfidf, df.label, test_size=0.25, random_state=42)
clf = MultinomialNB().fit(X_train, y_train)

#SAVE MODEL
pickle.dump(clf, open("nb_model.pkl", "wb"), protocol=2)

In [146]:
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

category_list = ["Riots", "Protests", "Violence against civilians", "Others"]

docs_new = "adsfdas fasdf adsf adsfdsf"
docs_new = [docs_new]

#LOAD MODEL
loaded_vec = CountVectorizer(vocabulary=pickle.load(open("count_vector.pkl", "rb")))
loaded_tfidf = pickle.load(open("tfidf.pkl","rb"))
loaded_model = pickle.load(open("nb_model.pkl","rb"))

X_new_counts = loaded_vec.transform(docs_new)
X_new_tfidf = loaded_tfidf.transform(X_new_counts)
predicted = loaded_model.predict(X_new_tfidf)

print(category_list[predicted[0]])

Others


In [147]:
from sklearn.neural_network import MLPClassifier

clf_neural = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(15,), random_state=1)

X_train, X_test, y_train, y_test = train_test_split(X_train_tfidf, df.label, test_size=0.25, random_state=42)
print(X_test.shape)
clf_neural.fit(X_train, y_train)

(2045, 15607)


MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(15,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=1, shuffle=True, solver='lbfgs', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [148]:
pickle.dump(clf_neural, open("softmax.pkl", "wb"), protocol=2)

In [149]:
docs_new = stem("On 27 Apr, family members of a deceased woman staged a protest in Jammu city (J&K), accusing the SMGS Hospital of negligence resulting in the pregnant woman's death after doctors failed to remove a dead foetus from the woman's womb.")
docs_new = [docs_new]
X_new_counts = loaded_vec.transform(docs_new)
X_new_tfidf = loaded_tfidf.transform(X_new_counts)
predicted = clf_neural.predict(X_new_tfidf)
print(predicted, category_list[predicted[0]])
result_softmax = pd.DataFrame( {'true_labels': y_test,'predicted_labels': predicted})
result_softmax.to_csv('res_softmax.csv', sep = ',')
count = 0
print(predicted)
# for predicted_item, result in zip(predicted, y_test):
#     print(category_list[predicted_item], ' - ', category_list[result], predicted_item, result)
#     if result != predicted_item:
#         count += 1
# print(count, len(predicted))

[1] Protests


ValueError: array length 1 does not match index length 2045