In [1]:
#Importing libraries
import numpy as np
import pandas as pd
import tensorflow as tf

from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  from numpy.core.umath_tests import inner1d


In [2]:
#Label encoder that maps each terrorist organisation name (the `gname` column)
#to an integer class id; it is fitted later, once the feature rows are selected
label_encoder = LabelEncoder()

In [3]:
#Standard scaler used further down to standardise the feature columns
#before they are handed to the classifiers
scale_values  = StandardScaler()

In [4]:
# Create classifiers for processing.
# random_state is pinned on the estimators that draw random numbers while
# fitting (bagging and extra-trees) so that a Restart & Run All reproduces the
# same scores; 42 matches the seed passed to train_test_split.
# NOTE: variable names are kept for compatibility with the cells below even
# though `rf` holds a BaggingClassifier and `gnb` holds a BernoulliNB.
rf = BaggingClassifier(max_features=7, random_state=42)
et = ExtraTreesClassifier(random_state=42)
knn = KNeighborsClassifier()
svc = SVC()
rg = RidgeClassifier()
gnb = BernoulliNB()

In [5]:
#Importing and reading the Global Terrorism Database (GTD) Excel export using Pandas
#NOTE(review): the relative path starts with '../..Dataset' (no slash after the
#second '..') - confirm this is the intended directory name and not a typo for '../../Dataset'
excel_file = '../..Dataset/gtd_14to17_0718dist.xlsx'
Raw_Dataset = pd.read_excel(excel_file)
#Preview the first five rows of the raw table
Raw_Dataset.head()

Unnamed: 0,eventid,iyear,imonth,iday,approxdate,extended,resolution,country,country_txt,region,...,addnotes,scite1,scite2,scite3,dbsource,INT_LOG,INT_IDEO,INT_MISC,INT_ANY,related
0,201401010001,2014,1,1,,0,,45,Colombia,3,...,,"""ELN bomb Colombia oil pipeline infrastructure...","""Colombia Guerrilla Update: ELN Blows Up Crude...",,START Primary Collection,0,0,0,0,"201401010001, 201401010055, 201401010056, 2014..."
1,201401010002,2014,1,1,,0,,182,Somalia,11,...,Casualty numbers for this attack represent an ...,"""11 dead, 40 injured in bomb attack in Somalia...","""Somalia twin bombing toll rises to 11: police...","""Somalia: Twin Suicide Attack at Mogadishu Hot...",START Primary Collection,0,0,0,0,"201401010002, 201401010039, 201401010040"
2,201401010003,2014,1,1,,0,,153,Pakistan,6,...,Casualty numbers for this attack conflict acro...,"""Pakistan car bombing kills Shia pilgrims,"" Al...","""Suicide blast hits pilgrims' bus,"" Dawn (Paki...","""Two dead, 31 injured in bus bombing in Pakist...",START Primary Collection,0,0,0,0,
3,201401010004,2014,1,1,,0,,153,Pakistan,6,...,,"""Highlights: Pakistan Balochistan Press 2 Janu...",,,START Primary Collection,-9,-9,0,-9,
4,201401010005,2014,1,1,,1,,182,Somalia,11,...,,"""Somalia: Al-Shabaab Militants Free Kidnapped ...","""SOCAFRICA: Al-Shabaab Incident Tracker, 29 De...",,START Primary Collection,0,0,0,0,


In [6]:
#Keep only the rows where crit1, crit2 and crit3 are all 1 and doubtterr is 0,
#i.e. the attacks we treat as confirmed to be perpetrated by a terrorist organisation
Dataset = Raw_Dataset.loc[
    Raw_Dataset['crit1'].eq(1)
    & Raw_Dataset['crit2'].eq(1)
    & Raw_Dataset['crit3'].eq(1)
    & Raw_Dataset['doubtterr'].eq(0)
]

In [7]:
#Select the columns used to build the dataset.
#Order matters: `summary` and `gname` must stay last, because later cells
#separate them from the numeric columns with a `[:, :-2]` slice.
features = Dataset[[
    'extended', 'country', 'region', 'vicinity',
    'attacktype1', 'targtype1', 'natlty1',
    'weaptype1', 'weapsubtype1',
    'summary', 'gname',
]]

In [8]:
#Discard every row that has a missing value in any of the selected columns
features = features.dropna(axis=0, how='any')

In [9]:
#Preparing target values: each organisation name in `gname` is replaced by an integer class label.
#fit_transform learns the set of labels and encodes them in one pass, so a separate fit call is not needed.
target = label_encoder.fit_transform(features['gname'])

In [10]:
#Here we tokenize words in summary to make it a feature
#num_words=20000 caps the vocabulary the tokenizer will emit word indices for
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(features['summary'])

#Each summary text becomes a list of integer word indices
x_seq = tokenizer.texts_to_sequences(features["summary"])

#Every sequence is brought to a fixed length of 100, padding with 0 at the end ("post")
#NOTE(review): summaries longer than 100 tokens are presumably cut by pad_sequences' default truncation - confirm which end is dropped
features_summary = sequence.pad_sequences(x_seq, maxlen=100, padding="post", value=0)

In [11]:
#We will now convert the pandas dataframe to a numpy array for processing
#Column order is preserved, so the last two columns are still `summary` and `gname`
features = features.values

In [12]:
#values are scaled for more accuracy 
features_summary = scale_values.fit_transform(features_summary)
features[:,:-2] = scale_values.fit_transform(features[:,:-2])
#Y = Y.reshape(-1,1)
#Y = scaler.fit_transform(Y)



In [13]:
#Array with tokenize summary and array with other features are combined here to generate a single array
features = np.concatenate((features[:,:-2],features_summary), axis = 1)

In [14]:
#spliting of array to train and test samples
x_train, x_test, y_train, y_test = train_test_split(features, 
                                                    target, 
                                                    test_size=0.20, 
                                                    random_state=42)
#shapes are displayed of repective arrays
print (x_train.shape,y_train.shape)
print (x_test.shape,y_test.shape)

(32490, 109) (32490,)
(8123, 109) (8123,)


In [15]:
#Bagging classifier (held in `rf`): train on the training split, then show the test accuracy as a percentage
rf.fit(x_train, y_train).score(x_test, y_test) * 100

69.19857195617382

In [16]:
#Extra-trees classifier: train on the training split, then show the test accuracy as a percentage
et.fit(x_train, y_train).score(x_test, y_test) * 100

76.80659854733473

In [17]:
#K-nearest-neighbours classifier: train on the training split, then show the test accuracy as a percentage
knn.fit(x_train, y_train).score(x_test, y_test) * 100

54.967376585005546

In [18]:
#Support vector machine classifier: train on the training split, then show the test accuracy as a percentage
svc.fit(x_train, y_train).score(x_test, y_test) * 100

56.653945586605936

In [19]:
#Ridge classifier: train on the training split, then show the test accuracy as a percentage
rg.fit(x_train, y_train).score(x_test, y_test) * 100

50.473962821617626

In [20]:
#Bernoulli naive Bayes classifier (held in `gnb`): train on the training split, then show the test accuracy as a percentage
gnb.fit(x_train, y_train).score(x_test, y_test) * 100

55.52135910377939