In [None]:
## Importing Packages

import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
import scipy.stats as stats
import os, sys, operator, warnings


# Scikit-learn Auxiliary Modules
from sklearn.metrics import accuracy_score, auc, classification_report, confusion_matrix
from sklearn.metrics import explained_variance_score, f1_score, mean_squared_error, mean_absolute_error
from sklearn.metrics import precision_recall_curve, precision_score, r2_score, roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_predict
from sklearn.model_selection import KFold, learning_curve, StratifiedKFold, train_test_split, validation_curve 
from sklearn.feature_selection import chi2, f_classif, SelectKBest
from sklearn.preprocessing import StandardScaler, PolynomialFeatures 
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline 


# Scikit-learn Classification Models
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB


# Natural Language Processing
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, CountVectorizer, TfidfVectorizer
from textblob import TextBlob, Word, WordList 



# Plotly 
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff

py.offline.init_notebook_mode(connected=True)

# Other imports
import itertools
# import pprint
import patsy

# Setting some styles and options
# Use seaborn's 'whitegrid' style for the matplotlib/seaborn figures drawn below.
sns.set_style('whitegrid') 
# Let pandas render up to 40 columns before it truncates a displayed frame.
pd.options.display.max_columns = 40 

%config InlineBackend.figure_format = 'retina'
 
%matplotlib inline

print('Packages Imported Successfully!')

Packages Imported Successfully!


In [None]:
# Colab-only: open the browser upload widget and keep its result in `uploaded`.
# The loading cell below indexes `uploaded` by file name and wraps the value in
# io.BytesIO, i.e. each entry holds the bytes of one uploaded file.
from google.colab import files
uploaded = files.upload()

Saving globalterrorismdb_0718dist.csv to globalterrorismdb_0718dist.csv


In [None]:
import io
# Colab stores a re-uploaded file under a renamed key (e.g. "name (1).csv"), so
# fall back to the only uploaded file when the canonical name is not present.
GTD_FILENAME = 'globalterrorismdb_0718dist.csv'
if GTD_FILENAME in uploaded:
    gtd_bytes = uploaded[GTD_FILENAME]
elif len(uploaded) == 1:
    gtd_bytes = next(iter(uploaded.values()))
else:
    raise KeyError('Expected an upload named {!r}, got: {}'.format(GTD_FILENAME, sorted(uploaded)))

# The GTD export is not UTF-8; low_memory=False reads the file in one pass so
# column dtypes are inferred consistently.
data = pd.read_csv(io.BytesIO(gtd_bytes), low_memory = False, encoding = 'ISO-8859-1')
print('Data Loaded Successfuly!')

Data Loaded Successfuly!


In [None]:
# Columns kept from the raw GTD export. The order matters: a later cleaning cell
# addresses several of these columns by position with iloc.
data_columns = [
    ## Spatio-Temporal Variables:
    'iyear', 'imonth', 'iday', 'latitude', 'longitude',

    ## Binary Variables:
    'extended', 'vicinity', 'crit1', 'crit2', 'crit3', 'doubtterr',
    'multiple', 'success', 'suicide', 'guncertain1',  ## check back guncertain
    'claimed', 'property', 'ishostkid',

    ## Continuous Variables:
    'nkill', 'nwound',

    ## Categorical variables (textual):
    'country_txt', 'region_txt', 'alternative_txt', 'attacktype1_txt', 'targtype1_txt',
    'natlty1_txt', 'weaptype1_txt',

    ## Descriptive Variables:
    'target1', 'gname', 'summary',
]

# To avoid confusion, we restrict the dataset to only attacks that were of terrorist nature:
# all three inclusion criteria are met and the incident is not flagged as doubtful.
meets_all_criteria = (data.crit1 == 1) & (data.crit2 == 1) & (data.crit3 == 1)
not_doubtful = data.doubtterr == 0

# .copy() makes gtd an independent frame, so the column assignments in the
# cleaning cells below do not trigger SettingWithCopyWarning on a filtered slice.
gtd = data.loc[meets_all_criteria & not_doubtful, data_columns].copy()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
gtd.describe()

Unnamed: 0,iyear,imonth,iday,latitude,longitude,extended,vicinity,crit1,crit2,crit3,doubtterr,multiple,success,suicide,guncertain1,claimed,property,ishostkid,nkill,nwound
count,138879.0,138879.0,138879.0,136000.0,136000.0,138879.0,138879.0,138879.0,138879.0,138879.0,138879.0,138878.0,138879.0,138879.0,138626.0,96823.0,138879.0,138718.0,132111.0,127925.0
mean,2003.965668,6.461474,15.519323,23.871581,30.661553,0.049964,0.063487,1.0,1.0,1.0,0.0,0.154294,0.882567,0.038501,0.093597,0.045,-0.638304,0.062306,2.254339,3.550072
std,12.869845,3.395808,8.807931,18.290251,55.955341,0.217872,0.284328,0.0,0.0,0.0,0.0,0.361231,0.321937,0.192403,0.291269,1.081471,3.228839,0.470398,10.931702,40.660962
min,1970.0,0.0,0.0,-53.154613,-157.858333,0.0,-9.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,-9.0,-9.0,-9.0,0.0,0.0
25%,1992.0,4.0,8.0,11.840929,8.737554,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2010.0,6.0,15.0,31.6364,44.004623,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
75%,2014.0,9.0,23.0,34.597704,69.147011,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,2.0,2.0
max,2017.0,12.0,31.0,74.633553,179.366667,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1384.0,8191.0


In [None]:
# Sanity check: show the rows recorded for 11 September 2001 in the United States.
print('9/11 attacks:')
on_sept_11_2001 = gtd.iyear.eq(2001) & gtd.imonth.eq(9) & gtd.iday.eq(11)
in_united_states = gtd.country_txt.eq('United States')
gtd.loc[on_sept_11_2001 & in_united_states]

9/11 attacks:


Unnamed: 0,iyear,imonth,iday,latitude,longitude,extended,vicinity,crit1,crit2,crit3,doubtterr,multiple,success,suicide,guncertain1,claimed,property,ishostkid,nkill,nwound,country_txt,region_txt,alternative_txt,attacktype1_txt,targtype1_txt,natlty1_txt,weaptype1_txt,target1,gname,summary
73126,2001,9,11,40.697132,-73.931351,0,0,1,1,1,0.0,1.0,1,1,0.0,1.0,1,1.0,1384.0,8190.0,United States,North America,,Hijacking,Private Citizens & Property,United States,Vehicle (not to include vehicle-borne explosiv...,Passengers and crew members on American Airlin...,Al-Qaida,09/11/2001: This was one of four related attac...
73127,2001,9,11,40.697132,-73.931351,0,0,1,1,1,0.0,1.0,1,1,0.0,1.0,1,1.0,1383.0,8191.0,United States,North America,,Hijacking,Private Citizens & Property,United States,Vehicle (not to include vehicle-borne explosiv...,Passengers and crew members on United Airlines...,Al-Qaida,09/11/2001: This was one of four related attac...
73128,2001,9,11,38.878742,-77.100006,0,0,1,1,1,0.0,1.0,1,1,0.0,1.0,1,1.0,190.0,106.0,United States,North America,,Hijacking,Government (General),United States,Vehicle (not to include vehicle-borne explosiv...,Passengers and crew members on American Airlin...,Al-Qaida,09/11/2001: This was one of four related attac...
73129,2001,9,11,40.018464,-78.907197,0,1,1,1,1,0.0,1.0,1,1,0.0,1.0,1,1.0,44.0,6.0,United States,North America,,Hijacking,Private Citizens & Property,United States,Vehicle (not to include vehicle-borne explosiv...,Airline passengers and crew members on board A...,Al-Qaida,09/11/2001: This was one of four related attac...


In [None]:
# Shorten the verbose GTD label for vehicle attacks. The result is assigned back
# to the column: replace(..., inplace=True) called through attribute access acts
# on an intermediate Series and is not guaranteed to update gtd.
gtd['weaptype1_txt'] = gtd['weaptype1_txt'].replace(
    'Vehicle (not to include vehicle-borne explosives, i.e., car or truck bombs)',
    'Vehicle')

In [None]:
 # These are the columns at positions 6, 15, 16 and 17 of data_columns; naming
 # them keeps the replacement on the right columns if the list is ever reordered.
 unknown_coded_flags = ['vicinity', 'claimed', 'property', 'ishostkid']
 # Treat the -9 placeholder as 0 (presumably the GTD "unknown" code; confirm
 # against the codebook) so these flags only contain non-negative values.
 gtd[unknown_coded_flags] = gtd[unknown_coded_flags].replace(-9, 0)

In [None]:
# (3) Collapse any `claimed` value of 2 into 1. Assign back instead of using
# inplace=True through attribute access, which may act on a temporary Series.
gtd['claimed'] = gtd['claimed'].replace(2, 1)

In [None]:
# Lower-case the free-text columns so identical words compare equal.
for text_col in ('target1', 'gname', 'summary'):
    gtd[text_col] = gtd[text_col].str.lower()

# Missing targets and the 'unk' shorthand both become 'unknown'.
gtd['target1'] = gtd['target1'].fillna('unknown').replace('unk', 'unknown')

In [None]:
# Fill missing kill/wound counts with the column median, then store whole numbers.
for count_col in ('nkill', 'nwound'):
    filled_counts = gtd[count_col].fillna(gtd[count_col].median())
    gtd[count_col] = np.round(filled_counts).astype(int)

In [None]:
# Total casualties per attack, plus a binary flag: 0 = no casualties, 1 = at least one.
gtd['casualties'] = gtd['nkill'] + gtd['nwound']
gtd['nclass'] = gtd['casualties'].ne(0).astype('int64')

In [None]:
def categorize_perpetrators(column, min_count=10, label='small time perpetrator'):
    '''
    Group rarely seen perpetrators under a single shared label.

    Perpetrator groups that occur fewer than `min_count` times in `column` are
    re-assigned to `label`; every other value is kept. The result is cast to str.

    Parameters
    ----------
    column : pandas.Series
        Perpetrator group name of each attack.
    min_count : int, default 10
        Groups with fewer occurrences than this are replaced.
    label : str, default 'small time perpetrator'
        Replacement value for the rare groups.

    Returns
    -------
    pandas.Series
        Series of strings with the same index as `column`.
    '''
    perpetrators_count = column.value_counts()
    rare_groups = perpetrators_count.index[perpetrators_count < min_count]
    # isin() does hashed lookups, instead of scanning a Python list once per row.
    return column.where(~column.isin(rare_groups), label).astype(str)

In [None]:
# Fold perpetrator groups with fewer than 10 recorded attacks into one shared label.
gtd.gname = categorize_perpetrators(gtd.gname)
print('Perpetrators categorized!')

Perpetrators categorized!


In [None]:
def categorize_target1(column, min_count=50, label='isolated target'):
    '''
    Collapse free-text target descriptions onto the most frequent targets.

    The function performs three operations:
    - It uses TextBlob to lemmatize the text (i.e. reduce each word to its
      canonical form, for example 'civilians' -> 'civilian'), which increases
      the value counts of recurring descriptions.
    - It builds the list of top targets: lemmatized descriptions that occur more
      than `min_count` times. Every description containing a top target as a
      substring is replaced by that top target; top targets are tried from most
      to least frequent and later matches override earlier ones.
    - Every description that is still not a top target becomes `label`.

    Parameters
    ----------
    column : pandas.Series
        Target descriptions as strings (no missing values).
    min_count : int, default 50
        A description must occur more than this many times to be a top target.
    label : str, default 'isolated target'
        Category assigned to everything that is not a top target.

    Returns
    -------
    pandas.Series
        Categorized targets with the same index as `column`.
    '''
    # Lemmatize each distinct description once; the column repeats many values.
    lemma_of = {
        text: " ".join(word.lemmatize() for word in TextBlob(text).words)
        for text in column.unique()
    }
    lemmatized = pd.Series([lemma_of[text] for text in column], index=column.index)

    target_count = lemmatized.value_counts()
    # An empty string is a substring of every description, so it must never be
    # used as a top target or it would swallow the whole column.
    top_targets = [target for target in target_count.index[target_count > min_count] if target]
    top_target_set = set(top_targets)

    def _collapse(text):
        # Same replacement order as applying one pass per top target to the column.
        for target in top_targets:
            if target in text:
                text = target
        return text if text in top_target_set else label

    # Resolve each distinct lemmatized description once, then map it back.
    collapsed = {text: _collapse(text) for text in lemmatized.unique()}
    return lemmatized.map(collapsed)

In [None]:
# Download the WordNet corpus; presumably this backs the lemmatize() calls made
# in categorize_target1 (confirm against the TextBlob documentation).
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:

# Lemmatize the target descriptions and collapse them onto the frequent targets.
gtd.target1 = categorize_target1(gtd.target1)
print('Targets categorized!')

Targets categorized!


In [None]:
# Count nulls per column (the columns dropped before modelling are left out)
# and report the four columns with the most gaps.
null_counts = gtd.drop(columns=['latitude', 'longitude', 'summary']).isnull().sum()
print('missing data : \n')
print(null_counts.sort_values(ascending=False).head(4))

missing data : 

alternative_txt    138873
claimed             42056
natlty1_txt          1324
guncertain1           253
dtype: int64


In [None]:
# Modelling frame: everything except the coordinates and the free-text summary.
unused_columns = ['longitude', 'latitude', 'summary']
df = gtd.drop(columns=unused_columns)

In [None]:
# (rows, columns) of the modelling frame.
df.shape

(138879, 29)

In [None]:
# Missing flags are treated as 0. Assign the filled columns back: fillna(...,
# inplace=True) through attribute access may act on a temporary Series and
# leave df unchanged.
df['guncertain1'] = df['guncertain1'].fillna(0)
df['ishostkid'] = df['ishostkid'].fillna(0)

In [None]:
# Target for the imputation model: the `claimed` flag, which is missing for many rows.
y_temp = df.claimed
y_temp.shape

(138879,)

In [None]:
# Text columns that are one-hot encoded in the design-matrix formula.
categorical = [
    'country_txt',
    'alternative_txt',
    'attacktype1_txt',
    'targtype1_txt',
    'weaptype1_txt',
    'gname',
    'target1',
]

# Columns that enter the formula as plain numeric terms.
numerical = [
    'extended',
    'vicinity',
    'multiple',
    'success',
    'suicide',
    'guncertain1',
    'casualties',
    'property',
    'ishostkid',
]

In [None]:
# patsy formula: numeric terms as-is, each text column wrapped in C(...), and
# '-1' to leave out the intercept column.
numeric_part = ' + '.join(numerical)
categorical_part = ' + '.join('C({})'.format(name) for name in categorical)
formula = numeric_part + ' + ' + categorical_part + ' -1'
formula

'extended + vicinity + multiple + success + suicide + guncertain1 + casualties + property + ishostkid + C(country_txt) + C(alternative_txt) + C(attacktype1_txt) + C(targtype1_txt) + C(weaptype1_txt) + C(gname) + C(target1) -1'

In [None]:
# By default patsy drops every row that has a missing value in any formula
# column. `alternative_txt` is empty for almost every attack, so the unfilled
# frame produced a design matrix of only a handful of rows that no longer lined
# up with y_temp. Fill the gaps first so there is exactly one row per attack.
design_source = df[numerical + categorical].copy()
text_cols = design_source.select_dtypes(exclude='number').columns
design_source = design_source.fillna({col: 'Unknown' for col in text_cols}).fillna(0)

X_temp = patsy.dmatrix(formula, data = design_source, return_type = 'dataframe')
assert len(X_temp) == len(y_temp), 'design matrix and labels have different row counts'
print(X_temp.shape, y_temp.shape)

(6, 855) (138879,)


In [None]:
# Rows with a known `claimed` value train the imputation model; rows where it is
# missing are the ones to predict. The mask is aligned to X_temp's own index
# first, so the selection never relies on pandas silently reindexing a boolean key.
claimed_known = y_temp.reindex(X_temp.index).notnull()
X_train = X_temp.loc[claimed_known]
X_test = X_temp.loc[~claimed_known]


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.



In [None]:
# Restrict the labels to the rows that exist in the design matrix before
# splitting, so y_train / y_test cover the same attacks as X_train / X_test.
design_labels = y_temp.reindex(X_temp.index)
y_train = design_labels[design_labels.notnull()]
y_test = design_labels[design_labels.isnull()]

In [None]:
# Shapes of the imputation split; each X/y pair must have the same number of rows.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((6, 855), (96823,), (0, 855), (42056,))

In [None]:

# Hand the estimator a plain ndarray. np.asarray also accepts an existing array,
# so re-running this cell is harmless (the former .values/.reshape call failed
# on a second run and the reshape to the same shape did nothing).
X_train = np.asarray(X_train)

Logistic Regression

In [None]:
# Logistic regression used to impute the missing `claimed` values.
lr = LogisticRegression(random_state = 42)
lr = lr.fit(X_train, y_train)

In [None]:
# Predicted `claimed` values, indexed like X_test so they can be matched back to rows of df.
predicted_claimed = lr.predict(X_test)
predictions = pd.Series(predicted_claimed, index = X_test.index)

In [None]:
# Fill the missing `claimed` values with the model's predictions (matched on
# index). Assigned back to the column because fillna(..., inplace=True) through
# attribute access may act on a temporary Series and leave df unchanged.
df['claimed'] = df['claimed'].fillna(predictions)

Exploratory Data Analysis

In [None]:
# Casualties per country. Computed in this cell so the map does not depend on a
# `cpc` variable that is only created further down the notebook.
cpc = (df.groupby('country_txt', as_index = False)['casualties'].sum()
         .sort_values(by = 'casualties', ascending = False))

# Take the year range from the data so the title cannot drift from it.
first_year, last_year = int(df['iyear'].min()), int(df['iyear'].max())

trace = dict(
    type = 'choropleth',
    locationmode = 'country names',
    locations = cpc['country_txt'],

    z = cpc['casualties'],
    name = 'Casualties',
    text = cpc['country_txt'].astype(str) + '<br>' + cpc['casualties'].astype(str),
    hoverinfo = 'text+name',
    autocolorscale = False,
    colorscale = 'Viridis',
#     reversescale = True,
    marker = dict( line = dict ( color = 'rgb(255,255,255)', width = 0.5))

    )


layout = dict(
    title = 'Cumulative Casualties World Map from {} until {}'.format(first_year, last_year),
    geo = dict( showframe = False, showcoastlines = True,
               # plotly's projection type names are lower-case
               projection = dict(type = 'mercator'), showlakes = True,
               lakecolor = 'rgb(255, 255, 255)'
              )
    )


py.iplot(dict( data=[trace], layout=layout ))

In [None]:
# Total casualties for each year, drawn as a dotted salmon line.
cpy = df.groupby('iyear', as_index=False)['casualties'].sum()

dotted_salmon = dict(color = 'salmon', width = 4, dash ='dot')
trace = go.Scatter(
    x = cpy.iyear,
    y = cpy.casualties,
    name = 'Casualties',
    line = dotted_salmon,
    hoverinfo = 'x+y+name',
)

layout = go.Layout(title = 'Casualties per Year')

py.iplot(dict(data = [trace], layout = layout))

In [None]:
# Per region: summed casualties (cpr) and number of recorded attacks (apr),
# shown side by side as grouped bars.
cpr = df.groupby('region_txt', as_index= False)['casualties'].sum()
apr = df.groupby('region_txt')['region_txt'].count()

casualty_bars = dict(
    x = cpr.region_txt,
    y = cpr.casualties,
    marker = dict(color = 'rgb(100, 229, 184)'),
    name = 'Casualties',
)
attack_bars = dict(
    x = apr.index,
    y = apr,
    marker = dict(color = 'rgb(255, 188, 214)'),
    name = 'Terror Attacks',
)
trace_1 = go.Bar(**casualty_bars)
trace_2 = go.Bar(**attack_bars)

layout = go.Layout(title = "Total Casualties and Terror Attacks by Region", barmode='group' )

py.iplot(dict(data = [trace_1,trace_2], layout = layout))

In [None]:
### Top 20 countries by number of attacks
TOP_N_COUNTRIES = 20

# Attacks per country (Series, most attacked first) and casualties per country.
apc = df.groupby('country_txt')['country_txt'].count().sort_values(ascending= False)
cpc = df.groupby('country_txt', as_index= False)['casualties'].sum().sort_values(by = 'casualties', ascending= False)

# The count Series is named 'country_txt' just like its index, which makes a
# merge on that key ambiguous. Give the counts their own column name first.
attacks_per_country = apc.rename('attacks').rename_axis('country_txt').reset_index()
cc = pd.merge(attacks_per_country, cpc, on = 'country_txt')

# Slice x and y together so both axes hold the same countries.
top_attacked = apc.head(TOP_N_COUNTRIES)
trace = go.Bar(x = top_attacked.index, y = top_attacked.values,
                 marker = dict(color = 'rgb(255, 188, 214)'),
                 name = 'Terror Attacks')

layout = go.Layout(title = 'top {} most targeted countries'.format(TOP_N_COUNTRIES), barmode='relative' )

py.iplot(dict(data = [trace], layout = layout))

In [None]:
# Classification target as an array: 0 when an attack caused no casualties, 1 otherwise.
y = df.casualties.ne(0).astype('int64').values

In [None]:
# Numeric / binary predictors of the casualty classifier.
numerical = ['extended', 'vicinity', 'multiple', 'success', 'claimed',
             'suicide', 'guncertain1', 'property', 'ishostkid']

# Text predictors, one-hot encoded through C(...) in the formula. natlty1_txt is
# a text column, so it is listed here rather than among the numeric terms.
categorical = ['country_txt', 'alternative_txt', 'attacktype1_txt',
              'targtype1_txt', 'weaptype1_txt', 'gname', 'target1', 'natlty1_txt']


In [None]:
# patsy formula for the classifier: numeric terms as-is, text columns wrapped in
# C(...), and '-1' to leave out the intercept column.
numeric_part = ' + '.join(numerical)
categorical_part = ' + '.join('C({})'.format(name) for name in categorical)
formula = numeric_part + ' + ' + categorical_part + ' -1'
formula

In [None]:
# By default patsy drops every row with a missing value in any formula column,
# which would leave X with fewer rows than the label array built from df. Fill
# text gaps with 'Unknown' and numeric gaps with 0 so X keeps one row per attack.
design_source = df[numerical + categorical].copy()
text_cols = design_source.select_dtypes(exclude='number').columns
design_source = design_source.fillna({col: 'Unknown' for col in text_cols}).fillna(0)

X = patsy.dmatrix(formula, data = design_source, return_type = 'dataframe')
assert len(X) == len(df), 'design matrix lost rows; check for unhandled missing values'

In [None]:
# Feature matrix and target must have the same number of rows.
# (print() call: the statement form is a syntax error on Python 3.)
print(X.shape, y.shape)

In [None]:
# Peek at the first two rows of the design matrix.
X.head(2)   

In [None]:
# Fit a PCA that keeps as many components as there are feature columns.
n_feature_columns = X.shape[1]
pca_model = PCA(n_components = n_feature_columns)
pca = pca_model.fit(X)

In [None]:
# Running total of the explained-variance ratios of the fitted PCA components.
var_ratio = pca.explained_variance_ratio_
var_ratio = np.cumsum(var_ratio)
# NOTE(review): plot_cumsum_variance is not defined in the visible part of this
# notebook; confirm a helper cell or module provides it before a full re-run.
plot_cumsum_variance(var_ratio)

In [None]:
X_columns = list(X.columns)  # feature names, in design-matrix column order

# Score every feature against y with the chi2 statistic. k only controls how
# many columns transform() would keep; scores_ holds a score for every feature.
skb_chi2 = SelectKBest(chi2, k=20)
skb_chi2.fit(X, y)

# examine results
# Build the table column-wise so 'chi2 score' is a numeric column (the former
# transposed list-of-lists stored the scores with object dtype).
chi2_scores = pd.DataFrame({'feature': X_columns, 'chi2 score': skb_chi2.scores_})
top_15_chi2 = chi2_scores.sort_values('chi2 score', ascending=False)[:15]
top_15_chi2

In [None]:
# Horizontal bars of the 15 highest chi2 scores.
fig, ax = plt.subplots(figsize=(13, 6))
sns.barplot(x = top_15_chi2['chi2 score'], y = top_15_chi2.feature, palette= 'viridis', ax = ax)
plt.show()

In [None]:
# Hold out 20% of the attacks for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 102)

# (print() call: the statement form is a syntax error on Python 3.)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

Logistic Regression

In [None]:
# y_train / y_test must remain the casualty-class labels produced by
# train_test_split above. They are deliberately NOT re-derived from y_temp here:
# y_temp holds the `claimed` imputation target, whose rows do not line up with
# X_train / X_test, and its "test" part consists only of missing values.
assert len(X_train) == len(y_train), 'X_train and y_train are misaligned'
assert len(X_test) == len(y_test), 'X_test and y_test are misaligned'

In [None]:
# Shapes of the current split; each X/y pair must have the same number of rows.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
# Logistic regression with default hyper-parameters, fitted on the current training split.
lr = LogisticRegression(random_state = 42)
lr = lr.fit(X_train, y_train)

In [None]:
# Predictions for the held-out rows, indexed like X_test.
predicted_labels = lr.predict(X_test)
predictions = pd.Series(predicted_labels, index = X_test.index)

In [None]:
# Fill any still-missing `claimed` values from `predictions` (matched on index).
# Assigned back to the column because fillna(..., inplace=True) through attribute
# access may act on a temporary Series and leave df unchanged.
# NOTE(review): `predictions` comes from the `lr` fitted in the preceding cells;
# confirm that model predicts `claimed` before relying on this imputation.
df['claimed'] = df['claimed'].fillna(predictions)

In [None]:
# penalty='l1' is only supported by the 'liblinear' and 'saga' solvers. With
# scikit-learn's current default solver (lbfgs) every fit raises, and a grid
# search run with error_score=0 would report those failures as zero scores.
# liblinear does not use n_jobs, so parallelism is left to the grid search.
lr = LogisticRegression(random_state = 56, penalty = 'l1', solver = 'liblinear')

# Candidate values for the inverse regularization strength C.
lr_params = {
    'C': np.linspace(0.001, 1, 20),
}

In [None]:
# 5-fold grid search over lr_params, ranked by recall; a failing fit scores 0.
lr_search_options = dict(scoring = 'recall', cv = 5, n_jobs = -1, error_score = 0)
lr_grid = GridSearchCV(lr, lr_params, **lr_search_options)

In [None]:
# Run the logistic-regression grid search on the training split.
lr_grid.fit(X_train, y_train)

In [None]:
lr_best_estimator = lr_grid.best_estimator_

# print() calls: the statement form is a syntax error on Python 3.
print('best estimator: \n', lr_grid.best_estimator_)

# lr_grid is built with scoring='recall', so .score() reports recall, not accuracy.
print('\nrecall_score: \n', lr_grid.score(X_test, y_test))

print('\nbest_params: \n', lr_grid.best_params_)

In [None]:
# Cross-validation results of the grid search as a table, ordered by the tried C value.
lr_cv_table = pd.DataFrame(lr_grid.cv_results_)
lr_results = lr_cv_table.sort_values(by = 'param_C')

In [None]:
# First three rows of the results table (the three smallest C values).
lr_results.head(3)

In [None]:
# Mean cross-validated score as a function of C; the trailing ';' hides the Axes repr.
lr_results.plot(x ='param_C', y = 'mean_test_score');

In [None]:
# Mean 10-fold recall of the best logistic-regression configuration.
# NOTE(review): cross_val_score clones the estimator and refits it on folds of
# the data it receives, so this measures models trained on the held-out split,
# not the model fitted by the grid search. Confirm that is what is intended.
lr_score = cross_val_score(lr_grid.best_estimator_, X_test, y_test, cv = 10, scoring = 'recall').mean()
lr_score

Random Forest Algorithm

In [None]:
# Random forest with 300 trees; the seed keeps the fitted forest reproducible.
rf = RandomForestClassifier(random_state = 56, n_jobs = -1, n_estimators= 300)

rf_params = {

    'criterion': ['gini','entropy'],
    # 'auto' meant the same as 'sqrt' for classifiers, so the former grid fitted
    # every combination twice, and recent scikit-learn releases reject 'auto'.
    # 'log2' gives the search a genuinely different second option.
    'max_features' : ['sqrt', 'log2'],
}

In [None]:
# 5-fold grid search over rf_params, ranked by recall; a failing fit scores 0.
rf_search_options = dict(scoring = 'recall', cv = 5, n_jobs = -1, error_score = 0)
rf_grid = GridSearchCV(rf, rf_params, **rf_search_options)

In [None]:
# Run the random-forest grid search on the training split.
rf_grid.fit(X_train, y_train)

In [None]:
rf_best_estimator = rf_grid.best_estimator_

# print() calls: the statement form is a syntax error on Python 3.
print(rf_grid.best_estimator_)
print()
# rf_grid is built with scoring='recall', so this is the recall on the held-out split.
print(rf_grid.score(X_test, y_test))
print()
print(rf_grid.best_params_)

In [None]:
# Cross-validation results of the grid search as a table, best-ranked configuration first.
rf_cv_table = pd.DataFrame(rf_grid.cv_results_)
rf_results = rf_cv_table.sort_values(by = 'rank_test_score')

In [None]:
# The three best-ranked random-forest configurations.
rf_results.head(3)  

In [None]:
# Persist the results table and the best estimator with IPython's %store magic
# so they can be brought back in a later session with `%store -r`.
%store rf_results
%store rf_best_estimator

In [None]:
# Mean 10-fold recall of the best random-forest configuration.
# NOTE(review): cross_val_score clones the estimator and refits it on folds of
# the data it receives, so this measures models trained on the held-out split,
# not the model fitted by the grid search. Confirm that is what is intended.
rf_score = cross_val_score(rf_grid.best_estimator_, X_test, y_test, cv = 10, scoring = 'recall', n_jobs = -1).mean()
rf_score

In [None]:
# Reload rf_best_estimator from IPython's %store database into the namespace.
%store -r rf_best_estimator

Support Vector Classifier

In [None]:
 # L1-penalized linear SVM, solved in the primal form (dual = False).
 svm = LinearSVC(random_state = 56, penalty = 'l1', dual = False)

 # Candidate values for the regularization parameter C. This statement sits at
 # the same indentation as the one above; the former deeper indent made the
 # cell fail with an IndentationError.
 svm_params = {

     'C': np.linspace(0.001, 10, 15),

 }

In [None]:
# 5-fold grid search over svm_params, ranked by recall; a failing fit scores 0.
svm_search_options = dict(cv = 5, scoring = 'recall', n_jobs = -1, error_score = 0)
svm_grid = GridSearchCV(svm, svm_params, **svm_search_options)

In [None]:
# Silence warnings only while the grid search runs. catch_warnings() restores
# the previous filter state on exit, so the suppression cannot leak into later
# cells if this one is interrupted or a following cell is skipped.
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    svm_grid.fit(X_train, y_train)

In [None]:
# Switch warning reporting back on for the rest of the notebook.
warnings.filterwarnings('default')

In [None]:
svm_best_estimator = svm_grid.best_estimator_
%store svm_best_estimator
print svm_grid.best_estimator_
print
print svm_grid.score(X_test, y_test)

print svm_grid.best_params_

In [None]:
# Cross-validation results of the grid search as a table, ordered by the tried C value.
svm_cv_table = pd.DataFrame(svm_grid.cv_results_)
svm_results = svm_cv_table.sort_values(by = 'param_C')

In [None]:
# First three rows of the results table (the three smallest C values).
svm_results.head(3) 

In [None]:
# Mean cross-validated score as a function of C; the trailing ';' hides the Axes repr.
svm_results.plot(x = 'param_C', y = 'mean_test_score');

In [None]:
# Mean 10-fold recall of the best linear-SVM configuration.
# NOTE(review): cross_val_score clones the estimator and refits it on folds of
# the data it receives, so this measures models trained on the held-out split,
# not the model fitted by the grid search. Confirm that is what is intended.
svm_score = cross_val_score(svm_grid.best_estimator_, X_test, y_test, scoring = 'recall', cv = 10, n_jobs = -1).mean()
svm_score

In [None]:
# Persist the SVM score and results table with IPython's %store magic so they
# can be brought back in a later session with `%store -r`.
%store svm_score
%store svm_results