In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
from mpl_toolkits.basemap import Basemap

import plotly.plotly as py
import plotly.graph_objs as go

import seaborn as sns
sns.set_style('whitegrid')

from subprocess import check_output
# Print the directory listing of ../input (the output of `ls`, decoded as UTF-8)
print(check_output(["ls", "../input"]).decode("utf8"))

In [None]:
# GLT_City = pd.read_csv('../input/climate-change-earth-surface-temperature-data/GlobalLandTemperaturesByCity.csv')
# GLT_City.info()
# GLT_City.tail()

In [None]:
# import datetime
# GLT_City['dt'] = pd.to_datetime(GLT_City['dt'])

In [None]:
# GLT_City['year'], GLT_City['month'] = GLT_City['dt'].dt.year, GLT_City['dt'].dt.month
# GLT_City.head()

In [None]:
# sns.jointplot(x='month', y='AverageTemperature', data = GLT_City[(GLT_City['Country'] == 'Netherlands') | (GLT_City['year'] >= '2013')])

## Read and clean data

In [None]:
# Terrorism database: only the listed column positions are read from the CSV
gtd_column_positions = [0, 1, 2, 3, 7, 8, 9, 10, 11, 12, 13, 14, 15, 19, 20, 21, 26, 28, 29,
                        34, 35, 36, 58, 68, 69, 81, 98, 101, 104, 134]
t_data = pd.read_csv('../input/gtd/globalterrorismdb_0617dist.csv',
                     encoding='ISO-8859-1',
                     usecols=gtd_column_positions)

# Missing casualty counts become 0 so the columns can be stored as integers
for casualty_column in ('nkill', 'nwound'):
    t_data[casualty_column] = t_data[casualty_column].fillna(0).astype(int)

# Victims = wounded + killed
t_data['victims'] = t_data['nwound'] + t_data['nkill']

# Population density database; the first three lines of the file are skipped and
# the 'Country Name' column becomes the row index
popdens = pd.read_csv('../input/world-population/API_EN.POP.DNST_DS2_en_csv_v2.csv',
                      skiprows=[0, 1, 2],
                      index_col='Country Name')

# Drop the columns at positions 0-3 and 59-60, then transpose so that countries
# become columns (presumably leaving one row per year -- TODO confirm)
dropped_positions = [0, 1, 2, 3, 59, 60]
popdensT = popdens.drop(popdens.columns[dropped_positions], axis=1).T

In [None]:
# Show the column labels of t_data (the columns selected via usecols plus 'victims')
t_data.columns

### RESULT
The data mainly consists of categorical data, so the data analysis will be concerned with counting occurrences of the categorical variables. There are, however, two non-categorical integer variables, namely 'nkill' and 'nwound', which play an important role in the predictions. The columns included in the analysis point to the location of the attacks as well as the goal and means of the attacks. Finally, the result is evaluated by the 'success' variable and the 'nkill' and 'nwound' variables.

# ATTACKS OVER TIME
## Total number of attacks over time

In [None]:
# Region names keyed by the numeric region code (1-12); the names are used as
# legend labels and subplot titles.
region_dictionary = {1: 'North America', 2: 'Central America & Caribbean', 3: 'South America',
                     4: 'East Asia', 5: 'Southeast Asia', 6: 'South Asia', 7: 'Central Asia',
                     8: 'Western Europe', 9: 'Eastern Europe', 10: 'Middle East and North Africa',
                     11: 'Sub-Saharan Africa', 12: 'Australasia and Oceania'}

In [None]:
def multi_graph(result,result_list, xmin, xmax, ymin, ymax):
    """Draw one line per region showing the yearly attack count.

    Parameters
    ----------
    result : str
        Title of the figure.
    result_list : iterable of DataFrame
        One frame per region, indexed by year, with an ``eventid`` column
        holding the count. The n-th frame is labelled with
        ``region_dictionary[n]`` (counting from 1), so the frames must be
        in region-code order.
    xmin, xmax : number
        Limits of the x axis (years).
    ymin, ymax : number
        Limits of the y axis (attack counts).
    """
    fig2, ax2 = plt.subplots(figsize=(15, 8))

    for region_code, yearly_counts in enumerate(result_list, start=1):
        region_label = '%s ' % region_dictionary[region_code]
        ax2.plot(yearly_counts.index, yearly_counts.eventid, label=region_label)

    ax2.set_xlim([xmin, xmax])
    ax2.set_ylim([ymin, ymax])
    ax2.set_xlabel('year')
    ax2.set_ylabel('number of attacks')
    ax2.set_title(result)
    # Anchor the legend to the right of the axes area
    ax2.legend(loc='center', frameon=True, edgecolor='black', bbox_to_anchor=(1.2, 0.4))


# Per-region yearly counts, split by outcome; filled in region-code order so that
# multi_graph can label the n-th entry with region n
success_list = []
failure_list = []

for region_code in region_dictionary:
    attacks_in_region = t_data[t_data.region == region_code]
    for success_flag, target_list in ((1, success_list), (0, failure_list)):
        with_outcome = attacks_in_region[attacks_in_region.success == success_flag]
        target_list.append(with_outcome.groupby('iyear').count())

# (title, data, first year, last year, upper y limit); the two periods are plotted separately
plot_specs = (
    ('Successes', success_list, 1970, 2011, 2100),
    ('Successes', success_list, 2012, 2016, 6500),
    ('Failures', failure_list, 1970, 2011, 200),
    ('Failures', failure_list, 2012, 2016, 1300),
)
for plot_title, per_region_counts, first_year, last_year, y_upper in plot_specs:
    multi_graph(plot_title, per_region_counts, first_year, last_year, 0, y_upper)

### Note
The timespan has been divided into two periods for the following reason, quoting Michael Jensen (START, November 25, 2013): "While there is no simple answer to this question, what is certain is that by the start of the 2012 collection effort, the staff working on the GTD had become better than ever at identifying
terrorist attacks, regardless of where they happened to occur."

### RESULT
Immediately noticeable is the drop in both successful and failed attacks in 1998. This is a phenomenon shared by all regions
and should be investigated more closely. During the last 5 years (2011-2016) there is no clear increase in attacks except for North America and, to some extent, also South Asia. The strong increase of failed attacks in North America could be due to the sharpened measures taken after 9/11 2001. At the same time the number of successful attacks increased too, and it has been declining since 2014 for both North America and South Asia.

## Casualties over time

In [None]:
# Show the column labels of t_data for reference before grouping on them
t_data.columns

In [None]:
# Total number of victims (wounded + killed) per year.
# groupby(...)['victims'].sum() returns a Series indexed by 'iyear': the years live
# in the index and the totals in the values, so there are no .iyear / .victims attributes.
data_grouped_victims = t_data.groupby('iyear')['victims'].sum()

trace = go.Scatter(x = data_grouped_victims.index, y = data_grouped_victims.values)

layout = go.Layout(title = 'Victims per Year')

py.iplot(dict(data = [trace], layout = layout))

## Attacks per region successful and unsuccessful

In [None]:
def generate_graph(by_region_list):
    """Plot successful and failed attacks per year, one subplot per region.

    The grid is two columns wide with exactly as many rows as needed, and the
    figure height grows with the row count so each subplot keeps a readable height.

    Parameters
    ----------
    by_region_list : list
        One entry per region: ``[success_counts, failure_counts, region_number]``.
        Both count frames are indexed by year and have an ``eventid`` column;
        ``region_number`` is a key of ``region_dictionary``.
    """
    if not by_region_list:
        return  # nothing to draw; also avoids using an axes that was never created

    n_cols = 2
    n_rows = -(-len(by_region_list) // n_cols)  # ceiling division
    fig = plt.figure(figsize=(15, 4 * n_rows))

    for position, element in enumerate(by_region_list, start=1):
        ax1 = fig.add_subplot(n_rows, n_cols, position)
        ax1.set(title = '#Attacks region %s ' % region_dictionary[element[2]],
                ylabel = 'Attack count', xlabel = 'year')

        # entering data
        ax1.plot(element[0].index, element[0].eventid, label = 'Successful attacks')
        ax1.plot(element[1].index, element[1].eventid, label = 'Failed attacks')

    # One shared legend, attached below the last subplot
    ax1.legend(loc = 'upper center', frameon = True, edgecolor = 'black', bbox_to_anchor =(-0.1,-0.4))
    # Keep titles and axis labels of neighbouring subplots from overlapping
    fig.tight_layout()
    plt.show()


def by_region():
    """Collect yearly success/failure counts for every region and plot them.

    For each key of ``region_dictionary`` an entry
    ``[success_counts, failure_counts, region_number]`` is appended to the
    module-level ``by_region_list``; the list is then passed to ``generate_graph``.
    """
    for region_number in region_dictionary:
        attacks_in_region = t_data[t_data.region == region_number]

        # Split on outcome, then count rows per year
        succeeded = attacks_in_region[attacks_in_region.success == 1]
        failed = attacks_in_region[attacks_in_region.success == 0]
        success_per_year = succeeded.groupby('iyear').count()
        failure_per_year = failed.groupby('iyear').count()

        by_region_list.append([success_per_year, failure_per_year, region_number])

    # Line plots per region, grouped by year
    generate_graph(by_region_list)


# Start from an empty list on every run of this cell, because by_region() appends to it
by_region_list = []
by_region()

### RESULT
The graphs show no clear trend through time; the lines are erratic. There are some regions, such as parts of Asia and Africa,
that display a strong increase from about 2005. The question is whether this is a consequence of better documentation and
communication or whether the number of attacks has indeed increased.

## Attacks per location: visual representation

In [None]:
# One tuple per tier: (marker size, minimum attack count, colour, legend label)
orange_palette = (
    (3, 0, '#FBBC00', '1 - 20'),
    (4, 20, '#FDA600', '21 - 50'),
    (5, 50, '#EE8904', '51 - 100'),
    (7, 100, '#ED9001', '101 - 250'),
    (9, 250, '#ED6210', '251 - 600'),
    (11, 600, '#DE6D0A', '601 - 1000'),
    (13, 1000, '#D8510F', '1001 - 2000'),
    (15, 2000, '#D23711', '2001 - 4000'),
    (18, 4000, '#F61119', '4001 - 7500'),
    (30, 7500, '#9C200A', '7501 - ∞'),
)

plt.figure(figsize=(15, 15))

# Round longitude and latitude to whole numbers, then count the attacks that share
# each rounded (longitude, latitude) pair
rounded_coords = t_data.round({'longitude': 0, 'latitude': 0})
df_coords = (
    rounded_coords
    .groupby(["longitude", "latitude"])
    .size()
    .to_frame(name='count')
    .reset_index()
)

# Miller-projection world map with coastlines and a shaded-relief background
m = Basemap(projection='mill',
            llcrnrlat=-80, urcrnrlat=80,
            llcrnrlon=-180, urcrnrlon=180,
            lat_ts=20, resolution='c')
m.drawcoastlines()
m.shadedrelief()
    
def plot_points(marker_size, count_size, colour, label_count):
    """Draw every rounded coordinate with at least ``count_size`` attacks on the map.

    Parameters
    ----------
    marker_size : number
        Marker size used for the points.
    count_size : number
        Minimum value of ``df_coords['count']`` for a coordinate to be drawn.
    colour : str
        Marker colour.
    label_count : str
        Legend label for this set of points.
    """
    selected = df_coords[df_coords['count'] >= count_size]
    longitudes = list(selected.longitude.astype("float"))
    latitudes = list(selected.latitude.astype("float"))

    # Calling the Basemap instance converts lon/lat into map projection coordinates
    x, y = m(longitudes, latitudes)
    m.plot(x, y, "o", markersize=marker_size, color=colour, label=label_count, alpha=.5)

# Draw the tiers in palette order, so tiers with a higher threshold are drawn later (on top)
for tier in orange_palette:
    plot_points(*tier)

plt.title("Amount of terrorist attacks per rounded coordinates", fontsize=24)
plt.legend(title='Colour per counted attack', loc='lower left', prop={'size': 11})
plt.show()

### Note
The heatmap is created by rounding coordinates.

In [None]:
# Hexbin joint plot of the rounded coordinates.
# NOTE(review): `size` and `stat_func` are arguments of older seaborn releases
# (later versions use `height` and drop `stat_func`) -- confirm against the installed version.
sns.jointplot(
    x='longitude',
    y='latitude',
    data=df_coords,
    kind="hex",
    color="#4CB391",
    size=15,
    stat_func=None,
    edgecolor="#EAEAF2",
    linewidth=.2,
)
plt.title('Amount of terrorist attacks per rounded coordinates')

### RESULT
The images above show where the documented attacks are concentrated. The world map gives a clearer view of where in the world there are more attacks; the hexagonal-bin representation substantiates that and adds marginal histograms for both longitude and latitude to infer attack intensity.

## Attacks over time versus population density
Is the population density of a certain area a determining factor for the intensity of terrorist attacks?

In [None]:
# Group countries by region as defined in the Terrorism database.
# The order of `regions` at the bottom follows the numeric region codes 1-12.
# NOTE(review): entries such as 'Antigua'/'Barbuda', 'St. Kitts'/'St. Nevis' and
# 'Trinidad'/'Tobago' are listed as separate names -- verify them against the
# country names actually used in the datasets before matching on them.
North_America = ('Canada','Mexico',"United States")
Central_America_Caribbean = ('Antigua', 'Barbuda', 'Bahamas', 'Barbados', 'Belize', 'Cayman Islands', 'Costa Rica',
                             'Cuba', 'Dominica', 'Dominican Republic', 'El Salvador', 'Grenada', 'Guadeloupe', 'Guatemala',
                             'Haiti', 'Honduras', 'Jamaica', 'Martinique', 'Nicaragua', 'Panama', 'St. Kitts', 'St. Nevis',
                             'St. Lucia', 'Trinidad', 'Tobago')
South_America = ('Argentina', 'Bolivia', 'Brazil', 'Chile', 'Colombia', 'Ecuador', 'Falkland Islands', 'French Guiana', 'Guyana',
                 'Paraguay', 'Peru', 'Suriname', 'Uruguay', 'Venezuela')
East_Asia = ('China', 'Hong Kong', 'Japan', 'Macau', 'North Korea', 'South Korea', 'Taiwan')
Southeast_Asia = ('Brunei', 'Cambodia', 'East Timor', 'Indonesia', 'Laos', 'Malaysia', 'Myanmar', 'Philippines', 'Singapore',
                  'South Vietnam', 'Thailand', 'Vietnam')
South_Asia = ('Afghanistan', 'Bangladesh', 'Bhutan', 'India', 'Maldives', 'Mauritius', 'Nepal', 'Pakistan', 'Sri Lanka')
Central_Asia = ('Armenia', 'Azerbaijan', 'Georgia', 'Kazakhstan', 'Kyrgyzstan', 'Tajikistan', 'Turkmenistan', 'Uzbekistan')
Western_Europe = ('Andorra', 'Austria', 'Belgium', 'Cyprus', 'Denmark', 'Finland', 'France', 'Germany', 'Gibraltar', 'Greece',
                  'Iceland', 'Ireland', 'Italy', 'Luxembourg', 'Malta', 'Netherlands', 'Norway', 'Portugal', 'Spain', 'Sweden',
                  'Switzerland', 'United Kingdom', 'Vatican City', 'West Germany (FRG)')
Eastern_Europe = ('Albania', 'Belarus', 'Bosnia-Herzegovina', 'Bulgaria', 'Croatia', 'Czech Republic', 'Czechoslovakia',
                  'East Germany (GDR)', 'Estonia', 'Hungary', 'Kosovo', 'Latvia', 'Lithuania', 'Macedonia', 'Moldova',
                  'Montenegro', 'Poland', 'Romania', 'Russia', 'Serbia', 'Serbia-Montenegro', 'Slovak Republic', 'Slovenia',
                  'Soviet Union', 'Ukraine', 'Yugoslavia')
# Backward-compatible alias for the original (misspelled) variable name
Easern_Europe = Eastern_Europe
Middle_East_North_Africa = ('Algeria', 'Bahrain', 'Egypt', 'Iran', 'Iraq', 'Israel', 'Jordan', 'Kuwait', 'Lebanon', 'Libya',
                            'Morocco', 'North Yemen', 'Qatar', 'Saudi Arabia', 'South Yemen', 'Syria', 'Tunisia', 'Turkey',
                            'United Arab Emirates', 'West Bank and Gaza Strip', 'Western Sahara', 'Yemen')
# 'Central African Republic' must be ONE string literal: two adjacent literals
# ('Central' 'African Republic') are concatenated without a space by Python.
Sub_Saharan_Africa = ('Angola', 'Benin', 'Botswana', 'Burkina Faso', 'Burundi', 'Cameroon', 'Central African Republic', 'Chad',
                      'Comoros', 'Democratic Republic of the Congo', 'Djibouti', 'Equatorial Guinea', 'Eritrea', 'Ethiopia', 'Gabon',
                      'Gambia', 'Ghana', 'Guinea', 'Guinea-Bissau', 'Ivory Coast', 'Kenya', 'Lesotho', 'Liberia', 'Madagascar', 'Malawi',
                      'Mali', 'Mauritania', 'Mozambique', 'Namibia', 'Niger', 'Nigeria', 'Republic of the Congo', 'Rhodesia', 'Rwanda',
                      'Senegal', 'Seychelles', 'Sierra Leone', 'Somalia', 'South Africa', 'South Sudan', 'Sudan', 'Swaziland', 'Tanzania',
                      'Togo', 'Uganda', 'Zaire', 'Zambia', 'Zimbabwe')
Australasia_Oceania = ('Australia', 'Fiji', 'French Polynesia', 'New Caledonia', 'New Hebrides', 'New Zealand', 'Papua New Guinea',
                       'Solomon Islands', 'Vanuatu', 'Wallis and Futuna')

regions = [North_America,Central_America_Caribbean,South_America,East_Asia,Southeast_Asia,South_Asia,
           Central_Asia,Western_Europe,Eastern_Europe,Middle_East_North_Africa,Sub_Saharan_Africa,
           Australasia_Oceania]

In [None]:
#? merging on country per year of lookup
# 55 countries, 1960 - 2016
# TODO: delete NaN, and what is the 'World' entry?

# Below: add the region number to each country

# Work on a copy so that adding the 'Region' column does not alter popdens itself
popdensTry = popdens.copy()

def check_region():
    """Add a 'Region' column to ``popdensTry`` holding each country's region number.

    The region number of a country is the 1-based position, within ``regions``,
    of the tuple that lists it. Rows whose name appears in no tuple get NaN.

    NOTE(review): the row labels come from the population-density file, so their
    spelling may differ from the names in ``regions`` -- verify the unmatched rows.
    """
    country_to_region = {}
    for region_number, countries in enumerate(regions, start=1):
        for country in countries:
            country_to_region[country] = region_number

    # 'Country Name' is the index of the frame (set via index_col when the CSV was
    # read), not a column, so the lookup is done on the index
    popdensTry['Region'] = popdensTry.index.to_series().map(country_to_region).values

    # TODO: then group by region and sum population

In [None]:
#popdens[popdens.columns[0]]
# Display the transposed population-density table (countries as columns)
popdensT

In [None]:
def plot_graph(country='Mexico', attack_type='Bombing/Explosion'):
    """Plot a country's population density and its yearly count of one attack type.

    Two plots are produced: the ``popdensT`` column of ``country``, and the number
    of ``t_data`` rows per year for that country and attack type, so the two
    series can be compared.

    Parameters
    ----------
    country : str
        Country name; must be a column of ``popdensT`` and is matched against
        ``t_data.country_txt``.
    attack_type : str
        Value of ``t_data.attacktype1_txt`` to keep.
    """
    popdensT[country].plot(legend=True)

    # Keep only the requested attack type in the requested country
    country_specific_attacks = t_data[(t_data.country_txt == country)
                                      & (t_data.attacktype1_txt == attack_type)]

    # Number of matching attacks per year
    plot_spec_att = (country_specific_attacks
                     .groupby(['iyear', 'country_txt'])
                     .size()
                     .reset_index(name="Count"))
    plot_spec_att.groupby('country_txt').plot(x='iyear', y='Count', legend=False)


# Example: bombings in Mexico
plot_graph()

# Attacks per unit time
## Per month
This graph was plotted to find an answer to the question: is there a correlation between timing and terrorism? It shows the number of terrorist attacks per day of the month and makes clear that the day of the month does not really matter: the number of terrorist attacks stays roughly the same. For the 31st day of the month the number of attacks is roughly halved, but that day also occurs in only about half of the months.

In [None]:
# Keep only rows whose day and month are both non-zero
# (presumably 0 marks an unknown day/month -- TODO confirm against the codebook)
has_day_and_month = (t_data['iday'] != 0) & (t_data['imonth'] != 0)
day_columns = ['imonth', 'iday', 'longitude', 'latitude', 'success']
df_day_coords = t_data.loc[has_day_and_month, day_columns].copy()

fig, ax = plt.subplots(figsize=(14, 10))
day_palette = sns.cubehelix_palette(15, start=.3, rot=.3)
sns.countplot(x="iday", data=df_day_coords, ax=ax, palette=day_palette)
ax.set_xlabel('Day of the month')
ax.set_ylabel('Amount of terrorist attacks')

## Per day

In [None]:
# One row of axes per calendar month, stacked vertically in a tall figure
fig, axs = plt.subplots(nrows=12)
fig.set_size_inches(15, 100, forward=True)

for month_number, month_ax in enumerate(axs, start=1):
    monthly_data = df_day_coords[df_day_coords['imonth'] == month_number]
    # Attacks per day of this month, split by the 'success' flag
    sns.countplot(x="iday", data=monthly_data, hue="success", ax=month_ax)
    month_ax.set_xlabel('Day of the month')
    month_ax.set_ylabel('Amount of terrorist attacks')

In [None]:
# Attack count per month, split by the 'success' flag.
# NOTE(review): `factorplot` and `size` belong to older seaborn releases
# (later versions use `catplot` and `height`) -- confirm against the installed version.
succes_month = sns.factorplot(
    x="imonth",
    hue="success",
    kind="count",
    data=df_day_coords,
    size=10,
    palette="muted",
)

# General Descriptives
## Choice of attacktype

In [None]:
# Horizontal bar chart: number of attacks per primary attack type
ax = plt.figure(figsize=(9, 7)).add_subplot(111)
sns.countplot(y="attacktype1_txt", data=t_data, ax=ax)
ax.set_xlabel("Amount of type")
ax.set_ylabel("Attack type")

## d

## Merging an external database
The main question of the research is: __How does timing correlate to the execution of a terrorist attack?__
Under timing a broad range of variables is understood, examples being time of day, climate, tourism and weather.

A weather database was therefore downloaded and merged to attempt to find connections between weather conditions and terrorist attacks. It contains daily snapshots (taken at 12:00 PM) from January 1979 to July 2017 of the following weather conditions all over the world:

* 2 metre temperature
* Total cloud cover
* Vertical integral of divergence of geopotential
* Surface pressure
* 10 metre V wind component

Each row in the terrorist dataset contains a date (*iyear, imonth, iday*) and coordinates (*lat, long*). These variables were used to index the 3-dimensional weather data, and the resulting values were appended to the corresponding row in the terrorist dataset. In the end, for a given terrorist attack, the weather at that approximate place could be retrieved. Do note that the time at which an attack was executed, is not recorded in the database. The actual weather at that time might therefore deviate from the recorded weather (at 12:00 PM). Also, as mentioned earlier, the weather location is an approximation of the precise attack location, as the weather coordinates follow steps of 0.75, whereas the terrorist locations are exact.

The script used to merge the databases can unfortunately not be executed on Kaggle. It depends on a library called *NetCDF4* to put the weather data in a workable format. The script used can however be found via this link: https://github.com/DaanSterk/merge_terrorism_weather/blob/master/merge.py.

In [None]:
# The full weather NetCDF was too large to merge. Therefore, only the last five years were merged.

#db = t_set = pd.read_csv('terrorist_weather_jan2012_jul2017.csv', encoding='ISO-8859-1')

#db.tail()

In [None]:
# this will remove warnings messages
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.ensemble import ExtraTreesClassifier

In [None]:
# NOTE(review): X and y are not defined in the part of the notebook visible here, so
# this cell raises NameError on a fresh kernel -- TODO define or load them first.
# Scatter of the first two columns of X, coloured by y, with the axis ticks hidden.
plt.figure(figsize=(20,10));
plt.scatter(X[:,0],X[:,1],c=y,s=200)
plt.title("Linearly Separated Data");
plt.xticks([]);
plt.yticks([]);