## Notebook Imports

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import opendatasets as od
import os

from mpl_toolkits import basemap

from sklearn.preprocessing import LabelEncoder

from warnings import filterwarnings
filterwarnings("ignore")

#### Loading Dataset

In [2]:
# Kaggle dataset page that hosts the terrorism CSV analysed in this notebook.
url = 'https://www.kaggle.com/datasets/sandeepkumar69/terrorism-data'

# Fetch the dataset; as the captured output shows, the download is skipped
# when the files are already present unless force=True is passed.
od.download(url)

Skipping, found downloaded files in ".\terrorism-data" (use force=True to force download)


In [3]:
# Folder the dataset was downloaded into.
# NOTE(review): `dir` shadows Python's built-in dir(); the name is kept
# because the next cell builds the CSV path from it.
dir = './terrorism-data/'

# Show which files the download produced.
os.listdir(dir)

['globalterrorismdb_0718dist.csv']

In [4]:
# Full path of the CSV inside the download folder.
path = os.path.join(dir, 'globalterrorismdb_0718dist.csv')

In [5]:
# The file is read as latin-1 text.
df = pd.read_csv(filepath_or_buffer=path, encoding='latin-1')

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,eventid,iyear,imonth,iday,approxdate,extended,resolution,country,country_txt,region,...,addnotes,scite1,scite2,scite3,dbsource,INT_LOG,INT_IDEO,INT_MISC,INT_ANY,related
0,197000000001,1970,7,2,,0,,58,Dominican Republic,2,...,,,,,PGIS,0,0,0,0,
1,197000000002,1970,0,0,,0,,130,Mexico,1,...,,,,,PGIS,0,1,1,1,
2,197001000001,1970,1,0,,0,,160,Philippines,5,...,,,,,PGIS,-9,-9,1,1,
3,197001000002,1970,1,0,,0,,78,Greece,8,...,,,,,PGIS,-9,-9,1,1,
4,197001000003,1970,1,0,,0,,101,Japan,4,...,,,,,PGIS,-9,-9,1,1,


In [6]:
# Summarise the frame: row count, column count, dtype breakdown and memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181691 entries, 0 to 181690
Columns: 135 entries, eventid to related
dtypes: float64(55), int64(22), object(58)
memory usage: 187.1+ MB


In [7]:
# Descriptive statistics for the numeric columns.
# NOTE(review): the output shows a longitude minimum of about -8.6e7, far
# outside the valid [-180, 180] range, so at least one coordinate is corrupt.
df.describe()

Unnamed: 0,eventid,iyear,imonth,iday,extended,country,region,latitude,longitude,specificity,...,ransomamt,ransomamtus,ransompaid,ransompaidus,hostkidoutcome,nreleased,INT_LOG,INT_IDEO,INT_MISC,INT_ANY
count,181691.0,181691.0,181691.0,181691.0,181691.0,181691.0,181691.0,177135.0,177134.0,181685.0,...,1350.0,563.0,774.0,552.0,10991.0,10400.0,181691.0,181691.0,181691.0,181691.0
mean,200270500000.0,2002.638997,6.467277,15.505644,0.045346,131.968501,7.160938,23.498343,-458.6957,1.451452,...,3172530.0,578486.5,717943.7,240.378623,4.629242,-29.018269,-4.543731,-4.464398,0.09001,-3.945952
std,1325957000.0,13.25943,3.388303,8.814045,0.208063,112.414535,2.933408,18.569242,204779.0,0.99543,...,30211570.0,7077924.0,10143920.0,2940.967293,2.03536,65.720119,4.543547,4.637152,0.568457,4.691325
min,197000000000.0,1970.0,0.0,0.0,0.0,4.0,1.0,-53.154613,-86185900.0,1.0,...,-99.0,-99.0,-99.0,-99.0,1.0,-99.0,-9.0,-9.0,-9.0,-9.0
25%,199102100000.0,1991.0,4.0,8.0,0.0,78.0,5.0,11.510046,4.54564,1.0,...,0.0,0.0,-99.0,0.0,2.0,-99.0,-9.0,-9.0,0.0,-9.0
50%,200902200000.0,2009.0,6.0,15.0,0.0,98.0,6.0,31.467463,43.24651,1.0,...,15000.0,0.0,0.0,0.0,4.0,0.0,-9.0,-9.0,0.0,0.0
75%,201408100000.0,2014.0,9.0,23.0,0.0,160.0,10.0,34.685087,68.71033,1.0,...,400000.0,0.0,1273.412,0.0,7.0,1.0,0.0,0.0,0.0,0.0
max,201712300000.0,2017.0,12.0,31.0,1.0,1004.0,12.0,74.633553,179.3667,5.0,...,1000000000.0,132000000.0,275000000.0,48000.0,7.0,2769.0,1.0,1.0,1.0,1.0


### Preprocessing the dataset to get it into my desired form

Renaming the features in the dataset for more clarity

In [None]:
# Raw column name -> readable name used throughout the rest of the notebook.
readable_names = {
    'iyear': 'Year',
    'imonth': 'Month',
    'iday': 'Day',
    'country_txt': 'Country',
    'region_txt': 'Region',
    'provstate': 'State',
    'city': 'City',
    'alternative_txt': 'Alternative',
    'attacktype1_txt': 'Attack_Type',
    'targtype1_txt': 'Target_Type',
    'gname': 'Group_Name',
    'weaptype1_txt': 'Weapon_Type',
    'nwound': 'Wounded',
    'nkill': 'Killed',
    'target1': 'Target_Name',
}
df = df.rename(columns=readable_names)

Features such as the region number and country number are unnecessary for the EDA; they can be encoded later if we decide to build a model. So we keep only the features needed for the analysis.

In [None]:
# Columns kept for the exploratory analysis.
eda_columns = ['State', 'Region', 'City', 'latitude', 'longitude', 'Country',
               'Attack_Type', 'Year', 'Month', 'Day', 'Killed', 'Wounded', 'Target_Type',
               'Group_Name', 'Target_Name', 'Weapon_Type']

# The explicit .copy() makes `df` an independent frame, so later column
# assignments on it do not raise pandas' SettingWithCopyWarning.
df = df[eda_columns].copy()

Creating a new column, 'Affected', as the sum of 'Killed' and 'Wounded'. The NaN values in those two columns have to be filled first, otherwise the sum would also be NaN.

In [None]:
# Treat missing casualty counts as zero and store them as integers.
for casualty_col in ('Wounded', 'Killed'):
    df[casualty_col] = df[casualty_col].fillna(0).astype(int)

# Total number of people affected by each attack.
df['Affected'] = df['Killed'] + df['Wounded']
df.head()

In [None]:
# Independent copy for the label encoding below, so `df` keeps its readable
# text labels for the plots.
df_encoded = df.copy()

In [None]:
enc = LabelEncoder()

# Replace each text column with integer codes; the shared encoder is re-fitted
# for every column, so codes are only meaningful within a column.
categorical_columns = ['State', 'Region', 'City', 'Country', 'Attack_Type',
                       'Target_Type', 'Group_Name', 'Target_Name', 'Weapon_Type']
for column in categorical_columns:
    df_encoded[column] = enc.fit_transform(df_encoded[column])

#### Correlation Heatmap

In [None]:
# Compute the pairwise correlations once and reuse them for both the mask and
# the heatmap (the matrix was previously computed twice on the full frame).
corr = df_encoded.corr().round(2)

plt.figure(figsize=(20, 15))
# Hide the upper triangle (including the diagonal): it mirrors the lower one.
mask = np.triu(np.ones_like(corr, dtype=bool))
heatmap = sns.heatmap(corr, mask=mask, vmin=-1, vmax=1, annot=True, cmap=sns.color_palette("Spectral", as_cmap=True))
heatmap.set_title('Heatmap', fontdict={'fontsize':10}, pad=15);

Count of null values in the dataset

In [None]:
# Missing-value count for every column.
total_null = df.isna().sum()
total_null

### Visualising the data to gain insights

#### Year

In [None]:
# Number of recorded attacks in each year.

fig, ax = plt.subplots(figsize=(15, 6))
sns.countplot(
    data=df,
    x='Year',
    palette='rocket_r',
    edgecolor=sns.color_palette('rocket', 7),
    ax=ax,
)
plt.xticks(rotation=90)
ax.set_title('Yearwise count of terrorist attacks')
plt.show()

#### Cities

In [None]:
# Cities with the most recorded attacks.
# The top-ranked entry is skipped; presumably it is a placeholder such as
# 'Unknown' rather than a real city. TODO confirm against the data.

city_counts = df['City'].value_counts().iloc[1:16]

fig, ax = plt.subplots(figsize=(15, 6))
sns.barplot(x=city_counts.index, y=city_counts.values, palette='crest', ax=ax)
ax.set_xlabel('Cities')
ax.set_ylabel('Count')
ax.set_title('Most affected cities')
plt.xticks(rotation=90)
plt.show()

#### Regions

In [None]:
# Distinct region names in a stable, alphabetical order.  list(set(...)) gave
# an arbitrary order that can change between kernel restarts, so a region's
# colour and legend position were not reproducible.
regions = sorted(df['Region'].dropna().unique())

# One colour per region, matched to `regions` by position.
colors = ['lime', 'red', 'blue','green', 'purple', 'pink', 'orange', 'brown',
          'yellow','gold', 'black', 'grey']

In [None]:
plt.figure(figsize=(15, 8))

# World map spanning latitudes -80..80 and longitudes -180..180.
m = basemap.Basemap(
    projection='mill',
    llcrnrlat=-80,
    urcrnrlat=80,
    llcrnrlon=-180,
    urcrnrlon=180,
    lat_ts=20,
    resolution='c',
)

# Base layers: coastlines, borders, white land on a light-blue sea.
m.drawcoastlines()
m.drawcountries()
m.fillcontinents(color='white', lake_color='lightblue', zorder=1)
m.drawmapboundary(fill_color='lightblue')

def pltpoints(region, color = None, label = None):
    """Plot the attacks recorded for one region on the module-level map ``m``.

    Reads ``longitude`` and ``latitude`` from the module-level ``df`` for the
    rows whose ``Region`` equals ``region``.  Rows whose coordinates are
    missing or outside the valid geographic range (longitude in [-180, 180],
    latitude in [-90, 90]) are skipped, so corrupt values are never handed to
    the map projection.

    Parameters
    ----------
    region : value compared against ``df.Region`` to select rows.
    color : marker colour forwarded to ``m.plot``.
    label : legend label forwarded to ``m.plot``.

    Returns
    -------
    Whatever ``m.plot`` returns for the plotted markers.
    """
    # Build the region mask once and reuse it for both coordinate columns.
    in_region = df.Region == region
    lon = df.longitude[in_region].astype("float")
    lat = df.latitude[in_region].astype("float")

    # between() is False for NaN, so this drops missing values as well as
    # out-of-range ones.
    plottable = lon.between(-180, 180) & lat.between(-90, 90)

    x, y = m(list(lon[plottable]), list(lat[plottable]))
    points = m.plot(x, y, "o", markersize = 4, color = color, label = label, alpha = .5)
    return points

# One marker layer per region, coloured by the entry at the same position in `colors`.
for position, region_name in enumerate(regions):
    pltpoints(region_name, color=colors[position], label=region_name)

plt.title("Global Terrorism (1970 - 2017)")
plt.legend(loc='lower left', prop={'size': 11})
plt.show()

In [None]:
# Killed, wounded and affected totals per region, most affected first.

temp = df[['Region', 'Killed', 'Wounded', 'Affected']]
number_of_affected = (
    temp.groupby('Region')
        .sum()
        .reset_index()
        .sort_values('Affected', ascending=False)
)
number_of_affected

In [None]:
# Deaths per year, one line per region.

deaths_by_year = pd.pivot_table(
    data=df,
    index=df['Year'],
    columns='Region',
    values='Killed',
    aggfunc='sum',
)
ax = deaths_by_year.plot.line(figsize=(15, 5), colormap='Paired')
ax.legend(title=None)
plt.ylabel('Deaths')
plt.title('Regionwise count of Deaths')

#### Countries

In [None]:
# The 20 most affected countries, ranked by killed plus wounded.

tempTwo = df[['Country', 'Killed', 'Wounded', 'Affected']]
affected_country = (
    tempTwo.groupby('Country')
           .sum()
           .reset_index()
           .sort_values('Affected', ascending=False)
)
affected_country.iloc[:20]

In [None]:
# Yearly number of attacks in the ten countries with the most attacks.

top_ten_countries = df[df['Country'].isin(df['Country'].value_counts()[:10].index)]
attacks_per_year = pd.crosstab(top_ten_countries.Year, top_ten_countries.Country)

# One distinct colour per plotted country; the previous 4-colour palette
# repeated across the ten lines, so several countries shared a colour.
attacks_per_year.plot(color = sns.color_palette('dark', attacks_per_year.shape[1]))
fig = plt.gcf()
fig.set_size_inches(18,6)
plt.ylabel('Count of Terrorism Activities')
plt.title('Countrywise terrorism activities')
plt.show()

In [None]:
# The 15 countries with the most recorded attacks.

country_counts = df['Country'].value_counts().iloc[:15]

fig, ax = plt.subplots(figsize=(15, 6))
sns.barplot(x=country_counts.index, y=country_counts.values, palette='viridis', ax=ax)
ax.set_xlabel('Countries')
ax.set_ylabel('Count')
ax.set_title('Most affected Countries')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Attacks and deaths side by side for the 15 countries with the most attacks.

# Take the ranking from position 0.  The earlier [1:16] slice dropped the
# most-attacked country and disagreed with the other country cells, which
# slice from the start.
count_terror = df['Country'].value_counts()[:15].to_frame()
count_terror.columns = ['Attacks']

# Total deaths per country; the left merge keeps only the 15 ranked countries.
count_kill = df.groupby('Country')['Killed'].sum().to_frame()
count_terror.merge(count_kill, left_index = True, right_index = True, how = 'left').plot.bar(width=0.9)
fig = plt.gcf()
fig.set_size_inches(18,6)
plt.ylabel('Count')
plt.title('Countrywise count of attacks and death')
plt.show()

#### Groups

In [None]:
# Yearly activity of the ten groups ranked 2nd to 11th by attack count.
# The top-ranked label is skipped; presumably it is the 'Unknown'
# placeholder. TODO confirm against the data.

ranked_group_names = df['Group_Name'].value_counts().iloc[1:11].index
top_ten_groups = df[df['Group_Name'].isin(ranked_group_names)]

group_activity = pd.crosstab(top_ten_groups.Year, top_ten_groups.Group_Name)
group_activity.plot(color=sns.color_palette('Paired', 10))
fig = plt.gcf()
fig.set_size_inches(18, 6)
plt.show()

In [None]:
# The 15 most active groups, skipping the top-ranked label.

group_counts = df['Group_Name'].value_counts().iloc[1:16]

sns.barplot(x=group_counts.values, y=group_counts.index, palette='dark')
plt.xticks(rotation=90)
fig = plt.gcf()
fig.set_size_inches(10, 8)
plt.title('Terrorist groups with the highest activity.')
plt.show()

#### Attack Type

In [None]:
# Attack types ordered from most to least frequent.

attack_order = df['Attack_Type'].value_counts().index

fig, ax = plt.subplots(figsize=(15, 6))
sns.countplot(data=df, x='Attack_Type', order=attack_order, palette='muted', ax=ax)
plt.xticks(rotation=90)
ax.set_title('Type of attack carried out')
plt.show()

In [None]:
# Stacked horizontal bars: number of attacks per region, split by attack type.

attacks_by_region = pd.crosstab(df.Region, df.Attack_Type)

# Size the palette from the number of attack types actually present, so every
# stacked segment gets its own colour instead of relying on a hard-coded 8.
attacks_by_region.plot.barh(stacked = True, width = 1, color = sns.color_palette('Spectral', attacks_by_region.shape[1]))
fig = plt.gcf()
fig.set_size_inches(12,8)
plt.xlabel('Count')
plt.title('Regionwise type of attacks')
plt.show()

#### Target Type

In [None]:
# Target types ordered from most to least frequent.

target_order = df['Target_Type'].value_counts().index

fig, ax = plt.subplots(figsize=(15, 6))
sns.countplot(data=df, x='Target_Type', order=target_order, palette='inferno', ax=ax)
plt.xticks(rotation=90)
ax.set_title('Type of targets attacked')
plt.show()

#### Weapon Type

In [None]:
# Number of recorded attacks per weapon type, most frequent first.
df["Weapon_Type"].value_counts()

In [None]:
# Weapon types ordered from most to least frequent.

weapon_order = df['Weapon_Type'].value_counts().index

fig, ax = plt.subplots(figsize=(15, 6))
sns.countplot(data=df, x='Weapon_Type', order=weapon_order, palette='inferno', ax=ax)
plt.xticks(rotation=90)
ax.set_title('Type of weapons used to carry out the attack')
plt.show()