In [3]:
from plotly import express as px, graph_objects as go

In [4]:
import pandas as pd

In [5]:
cleaned_data = pd.read_csv("data/cleaned/terror.csv")

## Observation avant exploration

In [44]:
# nombre de colonnes et de lignes
cleaned_data.shape

(180800, 50)

On a 49 variables et 181691 observations dans le jeu de données

In [9]:
# faisons en sorte que toutes colonnes sortent lors de l'affichage
pd.options.display.max_columns = cleaned_data.shape[1]

In [10]:
# regardons à quoi ressemble les dix premières lignes
cleaned_data.head()

Unnamed: 0,iyear,iday,imonth,nkill,country_txt,nwound,region_txt,provstate,city,nkillus,nwoundus,region_txt.1,latitude,longitude,attacktype1_txt,alternative_txt,suicide,ransompaid,nhostkid,hostkidoutcome_txt,ransomnote,nhours,ndays,ransompaidus,nhostkidus,summary,motive,gname,natlty1_txt,kidhijcountry,weaptype1_txt,weapsubtype1_txt,weaptype2_txt,weapsubtype2_txt,weaptype3_txt,weapsubtype3_txt,weaptype4_txt,weapsubtype4_txt,targtype1_txt,targsubtype1_txt,targtype2_txt,targsubtype2_txt,targtype3_txt,targsubtype3_txt,claimmode_txt,propextent_txt,propextent,propvalue,dbsource
0,1970,2,7,1.0,Dominican Republic,0.0,Central America & Caribbean,'',Santo Domingo,0.0,0.0,Central America & Caribbean,18.456792,-69.951164,Assassination,'',0,0.0,0.0,'','',0.0,0.0,0.0,0.0,'','',MANO-D,Dominican Republic,'',Unknown,'','','','','','','',Private Citizens & Property,Named Civilian,'','','','','','',0.0,0.0,PGIS
1,1970,0,0,0.0,Mexico,0.0,North America,Federal,Mexico city,0.0,0.0,North America,19.371887,-99.086624,Hostage Taking (Kidnapping),'',0,0.0,1.0,'','',0.0,0.0,0.0,0.0,'','',23rd of September Communist League,Belgium,Mexico,Unknown,'','','','','','','',Government (Diplomatic),"Diplomatic Personnel (outside of embassy, cons...",'','','','','','',0.0,0.0,PGIS
2,1970,0,1,1.0,Philippines,0.0,Southeast Asia,Tarlac,Unknown,0.0,0.0,Southeast Asia,15.478598,120.599741,Assassination,'',0,0.0,0.0,'','',0.0,0.0,0.0,0.0,'','',Unknown,United States,'',Unknown,'','','','','','','',Journalists & Media,Radio Journalist/Staff/Facility,'','','','','','',0.0,0.0,PGIS
3,1970,0,1,0.0,Greece,0.0,Western Europe,Attica,Athens,0.0,0.0,Western Europe,37.99749,23.762728,Bombing/Explosion,'',0,0.0,0.0,'','',0.0,0.0,0.0,0.0,'','',Unknown,United States,'',Explosives,Unknown Explosive Type,'','','','','','',Government (Diplomatic),Embassy/Consulate,'','','','','','',0.0,0.0,PGIS
4,1970,0,1,0.0,Japan,0.0,East Asia,Fukouka,Fukouka,0.0,0.0,East Asia,33.580412,130.396361,Facility/Infrastructure Attack,'',0,0.0,0.0,'','',0.0,0.0,0.0,0.0,'','',Unknown,United States,'',Incendiary,'','','','','','','',Government (Diplomatic),Embassy/Consulate,'','','','','','',0.0,0.0,PGIS


Ajoutons une nouvelle colonne date qui va nous permettre de classer directement par date les observations

In [13]:
# further we must see if the column iday, imonth and iyear can be concatenated
cleaned_data["iday"].unique()

array([ 2,  0,  1,  3,  6,  8,  9, 10, 11, 12, 13, 14, 15, 19, 20, 21, 22,
       25, 26, 27, 28, 30, 31,  4,  7, 16, 17, 18, 23, 24,  5, 29],
      dtype=int64)

In [15]:
cleaned_data["imonth"].unique()

array([ 7,  0,  1,  2,  3,  4,  5,  6,  8,  9, 10, 11, 12], dtype=int64)

In [16]:
cleaned_data["iyear"].unique()

array([1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980,
       1981, 1986, 1982, 1983, 1984, 1985, 1987, 1988, 1989, 1990, 1991,
       1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
       2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014,
       2015, 2016, 2017], dtype=int64)

In [17]:
cleaned_data[(cleaned_data["iday"] == 0)].index.__len__()

891

In [18]:
cleaned_data[(cleaned_data["imonth"] == 0)].index.__len__()

20

In [19]:
# Let's remove some lines by the indexes
cleaned_data.drop(index = cleaned_data[(cleaned_data["iday"] == 0)].index, axis = 0, inplace = True)

In [20]:
cleaned_data["iday"].unique()

array([ 2,  1,  3,  6,  8,  9, 10, 11, 12, 13, 14, 15, 19, 20, 21, 22, 25,
       26, 27, 28, 30, 31,  4,  7, 16, 17, 18, 23, 24,  5, 29],
      dtype=int64)

In [22]:
# create a new column named as date
cleaned_data['date'] = cleaned_data["iyear"].astype("str") + "-" + cleaned_data["imonth"].astype("str") + "-" + cleaned_data["iday"].astype("str")

In [23]:
cleaned_data["date"]

0           1970-7-2
5           1970-1-1
6           1970-1-2
7           1970-1-2
8           1970-1-2
             ...    
181686    2017-12-31
181687    2017-12-31
181688    2017-12-31
181689    2017-12-31
181690    2017-12-31
Name: date, Length: 180800, dtype: object

In [24]:
# transformation de la colonne en format date_time
cleaned_data["date"] = pd.to_datetime(cleaned_data['date'])

In [26]:
cleaned_data.sort_values('date', inplace=True)

In [27]:
cleaned_data["date"].dtype

dtype('<M8[ns]')

In [28]:
cleaned_data['date']

5        1970-01-01
10       1970-01-01
6        1970-01-02
7        1970-01-02
8        1970-01-02
            ...    
181644   2017-12-31
181645   2017-12-31
181653   2017-12-31
181676   2017-12-31
181690   2017-12-31
Name: date, Length: 180800, dtype: datetime64[ns]

In [30]:
cleaned_data.to_csv('data/cleaned/terror.csv', index=False)

# Visualiser le nombre de morts causés par des attaques terroristes par pays et par année

variables : nkill, country_txt, iyear

Récupération des données par pays et années et aggrégeons par morts

In [6]:
# créons un dataframe qui prend la colonne date comme index
cleaned_data_2 = cleaned_data.set_index('date')

In [7]:
cleaned_data_2.head()

Unnamed: 0_level_0,iyear,iday,imonth,nkill,country_txt,nwound,region_txt,provstate,city,nkillus,...,targsubtype1_txt,targtype2_txt,targsubtype2_txt,targtype3_txt,targsubtype3_txt,claimmode_txt,propextent_txt,propextent,propvalue,dbsource
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1970-01-01,1970,1,1,0.0,United States,0.0,North America,Illinois,Cairo,0.0,...,"Police Building (headquarters, station, school)",'','','','','',Minor (likely < $1 million),3.0,0.0,Hewitt Project
1970-01-01,1970,1,1,0.0,United States,0.0,North America,Wisconsin,Baraboo,0.0,...,Military Barracks/Base/Headquarters/Checkpost,'','','','','',Minor (likely < $1 million),3.0,0.0,PGIS
1970-01-02,1970,2,1,0.0,Uruguay,0.0,South America,Montevideo,Montevideo,0.0,...,Police Security Forces/Officers,'','','','','','',0.0,0.0,PGIS
1970-01-02,1970,2,1,0.0,United States,0.0,North America,California,Oakland,0.0,...,Electricity,'','','','','',Minor (likely < $1 million),3.0,22500.0,Hewitt Project
1970-01-02,1970,2,1,0.0,United States,0.0,North America,Wisconsin,Madison,0.0,...,Military Recruiting Station/Academy,'','','','',Letter,Minor (likely < $1 million),3.0,60000.0,Hewitt Project


In [8]:
df_pays_morts = cleaned_data_2.groupby(['country_txt', 'iyear'], as_index=False).sum('nkill')

In [12]:
# renommons les colonnes qui nous intéresse 
df_pays_morts.rename(columns={"nkill": "nombre de morts", "country_txt": "pays", "iyear": "années"}, inplace=True)

In [13]:
df_pays_morts.head()

Unnamed: 0,pays,années,iday,imonth,nombre de morts,nwound,nkillus,nwoundus,latitude,longitude,suicide,ransompaid,nhostkid,nhours,ndays,ransompaidus,nhostkidus,propextent,propvalue
0,Afghanistan,1973,1,5,0.0,1.0,0.0,0.0,34.516895,69.147011,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Afghanistan,1979,50,19,53.0,1.0,1.0,0.0,102.406239,199.759655,0,0.0,1.0,0.0,0.0,0.0,1.0,3.0,30000.0
2,Afghanistan,1987,31,5,0.0,2.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Afghanistan,1988,148,75,128.0,106.0,0.0,0.0,380.645164,759.021904,0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,2135000.0
4,Afghanistan,1989,168,61,10.0,84.0,0.0,0.0,341.883293,674.938996,0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,100000.0


In [14]:
px.line(df_pays_morts, x = "années", y = "nombre de morts", color = "pays")

Analyse : Nous remarquons quelques pics sur le graphique. Notemment durant les années suivantes :

- Entre 1980 et 1987 : Nous constatons que le terrorisme s'était plus profondément au niveau de certains pays d'Amérique du Sud comme El Salvador et Nicaragua (max morts = 3617);

- En 1989 : Sri lanka avec 1822 morts ;

- En 1994 : Rwanda avec 1571 morts ;

- En 1997 : Algérie avec 4188 morts ;

- En 2001 : Etats-Unis avec 3008 morts ;

- En 2007 : Irak avec 6667 morts, Nous constatons une hausse pendant les prochaines années ;

- En 2014 : Irak encore avec 13965 morts ;

- En 2016 : Irak avec 12207 morts ;

En tout l'Irak enregistre le plus grand nombre de morts.

## Visualiser le nombre de morts causés par des attaques terroristes par région et par année

In [23]:
# créons un dataframe pour le nombre de morts par region et annee
df_region_morts = cleaned_data_2.groupby(['region_txt', 'iyear'], as_index = False).sum('nkill') 

In [24]:
# renommons les colonnes qui nous intéresse 
df_region_morts.rename(columns={"nkill": "nombre de morts", "region_txt": "région", "iyear": "années"}, inplace=True)

In [25]:
df_region_morts.head()

Unnamed: 0,région,années,iday,imonth,nombre de morts,nwound,nkillus,nwoundus,latitude,longitude,suicide,ransompaid,nhostkid,nhours,ndays,ransompaidus,nhostkidus,propextent,propvalue
0,Australasia & Oceania,1970,21,10,0.0,0.0,0.0,0.0,-37.813187,144.96298,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Australasia & Oceania,1971,23,11,0.0,0.0,0.0,0.0,-33.873651,151.20689,0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
2,Australasia & Oceania,1972,175,75,1.0,0.0,1.0,0.0,-88.810686,457.256299,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Australasia & Oceania,1973,15,8,0.0,0.0,0.0,0.0,-43.532054,172.636225,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Australasia & Oceania,1974,24,12,0.0,0.0,0.0,0.0,-33.873651,151.20689,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
px.line(df_region_morts, x = "années", y = "nombre de morts", color = "région")