In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%pylab inline
# Set the Seaborn default style for plots
sns.set()

# Set the color palette
sns.set_palette(sns.color_palette("muted"))

Populating the interactive namespace from numpy and matplotlib


### 载入数据

In [2]:
# Load the preprocessed GTD dataset
# NOTE(review): the `info()` output further down reports a non-contiguous
# Int64Index (113175 entries, 0 to 114182), so the row index appears to come
# from the workbook rather than being a fresh RangeIndex — confirm whether
# `index_col=0` has to be passed explicitly on the pandas version in use.
gtd_df = pd.read_excel('./data/gtd_preprocessed.xlsx')

In [3]:
import pickle

# Load the feature-name groups.
# NOTE(review): despite its `.json` extension the file is opened in binary mode
# and decoded with pickle. Unpickling executes arbitrary code, so only load
# this file from a trusted source.
with open('./data/gtd_names.json', 'rb') as names_file:
    name_dict = pickle.load(names_file)

# Print every group name together with the number of features it holds.
for group, members in name_dict.items():
    print(group, len(members))

class Bunch(dict):
    '''
    Dictionary whose entries can also be read and written as attributes.

    Positional and keyword arguments are forwarded unchanged to ``dict``.
    '''
    def __init__(self, *args, **kwds):
        dict.__init__(self, *args, **kwds)
        # Use the mapping itself as the instance namespace, so that
        # ``bunch.key`` and ``bunch['key']`` always refer to the same entry.
        self.__dict__ = self

# Wrap the groups so each one is reachable as an attribute (e.g. `names.cal_names`).
names = Bunch(name_dict)

exta_names 6
cat_names 27
txt_names 17
cal_names 7


### 查看数据结构

这里有 $57$ 个特征, $113\,175$ 个样本.

In [4]:
# Display a per-column summary (non-null counts and dtypes) of the data frame
gtd_df.info(verbose = True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 113175 entries, 0 to 114182
Data columns (total 57 columns):
eventid             113175 non-null int64
iyear               113175 non-null int64
imonth              113175 non-null int64
iday                113175 non-null int64
extended            113175 non-null int64
country             113175 non-null int64
country_txt         113175 non-null object
region              113175 non-null int64
region_txt          113175 non-null object
provstate           113175 non-null object
city                113175 non-null object
latitude            113175 non-null float64
longitude           113175 non-null float64
specificity         113175 non-null int64
vicinity            113175 non-null int64
summary             113175 non-null object
crit1               113175 non-null int64
crit2               113175 non-null int64
crit3               113175 non-null int64
doubtterr           113175 non-null int64
multiple            113175 non-null int6

In [5]:
# Count the columns of each dtype, sorted by dtype name.
# `DataFrame.get_dtype_counts()` was deprecated in pandas 0.25 and removed in
# 1.0; this equivalent works on both old and current pandas versions.
gtd_df.dtypes.astype(str).value_counts().sort_index()

float64     9
int64      31
object     17
dtype: int64

### 缩短长文本变量的取值

The following attack type and weapon type category labels are too long and cause the plots to display incorrectly, so they are shortened.

In [6]:
# List the distinct attack type labels (np.unique returns them sorted)
np.unique(gtd_df['attacktype1_txt'])

array(['Armed Assault', 'Assassination', 'Bombing/Explosion',
       'Facility/Infrastructure Attack', 'Hijacking',
       'Hostage Taking (Barricade Incident)',
       'Hostage Taking (Kidnapping)', 'Unarmed Assault', 'Unknown'],
      dtype=object)

In [7]:
# Map each affected column to its (overlong label, shortened label) pair.
label_shortenings = {
    'weaptype1_txt': (
        'Vehicle (not to include vehicle-borne explosives, i.e., car or truck bombs)',
        'Vehicle (non-explosives)'),
    'attacktype1_txt': (
        'Hostage Taking (Barricade Incident)',
        'Hostage Taking (Barricade)'),
}

# Overwrite only the rows that carry the overlong label.
for column, (long_label, short_label) in label_shortenings.items():
    gtd_df.loc[gtd_df[column] == long_label, column] = short_label

### 处理可进行距离运算的变量

假设所有缺失值的出现代表其不是恐怖事件.

In [8]:
# Distinct per-column missing-value counts; any entry other than 0 means
# at least one column still contains NaNs.
np.unique(gtd_df.isnull().sum())

array([   0,  335,  552, 2623, 4154, 4182, 4432, 8067], dtype=int64)

In [10]:
# Preview the numeric (`cal_names`) columns; NaNs are still present here
gtd_df[list(names.cal_names)].head()

Unnamed: 0,nkillter,nwound,nkillus,nperpcap,nwoundus,nwoundte,nkill
0,100.0,6.0,0.0,,0.0,,104.0
1,0.0,3.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Function to impute either the median or mean
def fill_value(attr, threshold=3):
    '''
    Choose the value used to impute the missing entries of a numeric Series.

    Parameters
    ----------
    attr : pandas.Series
        Numeric attribute; missing entries are ignored.
    threshold : float, default 3
        Number of standard deviations from the mean beyond which an
        observation counts as an outlier (the 3-sigma rule by default).

    Returns
    -------
    float
        The median of the observed values when at least one outlier is
        present (the mean is not robust to outliers), otherwise their mean.
        Returns 0.0 when the Series has no observed values at all.
    '''
    attr_clean = attr.dropna()
    if attr_clean.empty:
        # Nothing to estimate from; fall back to a neutral default instead
        # of returning NaN, which would make the subsequent fillna a no-op.
        return 0.0

    # Measure the distance from the mean in both directions. Comparing the
    # raw values against threshold * std would flag every value of a column
    # whose mean is far from zero and would never catch low-side outliers.
    deviation = (attr_clean - attr_clean.mean()).abs()
    has_outliers = (deviation > threshold * attr_clean.std()).any()

    return attr_clean.median() if has_outliers else attr_clean.mean()

In [18]:
# Fill the missing entries of every numeric attribute with its imputed value
for column in names.cal_names:
    imputed_value = fill_value(gtd_df[column])
    gtd_df[column] = gtd_df[column].fillna(imputed_value)

In [19]:
# Preview the numeric columns again to confirm the NaNs were filled
gtd_df[list(names.cal_names)].head()

Unnamed: 0,nkillter,nwound,nkillus,nperpcap,nwoundus,nwoundte,nkill
0,100.0,6.0,0.0,0.0,0.0,0.0,104.0
1,0.0,3.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Count the remaining missing values per numeric attribute
gtd_df[list(names.cal_names)].isnull().sum()

nkillter    0
nwound      0
nkillus     0
nperpcap    0
nwoundus    0
nwoundte    0
nkill       0
dtype: int64

### 转换分类变量为 Categorical 类型

Convert a subset of the data frame attributes to categorical to align with the GTD code book. Converting attributes to categorical reduces memory requirements and tells other libraries to treat the attribute accordingly (Pandas, n.d.). For the GTD data frame, this results in a 36.5% decrease in memory usage.

In [11]:
# Count the columns of each dtype before the categorical conversion.
# `DataFrame.get_dtype_counts()` was deprecated in pandas 0.25 and removed in
# 1.0; this equivalent works on both old and current pandas versions.
gtd_df.dtypes.astype(str).value_counts().sort_index()

float64     9
int64      31
object     17
dtype: int64

In [12]:
# Cast every column listed in `cat_names` to the pandas 'category' dtype in one call
gtd_df = gtd_df.astype({column: 'category' for column in names.cat_names})

gtd_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 113175 entries, 0 to 114182
Data columns (total 57 columns):
eventid             113175 non-null int64
iyear               113175 non-null int64
imonth              113175 non-null int64
iday                113175 non-null int64
extended            113175 non-null category
country             113175 non-null category
country_txt         113175 non-null object
region              113175 non-null category
region_txt          113175 non-null object
provstate           113175 non-null object
city                113175 non-null object
latitude            113175 non-null float64
longitude           113175 non-null float64
specificity         113175 non-null category
vicinity            113175 non-null category
summary             113175 non-null object
crit1               113175 non-null category
crit2               113175 non-null category
crit3               113175 non-null category
doubtterr           113175 non-null category
multiple     

In [13]:
# Count the columns of each dtype after the categorical conversion.
# `DataFrame.get_dtype_counts()` was deprecated in pandas 0.25 and removed in
# 1.0; casting the dtypes to their names first also groups all categorical
# columns under a single 'category' entry.
gtd_df.dtypes.astype(str).value_counts().sort_index()

category    27
float64      9
int64        4
object      17
dtype: int64

### Summary Statistics
Each of the numeric attributes contains missing values, ranging between 0.30% and 7.13% of the observations (335 to 8,067 of 113,175 rows). The following table depicts the summary statistics prior to imputation.

In [14]:
cal_frame = gtd_df[list(names.cal_names)]
decile_points = [0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 1.0]

# Entries that are not greater than -1 are masked to NaN so that describe()
# leaves them out of the statistics; one row per attribute after transposing.
cal_frame.where(cal_frame > -1).describe(percentiles=decile_points).T

Unnamed: 0,count,mean,std,min,10%,20%,30%,40%,50%,60%,70%,80%,90%,100%,max
nkillter,110552.0,0.507426,4.232247,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,500.0,500.0
nwound,105108.0,3.803507,40.619574,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,4.0,8.0,8191.0,8191.0
nkillus,112840.0,0.036875,5.691986,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1360.0,1360.0
nperpcap,109021.0,0.120069,2.058576,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,406.0,406.0
nwoundus,112623.0,0.015139,0.715459,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,151.0,151.0
nwoundte,108743.0,0.108421,1.504903,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,200.0,200.0
nkill,108993.0,2.539227,12.350006,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,3.0,5.0,1570.0,1570.0


### Imputation

使用 $3\sigma$ 原则处理异常值.  Because the mean is not robust to outliers, the median is recommended for imputation (Chen, n.d.).  The `fill_value` function imputes the median if an attribute contains outliers; otherwise it imputes the attribute mean.

In [15]:
np.unique(gtd_df[list(names.txt_names)].isin(['Unknown']))  # the text variables contain missing values (recorded as 'Unknown')

array([False,  True])

### Summary Statistics with Imputation
The following table presents the summary statistics after imputation.

In [21]:
decile_points = [0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 1.0]

# Summary statistics of the imputed numeric attributes, one row per attribute
gtd_df[list(names.cal_names)].describe(percentiles=decile_points).T

Unnamed: 0,count,mean,std,min,10%,20%,30%,40%,50%,60%,70%,80%,90%,100%,max
nkillter,113175.0,0.495666,4.183611,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,500.0,500.0
nwound,113175.0,3.532397,39.157371,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,4.0,7.0,8191.0,8191.0
nkillus,113175.0,0.036766,5.683556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1360.0,1360.0
nperpcap,113175.0,0.115662,2.020569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,406.0,406.0
nwoundus,113175.0,0.015065,0.713713,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,151.0,151.0
nwoundte,113175.0,0.104175,1.475292,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,200.0,200.0
nkill,113175.0,2.48235,12.123158,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0,5.0,1570.0,1570.0


### Data Frame Review
The data frame no longer contains missing values: as the output below shows, the listed attributes, including latitude and longitude, each have 113,175 non-null entries.

In [22]:
# Re-inspect the non-null counts and dtypes after imputation and type conversion
gtd_df.info(verbose = True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 113175 entries, 0 to 114182
Data columns (total 57 columns):
eventid             113175 non-null int64
iyear               113175 non-null int64
imonth              113175 non-null int64
iday                113175 non-null int64
extended            113175 non-null category
country             113175 non-null category
country_txt         113175 non-null object
region              113175 non-null category
region_txt          113175 non-null object
provstate           113175 non-null object
city                113175 non-null object
latitude            113175 non-null float64
longitude           113175 non-null float64
specificity         113175 non-null category
vicinity            113175 non-null category
summary             113175 non-null object
crit1               113175 non-null category
crit2               113175 non-null category
crit3               113175 non-null category
doubtterr           113175 non-null category
multiple     

### Incident Date
Concatenate the `iyear`, `imonth` and `iday` attributes to create an incident date.

In [23]:
# 297 iday attributes contain 0 to represent unknown, setting 1
gtd_df.loc[gtd_df['iday'] == 0, 'iday'] = 1
# A month value of 0 cannot form a valid date either, so guard it the same way
# instead of letting the conversion below raise.
gtd_df.loc[gtd_df['imonth'] == 0, 'imonth'] = 1

# Let pandas assemble the timestamps directly from the numeric columns
# (it expects them to be named year/month/day) rather than concatenating
# strings and parsing them back.
date_parts = gtd_df[['iyear', 'imonth', 'iday']].rename(
    columns={'iyear': 'year', 'imonth': 'month', 'iday': 'day'})
gtd_df['incident_date'] = pd.to_datetime(date_parts)
gtd_df.info(verbose = True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 113175 entries, 0 to 114182
Data columns (total 58 columns):
eventid             113175 non-null int64
iyear               113175 non-null int64
imonth              113175 non-null int64
iday                113175 non-null int64
extended            113175 non-null category
country             113175 non-null category
country_txt         113175 non-null object
region              113175 non-null category
region_txt          113175 non-null object
provstate           113175 non-null object
city                113175 non-null object
latitude            113175 non-null float64
longitude           113175 non-null float64
specificity         113175 non-null category
vicinity            113175 non-null category
summary             113175 non-null object
crit1               113175 non-null category
crit2               113175 non-null category
crit3               113175 non-null category
doubtterr           113175 non-null category
multiple     

### Save the Cleansed Data
Output the new data frame to an Excel file for use in the predictive models.

In [25]:
# Final check before saving: the only distinct per-column missing-value count should be 0
np.unique(gtd_df.isnull().sum())

array([0], dtype=int64)

In [26]:
# Write the cleaned data frame to an Excel workbook.
# NOTE(review): a workbook presumably will not round-trip the pandas
# `category` dtype, so the conversion may have to be reapplied after
# reloading — confirm in the notebooks that consume this file.
gtd_df.to_excel("./data/gtd_clean.xlsx")

In [27]:
# Count the columns of each dtype in the final data frame.
# `DataFrame.get_dtype_counts()` was deprecated in pandas 0.25 and removed in
# 1.0; this equivalent works on both old and current pandas versions.
gtd_df.dtypes.astype(str).value_counts().sort_index()

category          27
datetime64[ns]     1
float64            9
int64              4
object            17
dtype: int64

### References

Bois, J. (n.d.). Plotting all of your data: Empirical cumulative distribution functions. Retrieved from https://campus.datacamp.com/courses/statistical-thinking-in-python-part-1/graphical-exploratory-data-analysis?ex=11

Chen, D. (n.d.). Duplicate and missing data. Retrieved from https://campus.datacamp.com/courses/cleaning-data-in-python/cleaning-data-for-analysis?ex=11

Pandas. (n.d.). Categorical data. Retrieved from https://pandas.pydata.org/pandas-docs/stable/categorical.html