In [1]:
import pandas as pd
import numpy as np
import zipfile
import sys, os

%pylab inline

Populating the interactive namespace from numpy and matplotlib


## 载入并查看数据

首先，在 [Global Terrorism Database](https://www.kaggle.com/START-UMD/gtd) 网页下载数据集，下面我们直接对其进行操作：

In [3]:
filename = '../data/globalterrorismdb_0718dist.csv_3.zip'

# Open the archive in a context manager so it is closed even if extraction raises.
with zipfile.ZipFile(filename) as zf:
    zf.extractall('./data/')    # unpack the archive into ./data/

# List what ended up in the extraction directory.
gtd_file = os.listdir('data/')
gtd_file

['globalterrorismdb_0718dist.csv']

该文件是 `.csv` 格式，需要解决各种编码问题，有点不是那么友好：

In [5]:
%%time
# Read the extracted CSV into `df`, decoding it as ISO-8859-1; %%time reports how long the load takes.
df = pd.read_csv('data/globalterrorismdb_0718dist.csv', encoding='ISO-8859-1', low_memory=False)  # load the data

Wall time: 5.85 s


载入该文件花费时间约 $5.85$ 秒.

In [6]:
# Distribution of column dtypes. DataFrame.get_dtype_counts() was deprecated and
# later removed from pandas; dtypes.value_counts() gives the same tally.
df.dtypes.value_counts()

float64    55
int64      22
object     58
dtype: int64

我们可以将其转换为 Excel (`.xlsx` 格式) ：

In [7]:
df.to_excel('data/globalterrorismdb_0718dist.xlsx')  # save in Excel format

In [8]:
%%time
# Read the Excel copy back into `df_ex`; %%time reports how long the load takes.
df_ex = pd.read_excel('data/globalterrorismdb_0718dist.xlsx')

Wall time: 1min 34s


载入该 excel 文件花费 `1min 34s` 也不是我们想要的结果.

为了加快数据的存储和载入, 我们采用 HDF5 格式来保存文件，但是，`pytables` 暂时不支持直接存储 `object` 数据类型的数据，为此，需要对数据类型进行转换:

In [10]:
# Column dtype tally of the frame read back from Excel. get_dtype_counts() was
# deprecated and later removed from pandas; dtypes.value_counts() is the replacement.
df_ex.dtypes.value_counts()

float64    55
int64      22
object     58
dtype: int64

In [None]:
# Split the columns by dtype. Both selections must come from `df`: selecting
# numeric dtypes from the object-only frame would always yield no columns.
gtd_df = df.select_dtypes(['object'])
gtd_df1 = df.select_dtypes(['int64', 'float64'])

# Put the two column groups back side by side. They share the same rows, so the
# concatenation has to run along the column axis, not stack rows.
# TODO: the object columns still need converting before they can be stored as HDF5.
gtd = pd.concat([gtd_df1, gtd_df], axis=1)

In [None]:
# Display a summary of the data frame; verbose=True asks for the full per-column listing
df.info(verbose = True)

In [None]:
# Tally missing values per attribute, both as a raw count and as a percentage of all rows
count = df.isna().sum()
percent = (count / len(df) * 100).round(2)
result = pd.DataFrame({'Count': count, 'Percent': percent})
result.sort_values(by='Count', ascending=False)  # attributes with the most missing values first

In [None]:
# Plot the missing-value percentage of every attribute
result['Percent'].plot()

从上图可以看出, 缺失值比例超过 $90\%$ 的特征并不多

In [None]:
# Retain the attributes whose missing-value percentage is below 90
target_attrs = result.loc[result['Percent'] < 90.0]
kept_ratio = round(len(target_attrs) / len(result), 2)
print('留下来的特征所占比例:', kept_ratio)

查看留下来的特征:

In [None]:
# Names of the retained attributes, taken from the index labels of target_attrs
keep_attrs = target_attrs.index.values
keep_attrs

In [None]:
target_attrs.sort_values(by='Count', ascending=False)  # sort by missing-value count, descending

查看缺失值比例在 $50\% \sim 90\%$ 的特征:

In [None]:
# Retained attributes with more than half of their values missing
df_50_90 = target_attrs.loc[target_attrs['Percent'] > 50.0]
round(len(df_50_90) / len(target_attrs), 2)  # their share of the retained attributes

占比是 $19\%$, 因而为了简化问题, 我们仅仅考虑缺失值占比小于 $50\%$ 的特征.

In [None]:
# Keep only the attributes with less than 50 % missing values.
keep_attrs = target_attrs[target_attrs['Percent'] < 50.0]

# Take an explicit copy: later cells modify subset_df in place, and writing into
# a selection of `df` would trigger SettingWithCopyWarning and may not stick.
subset_df = df.loc[:, keep_attrs.index.values].copy()
subset_df.info(verbose=True)

In [None]:
# Total number of rows and columns in the full data set (df.shape is (rows, columns))
print("数据的尺寸:",df.shape)

In [None]:
feature_names = set(subset_df.columns)

# Numeric (count / amount) variables
num_feature_names = {
    'nperps', 'nperpcap', 'nkill', 'nkillus', 'nkillter', 'nwound',
    'nwoundus', 'nwoundte', 'propvalue', 'nhostkid', 'nhostkidus',
    'nhours', 'ndays', 'ransomamt', 'ransomamtus', 'ransompaid', 'ransompaidus', 'nreleased'
}
# Identifier, date and coordinate columns, excluded from the categorical set
exta_names = {'eventid', 'iday', 'imonth', 'iyear', 'latitude', 'longitude'}

# Text columns: every object-dtype column plus a fixed list of known text fields
txt_names = set(subset_df.select_dtypes(['object']).columns)
txt_names.update({'city', 'summary', 'provstate', 'corp1', 'target1', 'gname', 'scite1', 'scite2', 'dbsource'})
# The fixed list may name columns that were dropped by the missing-value filter;
# keep only real columns so that subset_df[list(txt_names)] cannot raise KeyError.
txt_names &= feature_names

# Whatever is left is treated as categorical
cat_names = feature_names - num_feature_names - exta_names - txt_names

## 类别信息处理

In [None]:
# Each categorical column has its own code for "unknown" (-9 for the yes/no
# flags, 9 / 13 / 20 for attack, weapon and target type). Normalise all to -1.
unknown_codes = {
    'doubtterr': -9,
    'attacktype1': 9,
    'weaptype1': 13,
    'targtype1': 20,
    'property': -9,
    'ishostkid': -9,
    'INT_LOG': -9,
    'INT_IDEO': -9,
    'INT_MISC': -9,
    'INT_ANY': -9,
    'claimed': -9,
    'vicinity': -9,
}
for col, code in unknown_codes.items():
    subset_df.loc[subset_df[col] == code, col] = -1

# Missing values in these columns are treated as unknown (-1) as well.
# The filled column is assigned back rather than calling fillna(inplace=True)
# on a column selection, which is deprecated chained assignment and has no
# effect when pandas copy-on-write is enabled.
fill_unknown_cols = [
    'ishostkid', 'claimed', 'specificity', 'guncertain1', 'weapsubtype1',
    'targsubtype1', 'natlty1', 'doubtterr', 'multiple',
]
for col in fill_unknown_cols:
    subset_df[col] = subset_df[col].fillna(-1)

In [None]:
# Swap the literal 'Unknown' for the -1 code in every categorical column
cat_cols = list(cat_names)
for col in cat_cols:
    subset_df[col] = subset_df[col].replace('Unknown', -1)

# Integer-coded copy of the categorical columns, stored with the category dtype
cat_df = subset_df[cat_cols].astype('int').astype('category')

In [None]:
# Distinct values present in the 'claimed' column
np.unique(subset_df['claimed'])

In [None]:
np.unique(cat_df.isnull().sum()) # distinct per-column null counts; only 0 means no missing values

In [None]:
# Save the categorical frame to a local CSV file
cat_df.to_csv('./data/cat.csv')

In [None]:
# Column dtype tally of the categorical frame. get_dtype_counts() was deprecated
# and later removed from pandas; dtypes.value_counts() is the replacement.
cat_df.dtypes.value_counts()

In [None]:
# First ten entries of the 'specificity' column
cat_df['specificity'][:10]

## 文本信息处理

In [None]:
# Distinct labels present in the 'targtype1_txt' column
np.unique(subset_df['targtype1_txt'])

In [None]:
# Text columns whose missing entries are replaced with the label 'Unknown'.
text_fill_cols = [
    'targsubtype1_txt', 'natlty1_txt', 'weapsubtype1_txt', 'provstate',
    'city', 'summary', 'corp1', 'target1', 'scite1',
]
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: that form is deprecated chained assignment and has no
# effect when pandas copy-on-write is enabled.
for col in text_fill_cols:
    subset_df[col] = subset_df[col].fillna('Unknown')

In [None]:
# Collect the text columns and list the distinct per-column null counts
txt_df = subset_df[list(txt_names)]
np.unique(txt_df.isnull().sum())

In [None]:
# Null count of every text column, to see which ones still have gaps
txt_df.isnull().sum()

In [None]:
# Fill missing 'scite2' entries with 'Unknown'. The result is assigned back
# because fillna(inplace=True) on a column selection is deprecated chained
# assignment and has no effect when pandas copy-on-write is enabled.
subset_df['scite2'] = subset_df['scite2'].fillna('Unknown')

In [None]:
# Rebuild the text frame from subset_df and list the distinct per-column null counts again
txt_df = subset_df[list(txt_names)]
np.unique(txt_df.isnull().sum())

In [None]:
# Write the text columns to a local JSON file
txt_df.to_json('./data/txt.json')

In [None]:
# Read the JSON file back into a frame
a = pd.read_json('./data/txt.json')

## 数值型变量

In [None]:
# Distinct values present in the 'nperpcap' column
np.unique(subset_df['nperpcap'])

In [None]:
# -9 and -99 are placeholder codes rather than real counts; blank them out as NaN
for col in ('nperpcap', 'nperps'):
    subset_df.loc[subset_df[col].isin([-9, -99]), col] = np.nan

In [None]:
# Names that appear in both feature_names and num_feature_names
feature_names & num_feature_names

In [None]:
# Frame holding the numeric features that are present in subset_df.
cal_df = subset_df[list(feature_names & num_feature_names)]

# Column dtype tally. get_dtype_counts() was deprecated and later removed from
# pandas; dtypes.value_counts() is the replacement.
cal_df.dtypes.value_counts()

## 处理二值变量

In [None]:
# Lookup from numeric code to its readable label
ynu_map = {1: 'Yes', 0: 'No', -1: 'Unknown'}

# Columns that hold yes / no / unknown codes
ynu_attrs = [
    'extended', 'vicinity', 'crit1', 'crit2', 'crit3', 'doubtterr', 'multiple',
    'success', 'suicide', 'guncertain1', 'individual', 'claimed', 'property',
    'ishostkid', 'INT_LOG', 'INT_IDEO', 'INT_MISC', 'INT_ANY',
]

# Add a labelled '<name>_txt' column next to every coded column
for att in ynu_attrs:
    subset_df[att + '_txt'] = subset_df[att].map(ynu_map)

# Keep every column except the coded ones that now have a labelled twin
final_attrs = [attr for attr in subset_df.columns.values if attr not in ynu_attrs]

subset_df2 = subset_df.loc[:, final_attrs]
subset_df2.info(verbose=True)

In [None]:
# Write the preprocessed frame to an Excel workbook
subset_df2.to_excel("./data/gtd_preprocessed.xlsx")

# [Code Book](https://isstd.org/gtd-book.html)

Python interprets the data types differently than the code book provided by START.

|ATTRIBUTE|PYTHON DTYPE|CODE BOOK TYPE|DEFINITION|
|:----------------|:--------|:------------|:---------------------------------------|
|eventid|int64|Numeric|12-digit Event ID system. First 8 numbers - date recorded "yyyymmdd". Last 4 numbers - sequential case number for the given day|
|iyear|int64|Numeric|The year in which the incident occurred|
|imonth|int64|Numeric|The month in which the incident occurred.  When the exact month of the incident is unknown, this will be recorded as "0".|
|iday|int64|Numeric|The numeric day of the month on which the incident occurred.  When the exact day of the incident is unknown, the field is recorded as "0".|
|extended|object|Categorical|The duration of an incident extended more than 24 hours.  1 = YES, 0 = NO|
|country_txt|object|Categorical|Identifies the country or location where the incident occurred.  When the country in which the incident occurred cannot be identified, it is coded as "Unknown".|
|region_txt|object|Categorical|Identifies the region in which the incident occurred, and divided into 1 of 12 categories|
|provstate|object|Text|The name (at the time of event) of the 1st order subnational administrative region in which the event occurs|
|city|object|Text|The name of the city, village, or town in which the incident occurred.  If unknown, then this field contains the smallest administrative area below provstate|
|latitude|float64|Numeric|The latitude (based on WGS1984 standards) of the city in which the event occurred|
|longitude|float64|Numeric|The longitude (based on WGS1984 standards) of the city in which the event occurred.|
|specificity|float64|Categorical|Identifies the geospatial resolution of the latitude and longitude fields. 1 to 5|
|vicinity|object|Categorical|1 = YES, The incident occurred in the immediate vicinity of the city in question.  0 = NO, The incident occurred in the city itself.|
|summary|object|Text|A brief narrative summary of the incident, noting the "when, where, who, what, how, and why".|
|crit1|object|Categorical|The violent act must be aimed at attaining a political, economic, religious, or social goal. 1 = YES, 0 = NO|
|crit2|object|Categorical|There must be evidence of an intention to coerce, intimidate, or convey some other message to a larger audience than the immediate victims. 1 = YES, 0 = NO|
|crit3|object|Categorical|The action is outside the context of legitimate warfare activities, insofar as it targets non-combatants.  1 = YES, 0 = NO|
|doubtterr|object|Categorical|There is doubt as to whether the incident is an act of terrorism.  1 = YES, 0 = NO|
|multiple|object|Categorical|Denote that the particular attack was part of a "multiple" incident.  1 = YES, 0 = NO|
|success|object|Categorical|A successful attack depends on the type of attack. The key question is whether or not the attack type took place.  1 = YES, 0 = NO|
|suicide|object|Categorical|Coded "Yes" in those cases where there is evidence that the perpetrator did not intend to escape from the attack alive. 1 = YES, 0 = NO|
|attacktype1_txt|object|Categorical|The general method of attack and often reflects the broad class of tactics used. 9 categories|
|targtype1_txt|object|Categorical|The general type of target/victim.  22 categories|
|targsubtype1_txt|object|Categorical|The more specific target category and provides the next level of designation for each target type. If a target subtype is not applicable this variable is left blank|
|corp1|object|Text|The corporate entity or government agency that was targeted|
|target1|object|Text|The specific person, building, installation, etc., that was targeted and/or victimized|
|natlty1_txt|object|Categorical|The nationality of the target that was attacked.  For hijacking incidents, the nationality of the plane is recorded|
|gname|object|Text|The name of the group that carried out the attack|
|guncertain1|object|Categorical|Indicates whether or not the information reported about the Perpetrator Group Name(s) is based on speculation or dubious claims of responsibility.  1 = YES, 0 = NO|
|individual|object|Categorical|Indicates whether or not the attack was carried out by an individual or several individuals not known to be affiliated with a group or organization. 1 = YES, 0 = NO|
|nperpcap|float64|Numeric|The number of perpetrators taken into custody. "-99" or "Unknown" appears when there is evidence of captured, but the number is not reported|
|claimed|object|Categorical|Indicates whether a group or person(s) claimed responsibility for the attack.  1 = YES, 0 = NO|
|weaptype1_txt|object|Categorical|Records the general type of weapon used in the incident.  Up to four weapon types are recorded for each incident|
|weapsubtype1_txt|object|Categorical|A more specific value for most of the Weapon Types identified|
|nkill|float64|Numeric|Total confirmed fatalities for the incident|
|nkillus|float64|Numeric|The number of U.S. citizens who died as a result of the incident|
|nkillter|float64|Numeric|Limited to only perpetrator fatalities|
|nwound|float64|Numeric|The number of confirmed non-fatal injuries to both perpetrators and victims|
|nwoundus|float64|Numeric|The number of confirmed non-fatal injuries to U.S. citizens, both perpetrators and victims|
|nwoundte|float64|Numeric|Number of Perpetrators Injured|
|property|object|Categorical|There is evidence of property damage from the incident.  1 = YES, 0 = NO|
|ishostkid|object|Categorical|Whether or not the victims were taken hostage or kidnapped during an incident. 1 = YES, 0 = NO|
|scite1|object|Text|Cites the first source that was used to compile information on the specific incident|
|dbsource|object|Text|Identifies the original data collection effort in which each event was recorded|
|INT_LOG|object|Categorical|It indicates whether a perpetrator group crossed a border to carry out an attack (logistically international).  1 = YES, 0 = NO, -9=UNKNOWN|
|INT_IDEO|object|Categorical|It indicates whether a perpetrator group attacked a target of a different nationality (ideologically international). 1 = YES, 0 = NO, -9=UNKNOWN|
|INT_MISC|object|Categorical|It indicates whether a perpetrator group attacked a target of a different nationality (not clear if logistically or ideologically international) 1 = YES, 0 = NO, -9=UNKNOWN|
|INT_ANY|object|Categorical|The attack was international on any of the dimensions.  1 = YES, 0 = NO, -9=UNKNOWN|


**NOTE 1:** *For categorical variables, -9 was replaced with -1 per a recommendation from Dr. George at Regis University.*

**NOTE 2:** *For attributes containing 1, 0, and -1, they were replaced with a labeled version of the attribute and can be identified by a `_txt` suffix in the data frame.*