In [1]:
import pandas as pd
import numpy as np

%pylab inline

gtd_df = pd.read_csv('data/globalterrorismdb_0718dist.csv', encoding='ISO-8859-1', low_memory=False)  # 载入数据

gtd_df.info() # 在对数据进行处理之前应该先查看加载数据的相关信息

Populating the interactive namespace from numpy and matplotlib
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181691 entries, 0 to 181690
Columns: 135 entries, eventid to related
dtypes: float64(55), int64(22), object(58)
memory usage: 187.1+ MB


# 1997 年之前的数据处理

由于一些历史遗留问题，目前只有在 1997 年后发生的事件才是系统可用的变量: `summary`, `doubtterr`, `alternative`, `multiple`, `related`, `individual`, `nperpcap`, `claimed`, `claimmode`, `compclaim`, `claim2`, `claimmode2`, `claim3`, `claimmode3`, `motive`, `ransomnote`, `addnotes`, `scite1`, `scite2`, `scite3`, 故而，我们需要将数据集划分为 1997 年前后的数据:

In [2]:
le_set = {
    'summary',
    'doubtterr',
    'alternative',
    'multiple',
    'related',
    'individual',
    'nperpcap',
    'claimed',
    'claimmode',
    'compclaim',
    'claim2',
    'claimmode2',
    'claim3',
    'claimmode3',
    'motive',
    'ransomnote',
    'addnotes',
    'scite1',
    'scite2',
    'scite3'
}

In [3]:
names = set(gtd_df.columns)
leV = names - le_set

gtd_1997 = gtd_df.query('iyear<1998')[list(leV)]

In [4]:
gtd_1997.shape

(67507, 115)

In [5]:
count_1997 = gtd_1997.isnull().sum()
percent = round(count_1997 / gtd_1997.shape[0] * 100, 2)
series = [count_1997, percent]
result = pd.concat(series, axis=1, keys=['Count','Percent'])
result.sort_values(by='Count', ascending=False)  # 按缺失值数量倒序

Unnamed: 0,Count,Percent
gsubname3,67507,100.00
ransomamtus,67506,100.00
attacktype3,67504,100.00
attacktype3_txt,67504,100.00
ransompaidus,67502,99.99
gsubname2,67501,99.99
claimmode3_txt,67499,99.99
gname3,67496,99.98
guncertain3,67496,99.98
claimmode2_txt,67488,99.97


In [6]:
target_attrs = result[result['Percent']<99]
print('特征所占比例:', round(target_attrs.shape[0]/result.shape[0], 2))

特征所占比例: 0.66


缺失值占比超过 $99\%$ 占比达 $33\%$。故而，我们可以直接将其丢弃。

In [7]:
df_50_90 = target_attrs[50<target_attrs['Percent']]
round(df_50_90.shape[0]/target_attrs.shape[0], 2)  # 特征数所占比例

0.38

缺失值所占比例并不多，故而我们仅仅取缺失值占比小于 $50$，且下面主要看看 1997 年前的数据的 text 变量:

In [8]:
gtd_1997 = gtd_1997[result[result['Percent']<50].index.tolist()]

In [9]:
np.unique(gtd_1997.select_dtypes('object').isin(['Unknown']))

array([False,  True])

`gtd_1997` 中存在值 `'Unknown'`，故而，我们将文本变量的所有缺失值替换为 `'Unknown'`。

In [10]:
txt_index_1997 = gtd_1997.select_dtypes('object').columns.tolist()

gtd_1997[txt_index_1997] = gtd_1997[txt_index_1997].replace(np.nan, 'Unknown')

np.unique(gtd_1997[txt_index_1997].isnull().sum())  # 文本变量的缺失值填充完毕

array([0], dtype=int64)

In [11]:
gtd_1997.isnull().sum()

natlty1               433
country                 0
ransom                387
corp1                   0
weapdetail              0
gname                   0
target1                 0
INT_IDEO                0
weapsubtype1_txt        0
targtype1               0
guncertain1             0
targtype1_txt           0
ishostkid             175
weapsubtype1        11677
crit3                   0
latitude             3622
provstate               0
dbsource                0
country_txt             0
INT_MISC                0
nwound               8079
crit1                   0
attacktype1_txt         0
city                    0
attacktype1             0
targsubtype1_txt        0
property                0
targsubtype1         3340
success                 0
extended                0
nkill                6033
vicinity                0
natlty1_txt             0
longitude            3623
weaptype1_txt           0
eventid                 0
imonth                  0
iday                    0
region_txt  

In [12]:
gtd_1997.to_hdf('./data/gtd_1997.h5', 'gtd_1997')

# 1997 年之后的数据处理

In [13]:
df = pd.read_csv('data/globalterrorismdb_0718dist.csv', encoding='ISO-8859-1', low_memory=False)  # 载入数据

In [14]:
df.shape

(181691, 135)

In [15]:
gtd_df = df.query('iyear>1997')

gtd_df

Unnamed: 0,eventid,iyear,imonth,iday,approxdate,extended,resolution,country,country_txt,region,...,addnotes,scite1,scite2,scite3,dbsource,INT_LOG,INT_IDEO,INT_MISC,INT_ANY,related
67507,199801010001,1998,1,1,,0,,34,Burundi,11,...,,"Burundi Rebels, Ex-Rwandan Army Soldiers Blam...",Burundi--Attack Reported on Bujumbura Airport...,,CETIS,0,1,0,1,
67508,199801010002,1998,1,1,,0,,167,Russia,9,...,,"Bomb injures 3 in Moscow subway system, The ...","Bomb injures 3 in Moscow subway, Charleston ...","Bomb Injures 3 Workers in Moscow Metro, Los ...",CETIS,-9,-9,0,-9,
67509,199801010003,1998,1,1,,0,,603,United Kingdom,8,...,,Protestant gunmen kill Catholic in New Year's...,Ulster Peace Shattered by Shooting: Catholic ...,,CETIS,0,0,1,1,
67510,199801020001,1998,1,2,,0,,95,Iraq,10,...,,Iraq Condemns Attack on UNSCOM Baghdad Office...,"Farouk Choukri , Iraq, UN Officials Continue ...","Iraqi Interior Minister on UNSCOM Attack, Kuw...",CETIS,-9,-9,1,1,
67511,199801020002,1998,1,2,,0,,155,West Bank and Gaza Strip,10,...,,"Woman Shot, The Philadelphia Inquirer, Janua...",Israeli Woman Critically Hurt by Gunfire in W...,,CETIS,-9,-9,0,-9,
67512,199801040001,1998,1,4,,0,,118,Macedonia,9,...,,Officials Rule Out UCK Responsibility for Bom...,Kosovo Liberation Army Repeats Claim to FYROM...,"Ljubisa Stankovic, Prosecutor's Office Sees '...",CETIS,1,1,0,1,"199801040001, 199801040002"
67513,199801040002,1998,1,4,,0,,118,Macedonia,9,...,,Officials Rule Out UCK Responsibility for Bom...,Kosovo Liberation Army Repeats Claim to FYROM...,"Ljubisa Stankovic, Prosecutor's Office Sees '...",CETIS,1,1,0,1,"199801040002, 199801040001"
67514,199801050001,1998,1,5,,0,,15,Austria,8,...,,"J. Wolf, E. Holzer and E. Bieber, Technical C...","Ernst Bieber, Peter Grolig, and Wilhelm Theure...",,CETIS,-9,-9,0,-9,
67515,199801050002,1998,1,5,,0,,86,Guyana,3,...,,"Urell Wilkinson, Two Bombs Explode in Front o...","Third Explosion in 36 Hours in Georgetown, E...",Manager Reports Hand Grenade Caused Explosion...,CETIS,-9,-9,0,-9,"199801050002, 199801060002"
67516,199801050003,1998,1,5,,0,,168,Rwanda,11,...,,Rwanda--Forty Said Murdered in Gitarama Prefe...,Death Toll In Central Rwanda Rebel Attacks Ri...,,CETIS,0,1,0,1,"199801050003, 199801050004"


In [16]:
# total no of columns and rows present in data
print("数据的尺寸:",gtd_df.shape)

# Check the number of missing values in each attribute
count = gtd_df.isnull().sum()
percent = round(count / gtd_df.shape[0] * 100, 2)
series = [count, percent]
result = pd.concat(series, axis=1, keys=['Count','Percent'])
result.sort_values(by='Count', ascending=False)  # 按缺失值数量倒序

数据的尺寸: (114184, 135)


Unnamed: 0,Count,Percent
weaptype4,114179,100.00
weaptype4_txt,114179,100.00
weapsubtype4,114178,99.99
weapsubtype4_txt,114178,99.99
gsubname3,114164,99.98
claimmode3,114059,99.89
claimmode3_txt,114059,99.89
gsubname2,114030,99.87
divert,114027,99.86
kidhijcountry,113972,99.81


也可:
    
```py
# Removing columns which has 80% null values
def remove_columns_missing_values(data, min_threshold):
    for col in data.columns:
        rate = data[col].isnull().sum()/float(len(data)) * 100
        if rate >= min_threshold:
            data = data.drop(col, 1)
    return data

data = remove_columns_missing_values(df , 80)
print("可以得到的特征 :",len(data.columns))
```

Pandas 中进行数据类型转换有三种基本方法：

- 使用 `astype()` 函数进行强制类型转换
- 自定义函数进行数据类型转换
- 使用 Pandas 提供的函数如 `to_numeric()`、`to_datetime()`

注意: 从上面两个例子可以看出，当待转换列中含有不能转换的特殊值时(例子中￥,ErrorValue等)astype()函数将失效。有些时候 `astype()` 函数执行成功了也并不一定代表着执行结果符合预期(神坑!）

参考: 

- [Pandas数据类型转换的几个小技巧](https://segmentfault.com/a/1190000014713098)
- [【原】十分钟搞定pandas](https://www.cnblogs.com/chaosimple/p/4153083.html)


## 在Pandas中更改列的数据类型

- `pd.to_datetime` 和 `pd.to_timedelta` 可将数据转换为日期和时间戳。
- `infer_objects()` 方法，用于将具有对象数据类型的 DataFrame 的列转换为更具体的类型。

## 查看数据类型

- `df.dtypes`
- `series.dtype`
- `get_dtype_counts()`

如果一列中含有多个类型,则该列的类型会是 `object`,同样字符串类型的列也会被当成 `object` 类型. 
不同的数据类型也会被当成 `object`,比如 `int32`,`float32`.

In [17]:
gtd_df.get_dtype_counts()

float64    55
int64      22
object     58
dtype: int64

In [18]:
count = gtd_df.isnull().sum()
percent = round(count / gtd_df.shape[0] * 100, 2)
series = [count, percent]
result = pd.concat(series, axis=1, keys=['Count','Percent'])

result.sort_values(by='Count', ascending=False)  # 按缺失值数量倒序

Unnamed: 0,Count,Percent
weaptype4,114179,100.00
weaptype4_txt,114179,100.00
weapsubtype4,114178,99.99
weapsubtype4_txt,114178,99.99
gsubname3,114164,99.98
claimmode3,114059,99.89
claimmode3_txt,114059,99.89
gsubname2,114030,99.87
divert,114027,99.86
kidhijcountry,113972,99.81


In [19]:
keep_attrs = result[result['Percent'] < 80.0]   # 保留缺失值小于 0.8 的特征
subset_df = gtd_df.loc[:, keep_attrs.index.values]

subset_df.get_dtype_counts()

int64      22
object     24
float64    21
dtype: int64

In [20]:
subset_df.shape

(114184, 67)

In [21]:
np.unique(subset_df.isnull().sum())

array([    0,     1,     3,     6,    10,    61,   172,   368,   380,
         434,   577,   934,  1126,  2676,  3368,  4280,  4505,  7033,
        8232,  9091, 10354, 13166, 38485, 59320, 59536, 60338, 64709,
       67348, 71553, 85296, 86954], dtype=int64)

### 符号说明:
    
符号|说明
:-|:-
NV| Numeric Variable
TV|Text Variable
CV|Categorical Variable

In [22]:
subset_df.shape

(114184, 67)

In [23]:
names = set(subset_df.columns)

# 数值变量集合
NV = {
    'eventid', 'iyear', 'imonth', 'iday', 'latitude','longitude',
    'nperps', 'nperpcap', 'nkill', 'nkillus', 'nkillter', 'nwound',
    'nwoundus', 'nwoundte', 'propvalue', 'nhostkid', 'nhostkidus',
    'nhours', 'ndays', 'ransomamt', 'ransomamtus', 'ransompaid','ransompaidus', 'nreleased'
}

NV = names & NV

TV = set(subset_df.select_dtypes('object').columns)
for name in TV:
    if not name.endswith('_txt'):
        print(name)

propcomment
scite3
summary
scite2
addnotes
corp1
weapdetail
provstate
dbsource
scite1
gname
motive
city
location
target1


In [24]:
CV = names - NV - TV
CV

{'INT_ANY',
 'INT_IDEO',
 'INT_LOG',
 'INT_MISC',
 'attacktype1',
 'claimed',
 'country',
 'crit1',
 'crit2',
 'crit3',
 'doubtterr',
 'extended',
 'guncertain1',
 'individual',
 'ishostkid',
 'multiple',
 'natlty1',
 'property',
 'propextent',
 'region',
 'specificity',
 'success',
 'suicide',
 'targsubtype1',
 'targtype1',
 'vicinity',
 'weapsubtype1',
 'weaptype1'}

In [25]:
NV

{'eventid',
 'iday',
 'imonth',
 'iyear',
 'latitude',
 'longitude',
 'nkill',
 'nkillter',
 'nkillus',
 'nperpcap',
 'nperps',
 'nwound',
 'nwoundte',
 'nwoundus',
 'propvalue'}

In [26]:
subset_df.shape

(114184, 67)

# 类别数据处理

In [27]:
cat_df = subset_df[list(CV)]
cat_df.isnull().sum()

natlty1          1126
INT_IDEO            0
targtype1           0
INT_ANY             0
ishostkid           3
guncertain1       380
country             0
specificity         6
suicide             0
individual          0
weapsubtype1     9091
property            0
crit3               0
targsubtype1     7033
claimed             0
success             0
crit2               0
extended            0
doubtterr           1
propextent      67348
INT_MISC            0
crit1               0
multiple            1
weaptype1           0
INT_LOG             0
vicinity            0
attacktype1         0
region              0
dtype: int64

In [28]:
cat_df.shape

(114184, 28)

In [29]:
subset_df.loc[subset_df['doubtterr'] == -9, 'doubtterr'] = np.nan  # -9 未知
#subset_df['attacktype1'].replace(9, -1)
subset_df.loc[subset_df['attacktype1'] == 9, 'attacktype1'] = np.nan #  9 未知
subset_df.loc[subset_df['weaptype1'] == 13, 'weaptype1'] = np.nan # 13 未知
subset_df.loc[subset_df['targtype1'] == 20, 'targtype1'] = np.nan # 13 未知
subset_df.loc[subset_df['property'] == -9, 'property'] = np.nan
subset_df.loc[subset_df['propextent'] == 4, 'propextent'] = np.nan
subset_df.loc[subset_df['ishostkid'] == -9, 'ishostkid'] = np.nan
subset_df.loc[subset_df['INT_LOG'] == -9, 'INT_LOG'] = np.nan
subset_df.loc[subset_df['INT_IDEO'] == -9, 'INT_IDEO'] = np.nan
subset_df.loc[subset_df['INT_MISC'] == -9, 'INT_MISC'] = np.nan
subset_df.loc[subset_df['INT_ANY'] == -9, 'INT_ANY'] = np.nan
subset_df.loc[subset_df['claimed'] == -9, 'claimed'] = np.nan
subset_df.loc[subset_df['specificity'] == 5, 'specificity'] = np.nan
subset_df.loc[subset_df['vicinity'] == -9, 'vicinity'] = np.nan

类别变量的缺失值使用 `-1` 来替换:

In [30]:
subset_df[list(CV)] = subset_df[list(CV)].replace(np.nan, -1)
subset_df[list(CV)].isnull().sum()

natlty1         0
INT_IDEO        0
targtype1       0
INT_ANY         0
ishostkid       0
guncertain1     0
country         0
specificity     0
suicide         0
individual      0
weapsubtype1    0
property        0
crit3           0
targsubtype1    0
claimed         0
success         0
crit2           0
extended        0
doubtterr       0
propextent      0
INT_MISC        0
crit1           0
multiple        0
weaptype1       0
INT_LOG         0
vicinity        0
attacktype1     0
region          0
dtype: int64

# 文本信息处理

In [31]:
txt_df = subset_df[list(TV)]
txt_df.isnull().sum()

propcomment         59320
scite3              71553
summary                 0
targsubtype1_txt     7033
weapsubtype1_txt     9091
propextent_txt      67348
region_txt              0
targtype1_txt           0
scite2              38485
addnotes            86954
corp1               10354
weapdetail          60338
provstate              10
dbsource                0
scite1                 61
country_txt             0
gname                   0
motive              64709
attacktype1_txt         0
natlty1_txt          1126
city                  434
location            59536
target1               172
weaptype1_txt           0
dtype: int64

In [32]:
txt_df.shape

(114184, 24)

In [33]:
txt_df = txt_df.replace('Unknown', np.nan)

In [34]:
txt_df.isnull().sum()

propcomment         59321
scite3              71553
summary                 0
targsubtype1_txt     7033
weapsubtype1_txt     9091
propextent_txt      80388
region_txt              0
targtype1_txt        5074
scite2              38485
addnotes            86954
corp1               26100
weapdetail          60338
provstate             906
dbsource                0
scite1                 61
country_txt             0
gname               59502
motive              79488
attacktype1_txt      4667
natlty1_txt          1126
city                 5529
location            59551
target1              6018
weaptype1_txt        8574
dtype: int64

In [35]:
subset_df[list(TV)] = subset_df[list(TV)].replace(np.nan, 'Unknown')
subset_df[list(TV)].isnull().sum()

propcomment         0
scite3              0
summary             0
targsubtype1_txt    0
weapsubtype1_txt    0
propextent_txt      0
region_txt          0
targtype1_txt       0
scite2              0
addnotes            0
corp1               0
weapdetail          0
provstate           0
dbsource            0
scite1              0
country_txt         0
gname               0
motive              0
attacktype1_txt     0
natlty1_txt         0
city                0
location            0
target1             0
weaptype1_txt       0
dtype: int64

# 数值型变量

In [36]:
subset_df[list(NV)].isnull().sum()

nkillter      2676
imonth           0
nperpcap      3368
propvalue    85296
nkill         4280
nwound        8232
iday             0
nwoundus       577
nperps       13166
nwoundte      4505
latitude       934
iyear            0
nkillus        368
longitude      934
eventid          0
dtype: int64

In [37]:
nv_df = subset_df[list(NV)].replace([-9, -99], np.nan)
nv_df.isnull().sum()

nkillter       2676
imonth            0
nperpcap       4269
propvalue    113767
nkill          4280
nwound         8232
iday              0
nwoundus        577
nperps        94424
nwoundte       4505
latitude        935
iyear             0
nkillus         368
longitude       934
eventid           0
dtype: int64

In [38]:
subset_df.shape

(114184, 67)

In [39]:
subset_df[list(NV)] = subset_df[list(NV)].replace([-9, -99], np.nan)
subset_df[list(names - NV)].isnull().sum()

propcomment         0
natlty1             0
targsubtype1_txt    0
propextent_txt      0
country             0
individual          0
property            0
addnotes            0
corp1               0
weapdetail          0
targsubtype1        0
claimed             0
success             0
extended            0
doubtterr           0
scite1              0
gname               0
propextent          0
multiple            0
vicinity            0
natlty1_txt         0
target1             0
weaptype1_txt       0
scite3              0
INT_IDEO            0
summary             0
weapsubtype1_txt    0
targtype1           0
region_txt          0
INT_ANY             0
guncertain1         0
targtype1_txt       0
ishostkid           0
specificity         0
suicide             0
weapsubtype1        0
scite2              0
crit3               0
provstate           0
dbsource            0
crit2               0
country_txt         0
INT_MISC            0
crit1               0
motive              0
weaptype1 

In [40]:
nv_df = subset_df[list(NV)]
count = nv_df.isnull().sum()
percent = round(count / nv_df.shape[0] * 100, 2)
series = [count, percent]
result = pd.concat(series, axis=1, keys=['Count','Percent'], sort=True)

result.sort_values(by='Count', ascending=False)  # 按缺失值数量倒序

Unnamed: 0,Count,Percent
propvalue,113767,99.63
nperps,94424,82.69
nwound,8232,7.21
nwoundte,4505,3.95
nkill,4280,3.75
nperpcap,4269,3.74
nkillter,2676,2.34
latitude,935,0.82
longitude,934,0.82
nwoundus,577,0.51


`'propvalu'`, `'nperps'` 两个特征缺失严重，故而，我们可以将其移除。

In [41]:
subset_df = subset_df.drop(['propvalue','nperps'], axis =1)

In [42]:
subset_df.shape

(114184, 65)

In [43]:
names = set(subset_df.columns)
NV = NV & names
CV = CV & names
TV = TV & names

In [44]:
len(NV | CV | TV) == len(names)

True

In [45]:
name_dict = {
    'NV': list(NV),
    'CV': list(CV),
    'TV': list(TV)
}

In [46]:
import json

with open('./data/gtd_1998-2017_names.json', 'w') as fp:
    json.dump(name_dict, fp)     # 保存在本地

In [47]:
subset_df

Unnamed: 0,eventid,iyear,imonth,iday,extended,country,country_txt,region,region_txt,provstate,...,ishostkid,addnotes,scite1,scite2,scite3,dbsource,INT_LOG,INT_IDEO,INT_MISC,INT_ANY
67507,199801010001,1998,1,1,0,34,Burundi,11,Sub-Saharan Africa,Bujumbura Mairie,...,0.0,Unknown,"Burundi Rebels, Ex-Rwandan Army Soldiers Blam...",Burundi--Attack Reported on Bujumbura Airport...,Unknown,CETIS,0.0,1.0,0.0,1.0
67508,199801010002,1998,1,1,0,167,Russia,9,Eastern Europe,Moscow (Federal City),...,0.0,Unknown,"Bomb injures 3 in Moscow subway system, The ...","Bomb injures 3 in Moscow subway, Charleston ...","Bomb Injures 3 Workers in Moscow Metro, Los ...",CETIS,-1.0,-1.0,0.0,-1.0
67509,199801010003,1998,1,1,0,603,United Kingdom,8,Western Europe,Northern Ireland,...,0.0,Unknown,Protestant gunmen kill Catholic in New Year's...,Ulster Peace Shattered by Shooting: Catholic ...,Unknown,CETIS,0.0,0.0,1.0,1.0
67510,199801020001,1998,1,2,0,95,Iraq,10,Middle East & North Africa,Baghdad,...,0.0,Unknown,Iraq Condemns Attack on UNSCOM Baghdad Office...,"Farouk Choukri , Iraq, UN Officials Continue ...","Iraqi Interior Minister on UNSCOM Attack, Kuw...",CETIS,-1.0,-1.0,1.0,1.0
67511,199801020002,1998,1,2,0,155,West Bank and Gaza Strip,10,Middle East & North Africa,West Bank,...,0.0,Unknown,"Woman Shot, The Philadelphia Inquirer, Janua...",Israeli Woman Critically Hurt by Gunfire in W...,Unknown,CETIS,-1.0,-1.0,0.0,-1.0
67512,199801040001,1998,1,4,0,118,Macedonia,9,Eastern Europe,Kumanovo (Municipality),...,0.0,Unknown,Officials Rule Out UCK Responsibility for Bom...,Kosovo Liberation Army Repeats Claim to FYROM...,"Ljubisa Stankovic, Prosecutor's Office Sees '...",CETIS,1.0,1.0,0.0,1.0
67513,199801040002,1998,1,4,0,118,Macedonia,9,Eastern Europe,Prilep (Municipality),...,0.0,Unknown,Officials Rule Out UCK Responsibility for Bom...,Kosovo Liberation Army Repeats Claim to FYROM...,"Ljubisa Stankovic, Prosecutor's Office Sees '...",CETIS,1.0,1.0,0.0,1.0
67514,199801050001,1998,1,5,0,15,Austria,8,Western Europe,Styria,...,0.0,Unknown,"J. Wolf, E. Holzer and E. Bieber, Technical C...","Ernst Bieber, Peter Grolig, and Wilhelm Theure...",Unknown,CETIS,-1.0,-1.0,0.0,-1.0
67515,199801050002,1998,1,5,0,86,Guyana,3,South America,Demerara-Mahaica,...,0.0,Unknown,"Urell Wilkinson, Two Bombs Explode in Front o...","Third Explosion in 36 Hours in Georgetown, E...",Manager Reports Hand Grenade Caused Explosion...,CETIS,-1.0,-1.0,0.0,-1.0
67516,199801050003,1998,1,5,0,168,Rwanda,11,Sub-Saharan Africa,Gitarama,...,0.0,Unknown,Rwanda--Forty Said Murdered in Gitarama Prefe...,Death Toll In Central Rwanda Rebel Attacks Ri...,Unknown,CETIS,0.0,1.0,0.0,1.0


In [48]:
subset_df.to_hdf('./data/gtd_1998-2017.h5', 'gtd_1998_2017') # 保存到本地

In [49]:
with open('./data/gtd_1998-2017_names.json') as fp:
    names = json.load(fp)

In [50]:
subset_df.shape

(114184, 65)

In [51]:
subset_df.head(4)

Unnamed: 0,eventid,iyear,imonth,iday,extended,country,country_txt,region,region_txt,provstate,...,ishostkid,addnotes,scite1,scite2,scite3,dbsource,INT_LOG,INT_IDEO,INT_MISC,INT_ANY
67507,199801010001,1998,1,1,0,34,Burundi,11,Sub-Saharan Africa,Bujumbura Mairie,...,0.0,Unknown,"Burundi Rebels, Ex-Rwandan Army Soldiers Blam...",Burundi--Attack Reported on Bujumbura Airport...,Unknown,CETIS,0.0,1.0,0.0,1.0
67508,199801010002,1998,1,1,0,167,Russia,9,Eastern Europe,Moscow (Federal City),...,0.0,Unknown,"Bomb injures 3 in Moscow subway system, The ...","Bomb injures 3 in Moscow subway, Charleston ...","Bomb Injures 3 Workers in Moscow Metro, Los ...",CETIS,-1.0,-1.0,0.0,-1.0
67509,199801010003,1998,1,1,0,603,United Kingdom,8,Western Europe,Northern Ireland,...,0.0,Unknown,Protestant gunmen kill Catholic in New Year's...,Ulster Peace Shattered by Shooting: Catholic ...,Unknown,CETIS,0.0,0.0,1.0,1.0
67510,199801020001,1998,1,2,0,95,Iraq,10,Middle East & North Africa,Baghdad,...,0.0,Unknown,Iraq Condemns Attack on UNSCOM Baghdad Office...,"Farouk Choukri , Iraq, UN Officials Continue ...","Iraqi Interior Minister on UNSCOM Attack, Kuw...",CETIS,-1.0,-1.0,1.0,1.0
