In [2]:
import pandas as pd
import numpy as np

%pylab inline

gtd_df = pd.read_csv('data/globalterrorismdb_0718dist.csv', encoding='ISO-8859-1', low_memory=False)  # 载入数据

gtd_df.info() # 在对数据进行处理之前应该先查看加载数据的相关信息

Populating the interactive namespace from numpy and matplotlib
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181691 entries, 0 to 181690
Columns: 135 entries, eventid to related
dtypes: float64(55), int64(22), object(58)
memory usage: 187.1+ MB


# 1997 年之前的数据处理

由于一些历史遗留问题，目前只有在 1997 年后发生的事件才是系统可用的变量: `summary`, `doubtterr`, `alternative`, `multiple`, `related`, `individual`, `nperpcap`, `claimed`, `claimmode`, `compclaim`, `claim2`, `claimmode2`, `claim3`, `claimmode3`, `motive`, `ransomnote`, `addnotes`, `scite1`, `scite2`, `scite3`, 故而，我们需要将数据集划分为 1997 年前后的数据:

In [2]:
le_set = {
    'summary',
    'doubtterr',
    'alternative',
    'multiple',
    'related',
    'individual',
    'nperpcap',
    'claimed',
    'claimmode',
    'compclaim',
    'claim2',
    'claimmode2',
    'claim3',
    'claimmode3',
    'motive',
    'ransomnote',
    'addnotes',
    'scite1',
    'scite2',
    'scite3'
}

In [3]:
names = set(gtd_df.columns)
leV = names - le_set

gtd_1997 = gtd_df.query('iyear<1998')[list(leV)]

In [4]:
gtd_1997.shape

(67507, 115)

In [5]:
count_1997 = gtd_1997.isnull().sum()
percent = round(count_1997 / gtd_1997.shape[0] * 100, 2)
series = [count_1997, percent]
result = pd.concat(series, axis=1, keys=['Count','Percent'])
result.sort_values(by='Count', ascending=False)  # 按缺失值数量倒序

Unnamed: 0,Count,Percent
gsubname3,67507,100.00
ransomamtus,67506,100.00
attacktype3_txt,67504,100.00
attacktype3,67504,100.00
ransompaidus,67502,99.99
gsubname2,67501,99.99
claimmode3_txt,67499,99.99
gname3,67496,99.98
guncertain3,67496,99.98
claimmode2_txt,67488,99.97


In [6]:
target_attrs = result[result['Percent']<99]
print('特征所占比例:', round(target_attrs.shape[0]/result.shape[0], 2))

特征所占比例: 0.66


缺失值占比超过 $99\%$ 占比达 $33\%$。故而，我们可以直接将其丢弃。

In [11]:
df_50_90 = target_attrs[50<target_attrs['Percent']]
round(df_50_90.shape[0]/target_attrs.shape[0], 2)  # 特征数所占比例

0.38

缺失值所占比例并不多，故而我们仅仅取缺失值占比小于 $50$，且下面主要看看 1997 年前的数据的 text 变量:

In [12]:
gtd_1997 = gtd_1997[result[result['Percent']<50].index.tolist()]

In [13]:
np.unique(gtd_1997.select_dtypes('object').isin(['Unknown']))

array([False,  True])

`gtd_1997` 中存在值 `'Unknown'`，故而，我们将文本变量的所有缺失值替换为 `'Unknown'`。

In [15]:
txt_index_1997 = gtd_1997.select_dtypes('object').columns.tolist()

gtd_1997[txt_index_1997] = gtd_1997[txt_index_1997].replace(np.nan, 'Unknown')

np.unique(gtd_1997[txt_index_1997].isnull().sum())  # 文本变量的缺失值填充完毕

array([0], dtype=int64)

In [16]:
gtd_1997.isnull().sum()

longitude            3623
nkill                6033
weapdetail              0
region_txt              0
eventid                 0
specificity             0
corp1                   0
extended                0
iday                    0
guncertain1             0
target1                 0
natlty1_txt             0
INT_ANY                 0
crit3                   0
attacktype1             0
country                 0
natlty1               433
iyear                   0
region                  0
INT_MISC                0
property                0
crit2                   0
provstate               0
attacktype1_txt         0
gname                   0
country_txt             0
INT_IDEO                0
weaptype1_txt           0
imonth                  0
ishostkid             175
crit1                   0
weapsubtype1        11677
targtype1               0
success                 0
city                    0
suicide                 0
targsubtype1         3340
vicinity                0
ransom      

In [19]:
gtd_1997.to_hdf('./data/gtd_1997.h5', 'gtd_1997')

# 1997 年之后的数据处理

In [3]:
df = pd.read_csv('data/globalterrorismdb_0718dist.csv', encoding='ISO-8859-1', low_memory=False)  # 载入数据

In [4]:
gtd_df = df.query('iyear>1997')

In [5]:
# total no of columns and rows present in data
print("数据的尺寸:",gtd_df.shape)

# Check the number of missing values in each attribute
count = gtd_df.isnull().sum()
percent = round(count / gtd_df.shape[0] * 100, 2)
series = [count, percent]
result = pd.concat(series, axis=1, keys=['Count','Percent'])
result.sort_values(by='Count', ascending=False)  # 按缺失值数量倒序

数据的尺寸: (114184, 135)


Unnamed: 0,Count,Percent
weaptype4,114179,100.00
weaptype4_txt,114179,100.00
weapsubtype4,114178,99.99
weapsubtype4_txt,114178,99.99
gsubname3,114164,99.98
claimmode3,114059,99.89
claimmode3_txt,114059,99.89
gsubname2,114030,99.87
divert,114027,99.86
kidhijcountry,113972,99.81


也可:
    
```py
# Removing columns which has 80% null values
def remove_columns_missing_values(data, min_threshold):
    for col in data.columns:
        rate = data[col].isnull().sum()/float(len(data)) * 100
        if rate >= min_threshold:
            data = data.drop(col, 1)
    return data

data = remove_columns_missing_values(df , 80)
print("可以得到的特征 :",len(data.columns))
```

Pandas 中进行数据类型转换有三种基本方法：

- 使用 `astype()` 函数进行强制类型转换
- 自定义函数进行数据类型转换
- 使用 Pandas 提供的函数如 `to_numeric()`、`to_datetime()`

注意: 从上面两个例子可以看出，当待转换列中含有不能转换的特殊值时(例子中￥,ErrorValue等)astype()函数将失效。有些时候 `astype()` 函数执行成功了也并不一定代表着执行结果符合预期(神坑!）

参考: 

- [Pandas数据类型转换的几个小技巧](https://segmentfault.com/a/1190000014713098)
- [【原】十分钟搞定pandas](https://www.cnblogs.com/chaosimple/p/4153083.html)


## 在Pandas中更改列的数据类型

- `pd.to_datetime` 和 `pd.to_timedelta` 可将数据转换为日期和时间戳。
- `infer_objects()` 方法，用于将具有对象数据类型的 DataFrame 的列转换为更具体的类型。

## 查看数据类型

- `df.dtypes`
- `series.dtype`
- `get_dtype_counts()`

如果一列中含有多个类型,则该列的类型会是 `object`,同样字符串类型的列也会被当成 `object` 类型. 
不同的数据类型也会被当成 `object`,比如 `int32`,`float32`.

In [6]:
gtd_df.get_dtype_counts()

float64    55
int64      22
object     58
dtype: int64

In [8]:
count = gtd_df.isnull().sum()
percent = round(count / gtd_df.shape[0] * 100, 2)
series = [count, percent]
result = pd.concat(series, axis=1, keys=['Count','Percent'], sort=True)

result.sort_values(by='Count', ascending=False)  # 按缺失值数量倒序

Unnamed: 0,Count,Percent
weaptype4_txt,114179,100.00
weaptype4,114179,100.00
weapsubtype4_txt,114178,99.99
weapsubtype4,114178,99.99
gsubname3,114164,99.98
claimmode3_txt,114059,99.89
claimmode3,114059,99.89
gsubname2,114030,99.87
divert,114027,99.86
kidhijcountry,113972,99.81


In [9]:
keep_attrs = result[result['Percent'] < 80.0]   # 保留缺失值小于 0.8 的特征
subset_df = df.loc[:, keep_attrs.index.values]

subset_df.get_dtype_counts()

int64      22
object     24
float64    21
dtype: int64

In [10]:
subset_df.shape

(181691, 67)

In [11]:
np.unique(subset_df.isnull().sum())

array([     0,      1,      6,    178,    380,    421,    434,    636,
         1559,   4556,   4557,  10313,  10373,  16311,  20768,  42550,
        64446,  64702,  66120,  66129,  66191,  66958,  67670,  69143,
        69489,  71115, 104758, 117626, 123732, 126196, 131130, 138175,
       142702, 153402], dtype=int64)

### 符号说明:
    
符号|说明
:-|:-
NV| Numeric Variable
TV|Text Variable
CV|Categorical Variable

In [12]:
names = set(subset_df.columns)

# 数值变量集合
NV = {
    'eventid', 'iyear', 'imonth', 'iday', 'latitude','longitude',
    'nperps', 'nperpcap', 'nkill', 'nkillus', 'nkillter', 'nwound',
    'nwoundus', 'nwoundte', 'propvalue', 'nhostkid', 'nhostkidus',
    'nhours', 'ndays', 'ransomamt', 'ransomamtus', 'ransompaid','ransompaidus', 'nreleased'
}

NV = names & NV

TV = set(subset_df.select_dtypes('object').columns)
for name in TV:
    if not name.endswith('_txt'):
        print(name)

dbsource
scite3
propcomment
target1
corp1
provstate
city
scite1
weapdetail
addnotes
motive
gname
location
summary
scite2


In [13]:
CV = names - NV - TV
CV

{'INT_ANY',
 'INT_IDEO',
 'INT_LOG',
 'INT_MISC',
 'attacktype1',
 'claimed',
 'country',
 'crit1',
 'crit2',
 'crit3',
 'doubtterr',
 'extended',
 'guncertain1',
 'individual',
 'ishostkid',
 'multiple',
 'natlty1',
 'property',
 'propextent',
 'region',
 'specificity',
 'success',
 'suicide',
 'targsubtype1',
 'targtype1',
 'vicinity',
 'weapsubtype1',
 'weaptype1'}

In [14]:
NV

{'eventid',
 'iday',
 'imonth',
 'iyear',
 'latitude',
 'longitude',
 'nkill',
 'nkillter',
 'nkillus',
 'nperpcap',
 'nperps',
 'nwound',
 'nwoundte',
 'nwoundus',
 'propvalue'}

In [15]:
subset_df.shape

(181691, 67)

# 类别数据处理

In [16]:
cat_df = subset_df[list(CV)]
cat_df.isnull().sum()

natlty1           1559
individual           0
suicide              0
region               0
success              0
crit2                0
weapsubtype1     20768
INT_ANY              0
targtype1            0
guncertain1        380
weaptype1            0
country              0
claimed          66120
INT_LOG              0
specificity          6
INT_MISC             0
propextent      117626
extended             0
INT_IDEO             0
targsubtype1     10373
multiple             1
property             0
attacktype1          0
doubtterr            1
crit1                0
vicinity             0
crit3                0
ishostkid          178
dtype: int64

In [17]:
cat_df.shape

(181691, 28)

In [20]:
subset_df.loc[subset_df['doubtterr'] == -9, 'doubtterr'] = np.nan  # -9 未知
#subset_df['attacktype1'].replace(9, -1)
subset_df.loc[subset_df['attacktype1'] == 9, 'attacktype1'] = np.nan #  9 未知
subset_df.loc[subset_df['weaptype1'] == 13, 'weaptype1'] = np.nan # 13 未知
subset_df.loc[subset_df['targtype1'] == 20, 'targtype1'] = np.nan # 13 未知
subset_df.loc[subset_df['property'] == -9, 'property'] = np.nan
subset_df.loc[subset_df['propextent'] == 4, 'propextent'] = np.nan
subset_df.loc[subset_df['ishostkid'] == -9, 'ishostkid'] = np.nan
subset_df.loc[subset_df['INT_LOG'] == -9, 'INT_LOG'] = np.nan
subset_df.loc[subset_df['INT_IDEO'] == -9, 'INT_IDEO'] = np.nan
subset_df.loc[subset_df['INT_MISC'] == -9, 'INT_MISC'] = np.nan
subset_df.loc[subset_df['INT_ANY'] == -9, 'INT_ANY'] = np.nan
subset_df.loc[subset_df['claimed'] == -9, 'claimed'] = np.nan
subset_df.loc[subset_df['specificity'] == 5, 'specificity'] = np.nan
subset_df.loc[subset_df['vicinity'] == -9, 'vicinity'] = np.nan

In [21]:
subset_df[list(CV)] = subset_df[list(CV)].replace(np.nan, -1)
subset_df[list(CV)].isnull().sum()

natlty1         0
individual      0
suicide         0
region          0
success         0
crit2           0
weapsubtype1    0
INT_ANY         0
targtype1       0
guncertain1     0
weaptype1       0
country         0
claimed         0
INT_LOG         0
specificity     0
INT_MISC        0
propextent      0
extended        0
INT_IDEO        0
targsubtype1    0
multiple        0
property        0
attacktype1     0
doubtterr       0
crit1           0
vicinity        0
crit3           0
ishostkid       0
dtype: int64

# 文本信息处理

In [22]:
txt_df = subset_df[list(TV)]
txt_df.isnull().sum()

dbsource                 0
scite3              138175
propcomment         123732
natlty1_txt           1559
target1                636
attacktype1_txt          0
country_txt              0
corp1                42550
provstate              421
region_txt               0
city                   434
scite1               66191
weapdetail           67670
weapsubtype1_txt     20768
propextent_txt      117626
targsubtype1_txt     10373
weaptype1_txt            0
addnotes            153402
motive              131130
targtype1_txt            0
gname                    0
location            126196
summary              66129
scite2              104758
dtype: int64

In [23]:
txt_df.shape

(181691, 24)

In [24]:
txt_df = txt_df.replace('Unknown', np.nan)

In [25]:
txt_df.isnull().sum()

dbsource                 0
scite3              138175
propcomment         123733
natlty1_txt           1559
target1               6554
attacktype1_txt       7276
country_txt              0
corp1                58906
provstate             4711
region_txt               0
city                 10209
scite1               66191
weapdetail           67670
weapsubtype1_txt     20768
propextent_txt      137472
targsubtype1_txt     10373
weaptype1_txt        15157
addnotes            153402
motive              146019
targtype1_txt         5898
gname                82782
location            126212
summary              66129
scite2              104758
dtype: int64

In [26]:
subset_df[list(TV)] = subset_df[list(TV)].replace(np.nan, 'Unknown')
subset_df[list(TV)].isnull().sum()

dbsource            0
scite3              0
propcomment         0
natlty1_txt         0
target1             0
attacktype1_txt     0
country_txt         0
corp1               0
provstate           0
region_txt          0
city                0
scite1              0
weapdetail          0
weapsubtype1_txt    0
propextent_txt      0
targsubtype1_txt    0
weaptype1_txt       0
addnotes            0
motive              0
targtype1_txt       0
gname               0
location            0
summary             0
scite2              0
dtype: int64

# 数值型变量

In [27]:
subset_df[list(NV)].isnull().sum()

nwoundte      69143
eventid           0
nkillus       64446
longitude      4557
nperpcap      69489
imonth            0
nkillter      66958
nkill         10313
iday              0
iyear             0
latitude       4556
nwound        16311
nperps        71115
nwoundus      64702
propvalue    142702
dtype: int64

In [28]:
nv_df = subset_df[list(NV)].replace([-9, -99], np.nan)
nv_df.isnull().sum()

nwoundte      69143
eventid           0
nkillus       64446
longitude      4557
nperpcap      71355
imonth            0
nkillter      66958
nkill         10313
iday              0
iyear             0
latitude       4557
nwound        16311
nperps       153335
nwoundus      64702
propvalue    171315
dtype: int64

In [30]:
subset_df.shape

(181691, 67)

In [31]:
subset_df[list(NV)] = subset_df[list(NV)].replace([-9, -99], np.nan)
subset_df[list(names - NV)].isnull().sum()

natlty1             0
individual          0
scite3              0
propcomment         0
region              0
natlty1_txt         0
country_txt         0
success             0
crit2               0
region_txt          0
weapsubtype1        0
targtype1           0
propextent_txt      0
country             0
claimed             0
specificity         0
propextent          0
INT_IDEO            0
weaptype1_txt       0
addnotes            0
motive              0
attacktype1         0
gname               0
location            0
summary             0
ishostkid           0
dbsource            0
suicide             0
attacktype1_txt     0
target1             0
corp1               0
provstate           0
INT_ANY             0
city                0
scite1              0
weapdetail          0
guncertain1         0
weapsubtype1_txt    0
weaptype1           0
INT_LOG             0
targsubtype1_txt    0
INT_MISC            0
extended            0
targsubtype1        0
multiple            0
targtype1_

In [32]:
nv_df = subset_df[list(NV)]
count = nv_df.isnull().sum()
percent = round(count / nv_df.shape[0] * 100, 2)
series = [count, percent]
result = pd.concat(series, axis=1, keys=['Count','Percent'], sort=True)

result.sort_values(by='Count', ascending=False)  # 按缺失值数量倒序

Unnamed: 0,Count,Percent
propvalue,171315,94.29
nperps,153335,84.39
nperpcap,71355,39.27
nwoundte,69143,38.06
nkillter,66958,36.85
nwoundus,64702,35.61
nkillus,64446,35.47
nwound,16311,8.98
nkill,10313,5.68
latitude,4557,2.51


`'propvalu'`, `'nperps'` 两个特征缺失严重，故而，我们可以将其移除。

In [33]:
subset_df = subset_df.drop(['propvalue','nperps'], axis =1)

In [34]:
names = set(subset_df.columns)
NV = names & names
CV = CV & names
TV = TV & names

In [36]:
len(NV | CV | TV) == len(names)

True

In [43]:
name_dict = {
    'NV': list(NV),
    'CV': list(CV),
    'TV': list(TV)
}

In [44]:
import json

with open('./data/gtd_1998-2017_names.json', 'w') as fp:
    json.dump(name_dict, fp)     # 保存在本地

In [38]:
subset_df.to_hdf('./data/gtd_1998-2017.h5', 'gtd_1998_2017') # 保存到本地

In [45]:
with open('./data/gtd_1998-2017_names.json') as fp:
    names = json.load(fp)