# 事前准备

- 0612: 先不获取对方数据

In [1]:
import pandas as pd
import numpy as np
import os, time, math
import collections

In [2]:
# Base name (without extension) shared by the CSV files written at the end of
# this notebook; a suffix such as '_Dropna.csv' is appended at write time.
OUTPUT_FILE = 'NASS_CDS_20240612'

In [4]:
# The raw SAS exports are looked up below the current working directory:
# <cwd>/raw_data/NASS_CISS_data/NASS_CDS_raw
cur_dir = os.getcwd()
_raw_subdirs = ('raw_data', 'NASS_CISS_data', 'NASS_CDS_raw')
relative_data_dir = os.path.join(cur_dir, *_raw_subdirs)

# 数据读入

In [5]:
# Tables to read for every year.
wanted_table_list = ['accident', 'event', 've', 'oa', 'gv', 'oi']
# Original note: "oi, vi carry no useful information for now; event probably
# does not either, but it is loaded for the table joins".
# NOTE(review): 'oi' IS used further down (AIS code reconstruction) -- the
# note above looks stale.

# Loaded tables, split by year -- {table: {year: DataFrame}}
raw_data = {}

for table in wanted_table_list:
    print('Extracting table: %s...\n' % table)
    frames_by_year = {}
    # Original note: because of the old/new AIS difference, data before 08 has
    # no AIS15 record, so data is used "from 2010 on" (idea: check how the
    # pre-08 data could be converted).
    # NOTE(review): the range below actually starts at 2009 -- confirm.
    for year in range(2009, 2016):
        sas_path = os.path.join(relative_data_dir, str(year), 'FormattedData',
                                '%s.sas7bdat' % table)
        # Columns that are entirely empty are dropped right after loading.
        frames_by_year[year] = pd.read_sas(sas_path).dropna(how='all', axis=1)
    raw_data[table] = frames_by_year

# Total number of rows of the central table ('oa') over all loaded years.
merge_centre = 'oa'
count_init_length = sum(
    len(raw_data[merge_centre][year]) for year in range(2009, 2016))
count_init_length

Extracting table: accident...

Extracting table: event...

Extracting table: ve...

Extracting table: oa...

Extracting table: gv...

Extracting table: oi...



56102

# CASEID数据为byte需要进行编码,同时统合每年数据到一起

In [6]:
# Stack the per-year frames of every table into one frame per table and add
# the join key CASENUMBER = "<year>_<PSU>_<CASEID>" plus a YEAR column.
raw_allyear_merged = dict()

for table in wanted_table_list:
    yearly_frames = []
    for year in range(2009, 2016):
        frame = raw_data[table][year]

        # CASEID is read as bytes; decode only real bytes objects so that
        # re-running this cell leaves already-decoded values untouched.
        frame['CASEID'] = frame['CASEID'].map(
            lambda v: v.decode('utf-8') if isinstance(v, bytes) else v)

        # CASENUMBER = year_PSU_CASEID, used to join the tables.
        frame['CASENUMBER'] = (str(year) + '_' +
                               frame['PSU'].astype(int).astype(str) + '_' +
                               frame['CASEID'].astype(str))

        frame['YEAR'] = year
        yearly_frames.append(frame)

    # One concat per table instead of re-concatenating (and re-indexing) the
    # growing frame once per year.
    stacked = pd.concat(yearly_frames, ignore_index=True)
    raw_allyear_merged[table] = stacked.dropna(how='all', axis=1)
        

# 删除重复列

In [7]:
# Bookkeeping columns that are dropped from every table except 'accident',
# which stays the single carrier of these columns.
duplicate_cols = [
    'YEAR', 'PSU', 'CASEID', 'CASENO', 'VERSION', 'STRATIF', 'RATWGT'
]

for table in wanted_table_list:
    if table == 'accident':
        continue
    for col in duplicate_cols:
        if col not in raw_allyear_merged[table]:
            continue
        print("Delete duplicate column: %s from table: %s" % (col, table))
        raw_allyear_merged[table] = raw_allyear_merged[table].drop(
            columns=[col])
print('Duplicate removed')

Delete duplicate column: YEAR from table: event
Delete duplicate column: PSU from table: event
Delete duplicate column: CASEID from table: event
Delete duplicate column: CASENO from table: event
Delete duplicate column: VERSION from table: event
Delete duplicate column: STRATIF from table: event
Delete duplicate column: RATWGT from table: event
Delete duplicate column: YEAR from table: ve
Delete duplicate column: PSU from table: ve
Delete duplicate column: CASEID from table: ve
Delete duplicate column: CASENO from table: ve
Delete duplicate column: VERSION from table: ve
Delete duplicate column: STRATIF from table: ve
Delete duplicate column: RATWGT from table: ve
Delete duplicate column: YEAR from table: oa
Delete duplicate column: PSU from table: oa
Delete duplicate column: CASEID from table: oa
Delete duplicate column: CASENO from table: oa
Delete duplicate column: VERSION from table: oa
Delete duplicate column: STRATIF from table: oa
Delete duplicate column: RATWGT from table: oa
D

In [8]:
# Check which column names still occur in more than one table after the
# clean-up; CASENUMBER and VEHNO are the keys used for joining.

duplicate_dict = collections.defaultdict(int)

# Count, per column name, the number of tables that contain it.
for column_name in (name for table in wanted_table_list
                    for name in raw_allyear_merged[table].columns):
    duplicate_dict[column_name] += 1

shared_columns = [name for name, hits in duplicate_dict.items() if hits > 1]
for name in shared_columns:
    print(name)

CASENUMBER
VEHNO
OCCNO


# 以oa为中心结合 - gv为另一侧 (两方都未与oi结合！)

In [9]:
# Join keys per table.
merge_key = {
    'accident': ['CASENUMBER'],
    'event': ['CASENUMBER'],
    'gv': ['CASENUMBER', 'VEHNO'],
    've': ['CASENUMBER', 'VEHNO'],
    'oa': ['CASENUMBER', 'VEHNO', 'OCCNO']
}

# ego_data starts from the 'oa' table, other_data from the 'gv' table.
ego_data = raw_allyear_merged['oa'].copy(deep=True)
other_data = raw_allyear_merged['gv'].copy(deep=True)

# 'accident' and 've' are left-joined onto both sides.
for name in ('accident', 've'):
    right_frame, keys = raw_allyear_merged[name], merge_key[name]
    ego_data = ego_data.merge(right_frame, on=keys, how='left')
    other_data = other_data.merge(right_frame, on=keys, how='left')

# Only the ego side additionally gets 'gv', as an inner join.
ego_data = ego_data.merge(raw_allyear_merged['gv'],
                          on=merge_key['gv'],
                          how='inner')

ego_data = ego_data.reset_index(drop=True)
other_data = other_data.reset_index(drop=True)

# 重构AISCODE2008 - AISCODE2015映射

In [10]:
def _ais_code_part(values, width=1, prefix="", upper=None):
    """Format one numeric column as a fragment of the textual AIS code.

    Each value v with ``0 <= v`` (and ``v < upper`` when ``upper`` is given)
    becomes ``prefix + str(int(v))`` left-padded with zeros to ``width``
    digits.  Everything else (NaN, negative, out of range) becomes NaN so the
    concatenated code is NaN as well.

    The result keeps the index of ``values``; it does not assume the index
    is 0..n-1.
    """
    parts = []
    for value in values:
        if value >= 0 and (upper is None or value < upper):
            parts.append(prefix + str(int(value)).zfill(width))
        else:
            parts.append(np.nan)
    return pd.Series(parts, index=values.index, dtype=object)


# Rebuild the AIS08 code string from its numeric parts:
# region, structure type, specific structure (2 digits), level (2 digits),
# then '.' followed by the severity digit (only 0-6 are accepted).
_oi_raw = raw_allyear_merged['oi']
REG = _ais_code_part(_oi_raw['REGION08'])
TYP = _ais_code_part(_oi_raw['STRTYP08'])
SPC = _ais_code_part(_oi_raw['STRSPC08'], width=2)
LVL = _ais_code_part(_oi_raw['INJLVL08'], width=2)
AIS = _ais_code_part(_oi_raw['AIS08'], prefix=".", upper=7)

# A NaN in any part makes the whole code NaN; those rows are dropped.
raw_allyear_merged['oi']['AISCODE08'] = REG + TYP + SPC + LVL + AIS
raw_allyear_merged['oi'] = raw_allyear_merged['oi'].dropna(
    subset=['AISCODE08'])

## AIS08 -> AIS15映射

In [11]:
# AIS08 -> AIS15 code pairs, one "<AIS08 code> <AIS15 code>" pair per line.
# The same AIS08 code may appear on several lines with different AIS15
# candidates; the list keeps all of them in this order.
_AIS08_TO_AIS15_TABLE = """
630604.3 630603.2
640201.3 610201.2
640202.3 610202.2
640401.3 610401.2
640402.3 610402.2
640601.3 610601.2
640602.3 610602.2
640240.5 610299.1
640640.5 610699.1
240904.2 240999.1
640200.3 610299.1
640400.3 610499.1
640440.3 610499.1
640600.3 610699.1
131699.2 230407.3
540822.2 540823.3
541422.2 541423.3
543622.2 543623.3
650230.2 650236.3
650430.2 650436.3
650630.2 650636.3
545299.1 545216.4
545299.1 545214.3
243099.1 243406.2
251800.1 251816.2
545299.1 545212.2
874099.1 874036.2
131699.2 230499.1
350200.2 350230.1
200099.9 213000.4
200099.9 250620.3
500099.9 513000.6
500099.9 513002.6
730099.9 730199.1
730099.9 730102.1
730099.9 730104.2
730099.9 730106.2
740099.9 740899.1
740099.9 740699.1
840099.9 840699.1
840099.9 840700.1
840099.9 840899.1
541021.2 541021.3
650208.2 650208.3
713000.2 713000.3
751101.2 751101.3
751112.2 751112.3
751152.2 751152.3
751222.2 751222.3
751252.2 751252.3
751332.2 751332.3
751352.2 751352.3
751801.2 751801.3
751901.2 751901.3
752112.2 752112.3
752114.2 752114.3
752152.2 752152.3
752154.2 752154.3
752212.2 752212.3
752214.2 752214.3
752252.2 752252.3
752254.2 752254.3
752312.2 752312.3
752314.2 752314.3
752352.2 752352.3
752354.2 752354.3
752801.2 752801.3
753201.2 753201.3
630600.3 630600.2
630602.3 630602.2
630604.3 630604.2
630606.3 630606.2
630608.3 630608.2
630610.3 630610.2
440099.9 440099.1
740099.9 740099.1
840099.9 840099.1
650218.2 650218.1
650220.2 650220.1
650418.2 650418.1
650420.2 650420.1
650618.2 650618.1
650620.2 650620.1
161000.1 161000.2
161001.1 161001.2
"""

# Each non-empty line becomes a two-element list [ais08_code, ais15_code].
reflect_ais08_ais15 = [
    row.split() for row in _AIS08_TO_AIS15_TABLE.split("\n") if row
]

In [12]:
# Collapse the pair list into one AIS15 code per AIS08 code.
#
# When an AIS08 code has several AIS15 candidates:
#   * same body region (first character): keep the candidate whose severity
#     digit is highest; on a tie the later pair in the list wins;
#   * different body region: keep the candidate stored first and report it.
_REGION_POS = 0    # position of the body-region digit in a code
_SEVERITY_POS = 7  # position of the severity digit (right after the '.')

aiscode_reflect = dict()
for ais08_code, candidate in reflect_ais08_ais15:
    current = aiscode_reflect.get(ais08_code)
    if current is None:
        aiscode_reflect[ais08_code] = candidate
        continue

    if current[_REGION_POS] != candidate[_REGION_POS]:
        print("Region differs between AIS2015 candidates for AIS2008 code "
              "%s: keeping %s, ignoring %s" % (ais08_code, current, candidate))
        continue

    # Compare severities as numbers rather than as characters.
    if int(current[_SEVERITY_POS]) <= int(candidate[_SEVERITY_POS]):
        aiscode_reflect[ais08_code] = candidate
        

In [13]:
# Translate every AIS08 code to its AIS15 counterpart and split the AIS15
# code back into its numeric parts.
_oi_table = raw_allyear_merged['oi']
_codes08 = _oi_table['AISCODE08']

# Codes without an entry in aiscode_reflect are carried over unchanged.
_oi_table['AISCODE15'] = _codes08.map(aiscode_reflect).fillna(_codes08)

# Code layout: R T SS LL . A  (region, structure type, specific structure,
# level, severity).  The parts are stored as floats, so the columns keep a
# single dtype when the later left merge onto ego_data introduces NaN.
_codes15 = _oi_table['AISCODE15']
_oi_table['REGION15'] = _codes15.str[0].astype(float)
_oi_table['STRTYP15'] = _codes15.str[1].astype(float)
_oi_table['STRSPC15'] = _codes15.str[2:4].astype(float)
_oi_table['INJLVL15'] = _codes15.str[4:6].astype(float)

# Severity digit of the AIS15 code.
_oi_table['AIS15'] = _codes15.str[7].astype(float)
    

# 合并并只留下 AIS 最大受伤数据 

* oi的缺省值较多？！

In [14]:
# Keep, for every occupant (CASENUMBER, VEHNO, OCCNO), the injury row with
# the highest AIS15 and attach it to the ego table.
_occupant_keys = ['CASENUMBER', 'VEHNO', 'OCCNO']

oi = raw_allyear_merged['oi'].dropna(subset=['AISCODE15'])
_oi_by_severity = oi.sort_values('AIS15', ascending=False)
# After the descending sort, the first row per occupant is the one kept.
oi_MAIS = _oi_by_severity.drop_duplicates(subset=_occupant_keys)

ego_data = ego_data.merge(oi_MAIS, on=_occupant_keys, how='left')

In [15]:
# MAIS15: the AIS15 value merged from the injury table, falling back to
# MAIS08 for rows without one.
# MAISchange:  1 -> MAIS15 is larger than MAIS08,
#             -1 -> MAIS15 is smaller than MAIS08,
#              0 -> equal, or at least one of the two is missing.
_mais_08 = ego_data['MAIS08']
_mais_15 = ego_data['AIS15']

ego_data['MAIS15'] = _mais_15.fillna(_mais_08)

ego_data['MAISchange'] = 0
# Comparisons involving NaN are False, so rows with a missing value stay 0.
ego_data.loc[_mais_08 < _mais_15, 'MAISchange'] = 1   # increased
ego_data.loc[_mais_08 > _mais_15, 'MAISchange'] = -1  # decreased
            

In [16]:
# Show how many rows fall into each MAISchange value.
ego_data['MAISchange'].value_counts()

MAISchange
 0    55670
 1      398
-1       34
Name: count, dtype: int64

In [17]:
# Show how many rows carry each value of the 'MAIS' column (NaN excluded by
# value_counts' default).
ego_data['MAIS'].value_counts()

MAIS
0.0    13687
1.0    10995
2.0     2717
3.0     1600
7.0     1336
4.0      671
5.0      406
6.0      193
Name: count, dtype: int64

# 处理特征



## 特征名对应

In [18]:
# Feature catalogue: {category: {NASS column name: readable feature name}}.
# The readable names are later used as column names, so they must stay
# exactly as spelled here.
factor_list_nass = {
    'vehicle': {
        'MODELYR': 'Model Year',
        'CURBWGT': 'Curb Weight',  # Use it???  - the real curb weight is the recorded value * 10 (curb weight: empty vehicle with a full tank, occupants not counted)
#         'TRAVELSP': 'Travel Speed',
        'PREMOVE': 'premovement before collision',  # -- categorisation still needs to be checked!
        'MANEUVER': 'Maneuver before collision',  # -- categorisation still needs to be checked!
        'PDOF1': 'Clock-form Direction of force',
        'BODYTYPE': 'Body Type',
#         'otbdytyp': 'Body Type of the other vehicle'  # information about the other vehicle
    },
    'driver': {
        'DRPRES': 'Driver Present',  # used for filtering
        'DRINKING': 'Alcohol Present',
        # 'DRUGS': 'Drug Present',
        'DRIVDIST': 'Distracted in Driving',  # values above 3 are the various kinds of distraction
        'SEX': 'Sex',  # 3~6 mean pregnant (the later the stage, the higher the code)
        'AGE': 'Age',
        'HEIGHT': 'height',
        'WEIGHT': 'weight',
        'RACE': 'Race'  # added 06/12   ---  update: too many missing values, what to do?
    },
    'environment': {
        'SPLIMIT': 'Speed Limit',  # 0 means no speed limit -- replace with a large value! -- categorisation still needs to be checked!
        'RELINTER': 'Related to Intersection',
        'TRAFFLOW': 'Traffic Flow Situation',
        'LANES': 'Number of lanes',
        'ALIGNMNT': 'Alignment of Road',
        'PROFILE': 'Uphill or Downhill',  # use it?
        'SURTYPE': 'Surface Type',  # used for filtering -- categorisation still needs to be checked!
        'SURCOND': 'Surface Condition',  # road surface condition, use it? -- categorisation still needs to be checked!
        'LGTCOND': 'Lighting Condition',
        'CLIMATE': 'Climate',  # -- categorisation still needs to be checked!
        'TRAFCONT': 'Traffic Condition',  # whether a school etc. requires slowing down, use it?  -- categorisation still needs to be checked!
        # 'PREEVENT': 'Pre-crash Event', # event before the crash, use it?  -- categorisation still needs to be checked!
        'PREILOC': 'Pre-event Location',
        'TRCTLFCT': 'Traffic Conrtol Functioning',
        'MANCOLL': 'Crash Type'  # cite the master's research showing this can be predicted!  -  the coding & editing manual has a finer classification
    },
    'time series': {
        'MONTH': 'month',
        'YEAR': 'year',
        'DAYWEEK': 'Day in Week'
    },
    'post crash': {
        'DVTOTAL': 'delta v',  # added 06/12 (for compare)
        'DVCONFID': 'delta v confidence level'
    }
}

In [19]:
# Verify that every catalogued NASS column exists in ego_data and report the
# number of catalogued features.
_all_factors = [
    factor for features in factor_list_nass.values() for factor in features
]
_absent = [factor for factor in _all_factors if factor not in ego_data.columns]

for factor in _absent:
    print('No existance! %s' %factor)

feature_count = len(_all_factors)
included = 0 if _absent else 1

if included:
    print('No factor missed!')
print('Feature number is: %s' %feature_count)

No factor missed!
Feature number is: 33


MANCOLL：
 - 0: No Collision
 - 1: REAR END (20-43)
 - 2: HEAD ON (50-63)
 - 4: ANGLE (68-91)
 - 5: SIDESWIPE, SAME DIRECTION (44-49)
 - 6: SIDESWIPE, OPPOSITE DIRECTION (64-67)
 - 9: UNKNOWN

## 处理非nan缺失值



In [20]:
# Work on an independent (deep) copy so ego_data itself stays untouched;
# DataFrame.copy() is deep by default.
deal_ego = ego_data.copy()
# deal_other = other_data.copy(deep=True)

In [27]:
# Largest recorded DVTOTAL.  Series.max() skips NaN, whereas the builtin
# max() gives an order-dependent result as soon as NaN is present.
ego_data['DVTOTAL'].max()

166.0

In [22]:
# Codes that stand for "missing" in each NASS column, grouped per column.
_sentinel_codes = {
    # vehicle
    'MODELYR': [9999],
    'CURBWGT': [999999],
    # 'TRAVELSP': [777, 999],
    'PREMOVE': [99, 98],
    'MANEUVER': [98, 99],
    # 'otbdytyp': [98],
    'BODYTYPE': [99],
    'PDOF1': [998],

    # driver
    'DRPRES': [9],
    'DRINKING': [7, 8, 9],
    'SEX': [9],
    'HEIGHT': [999],
    'WEIGHT': [999],
    'AGE': [999],
    'RACE': [7, 8],

    # environment
    'SPLIMIT': [999],
    'RELINTER': [9],
    'TRAFFLOW': [9],
    'LANES': [9],
    'ALIGNMNT': [9],
    'PROFILE': [9],
    'SURTYPE': [8, 9],
    'SURCOND': [88, 99],
    'LGTCOND': [9],
    'CLIMATE': [98, 99],
    'TRAFCONT': [9],
    'PREILOC': [9],
    'MANCOLL': [0, 9],

    # Compare
    'DVTOTAL': [888, 999],
}

# Flat list of [column, code] pairs, in the order given above.
nanlist_NASS = [[column, code]
                for column, codes in _sentinel_codes.items()
                for code in codes]

In [23]:
# Replace every [column, code] pair of nanlist_NASS by NaN and report how many
# additional NaN each replacement produced.
# for i, data in enumerate([deal_ego, deal_other]):
for table_no, frame in enumerate([deal_ego], start=1):
    for column, sentinel in nanlist_NASS:
        if column not in frame:
            continue
        nan_before = frame[column].isnull().sum()
        frame.loc[frame[column] == sentinel, column] = np.nan
        nan_added = frame[column].isnull().sum() - nan_before
        # Stay silent when the code did not occur in the column.
        if nan_added != 0:
            print(
                "The table %s got null at feature %s before processing: %s and after processing it got: %s more"
                % (table_no, column, nan_before, nan_added))
            

The table 1 got null at feature MANEUVER before processing: 24595 and after processing it got: 162 more
The table 1 got null at feature PDOF1 before processing: 15112 and after processing it got: 3499 more
The table 1 got null at feature DRINKING before processing: 2320 and after processing it got: 1577 more
The table 1 got null at feature DRINKING before processing: 3897 and after processing it got: 14 more
The table 1 got null at feature RACE before processing: 26271 and after processing it got: 382 more
The table 1 got null at feature RACE before processing: 26653 and after processing it got: 14 more
The table 1 got null at feature SURTYPE before processing: 8 and after processing it got: 87 more
The table 1 got null at feature SURCOND before processing: 102 and after processing it got: 20 more
The table 1 got null at feature CLIMATE before processing: 232 and after processing it got: 99 more
The table 1 got null at feature MANCOLL before processing: 1235 and after processing it got

In [28]:
# Copy every catalogued NASS column into a new column carrying its readable
# feature name; the original columns are kept.
# for data in [deal_ego, deal_other]:
for frame in [deal_ego]:
    for name_map in factor_list_nass.values():
        for sas_name, readable_name in name_map.items():
            if sas_name not in frame.columns:
                continue
            frame[readable_name] = frame[sas_name]

## Curb Weight还原 & 分车种 -- 获取自己以及对方的Body Category

In [29]:
# Scale 'Curb Weight' by 10 and derive a coarse body category from BODYTYPE.
# (The other-vehicle variant with 'otbdytyp' -> 'Other Veh Body Category' is
# currently switched off.)
# for data in [deal_ego, deal_other]:
for frame in [deal_ego]:
    frame['Curb Weight'] = 10 * frame['Curb Weight']

    for source_col, category_col in [('BODYTYPE', 'Body Category')]:
        body_type = frame[source_col]

        is_sedan = (body_type <= 9) | body_type.isin([11, 12, 17])
        is_suv = body_type.between(14, 16) | (body_type == 19)
        is_van = body_type.between(20, 29) | (body_type == 60)
        is_pickup = ((body_type == 10) | body_type.between(30, 39) |
                     (body_type == 74))

        # Rows matching none of the masks keep NaN in the category column.
        frame.loc[is_sedan, category_col] = 0  # Sedan
        frame.loc[is_suv, category_col] = 1  # SUV
        frame.loc[is_van, category_col] = 2  # Van
        frame.loc[is_pickup, category_col] = 3  # Pickup

In [30]:
# Number of rows that received a body category (non-null entries).
deal_ego['Body Category'].count()

56038

## 对于可以预见的事故，获取对方信息

In [None]:
# other_col_list = ['Clock-form Direction of force', 'TRAVELSP', 'premovement before collision']

# df_other = deal_other.copy(deep=True).drop_duplicates(subset=['CASENUMBER', 'VEHNO'] + other_col_list)

# df_other['VEHNO'] = df_other['OBJCONT1']
# new_col_list = []
# for col in other_col_list:
#     ocol = 'Other Veh '+ col
#     df_other[ocol] = df_other[col]
#     new_col_list.append(ocol)
# df_other = df_other[new_col_list + ['CASENUMBER', 'VEHNO']]
# data_with_oinfo = pd.merge(deal_ego, df_other, how='left', on=['CASENUMBER', 'VEHNO'])
# data_with_oinfo = data_with_oinfo.dropna(subset=['MAIS'])


In [None]:
# tmp = data_with_oinfo
# for col in new_col_list:
#     print(col, round(1 - len(tmp[col].dropna()) / len(tmp), 3))
    

## 数据筛选

In [31]:
# Row filters, written as DataFrame.query expressions.
query_conditions = [
    'VEHFORMS == 2',  # two-vehicle collision
    'MANEUVER != 0',  # a driver is present
    'ROLE == 1',  # keep drivers only
    'BODYTYPE < 80 & BODYTYPE != 13',  # restrict the vehicle types
    'PREMOVE != 0',  # a driver is present
    'DRIVDIST != 0',  # same as above
    'AGE >= 15',  # for legal reasons
    'PREILOC != 0',
    'MAIS < 7'
]

In [32]:
# Apply the filters one by one and report, after each step, how many rows
# remain and the share of remaining rows with MAIS > 2 ("True ratio").
# data_with_selection = data_with_oinfo.copy(deep=True)
data_with_selection = deal_ego.copy(deep=True)

for que in query_conditions:
    prev_len = len(data_with_selection)
    data_with_selection = data_with_selection.query(que)
    remaining = len(data_with_selection)
    if remaining:
        severe = len(data_with_selection.query('MAIS > 2'))
        # Truncated (not rounded) to four decimals.
        true_ratio = int(severe / remaining * 10000) / 10000
    else:
        # A filter that removes every row must not abort with a
        # ZeroDivisionError; the ratio is simply undefined.
        true_ratio = float('nan')
    print('%s: cases: %s before -> %s left. True ratio: %s' %
          (que, prev_len, remaining, true_ratio))
data_with_selection.reset_index(drop=True, inplace=True)

VEHFORMS == 2: cases: 56102 before -> 33389 left. True ratio: 0.0601
MANEUVER != 0: cases: 33389 before -> 33380 left. True ratio: 0.0601
ROLE == 1: cases: 33380 before -> 22298 left. True ratio: 0.0616
BODYTYPE < 80 & BODYTYPE != 13: cases: 22298 before -> 22298 left. True ratio: 0.0616
PREMOVE != 0: cases: 22298 before -> 22298 left. True ratio: 0.0616
DRIVDIST != 0: cases: 22298 before -> 22298 left. True ratio: 0.0616
AGE >= 15: cases: 22298 before -> 21999 left. True ratio: 0.0621
PREILOC != 0: cases: 21999 before -> 21999 left. True ratio: 0.0621
MAIS < 7: cases: 21999 before -> 12359 left. True ratio: 0.0706


In [None]:
# for oinfo in new_col_list:
#     print('Info name: %s, Valid oinfo case: %s, with ratio: %s' %(
#         oinfo, 
#         sum(data_with_selection[oinfo].value_counts()), 
#         sum(data_with_selection[oinfo].value_counts()) / len(data_with_selection))
#         )
    
# Travel Speed差太多了！！！

## 提取需要的列

In [33]:
# Final feature list: the target 'MAIS', the derived 'Body Category' and the
# readable name of every catalogued column except the ones skipped below.
# wanted_features = [ 'MAIS', 'Other Veh Body Category', 'Body Category'] + new_col_list
_skipped_sas_names = ('BODYTYPE', 'otbdytyp', 'DRPRES')

wanted_features = ['MAIS', 'Body Category'] + [
    feature_name
    for name_map in factor_list_nass.values()
    for sas_name, feature_name in name_map.items()
    if sas_name not in _skipped_sas_names
]

wanted_features, len(wanted_features)


(['MAIS',
  'Body Category',
  'Model Year',
  'Curb Weight',
  'premovement before collision',
  'Maneuver before collision',
  'Clock-form Direction of force',
  'Alcohol Present',
  'Distracted in Driving',
  'Sex',
  'Age',
  'height',
  'weight',
  'Race',
  'Speed Limit',
  'Related to Intersection',
  'Traffic Flow Situation',
  'Number of lanes',
  'Alignment of Road',
  'Uphill or Downhill',
  'Surface Type',
  'Surface Condition',
  'Lighting Condition',
  'Climate',
  'Traffic Condition',
  'Pre-event Location',
  'Traffic Conrtol Functioning',
  'Crash Type',
  'month',
  'year',
  'Day in Week',
  'delta v',
  'delta v confidence level'],
 33)

In [34]:
# Report every wanted feature with fewer than 95% non-null rows; features
# below 50% additionally get a banner.
_n_rows = len(data_with_selection)
for info in wanted_features:
    valid_cases = sum(data_with_selection[info].value_counts())
    true_info_ratio = valid_cases / _n_rows
    if true_info_ratio < 0.95:
        print('Info name: %s, Valid info case: %s, with ratio: %s' %(
            info,
            valid_cases,
            round(true_info_ratio, 3)
            ))
        if true_info_ratio < 0.5:
            print( "*" * 50, '\n', info, 'Warining: 数据少一半以上！\n', "*" * 50)
    

Info name: Maneuver before collision, Valid info case: 9118, with ratio: 0.738
Info name: Clock-form Direction of force, Valid info case: 9295, with ratio: 0.752
Info name: Distracted in Driving, Valid info case: 7804, with ratio: 0.631
Info name: height, Valid info case: 9965, with ratio: 0.806
Info name: weight, Valid info case: 10163, with ratio: 0.822
Info name: Race, Valid info case: 9632, with ratio: 0.779
Info name: Crash Type, Valid info case: 11519, with ratio: 0.932
Info name: delta v, Valid info case: 7429, with ratio: 0.601


## 考虑到预测时候可能不需要Maneuver因为太接近事故发生，所以dropna时候就不drop它！

dropna时不drop的属性： MANEUVER, ALCOHOL, DRIVDIST, DV

## dropna过程

In [35]:
# Features whose missing values are tolerated (rows are NOT dropped for them).
no_dropna_cols = [
    'Other Veh Clock-form Direction of force',
    'Other Veh TRAVELSP',
    'Other Veh premovement before collision',
    'Travel Speed',
    'Alcohol Present',
    'Distracted in Driving',
    'Maneuver before collision',
    'Other Veh Body Category',
    'delta v',
    'delta v confidence level',
    ]

# Every other wanted feature must be present.  The list follows the order of
# wanted_features (duplicates removed) so it is identical on every run,
# unlike list(set(...) - set(...)), whose order depends on string hashing.
_tolerated = set(no_dropna_cols)
dropna_cols = list(
    dict.fromkeys(col for col in wanted_features if col not in _tolerated))
dropna_cols


['Traffic Conrtol Functioning',
 'Body Category',
 'Surface Condition',
 'month',
 'year',
 'Surface Type',
 'Lighting Condition',
 'premovement before collision',
 'Number of lanes',
 'MAIS',
 'Climate',
 'Age',
 'Related to Intersection',
 'Crash Type',
 'Uphill or Downhill',
 'height',
 'Model Year',
 'Curb Weight',
 'Race',
 'Sex',
 'Traffic Condition',
 'Alignment of Road',
 'Clock-form Direction of force',
 'Pre-event Location',
 'weight',
 'Speed Limit',
 'Day in Week',
 'Traffic Flow Situation']

In [36]:
# Non-null row count of every column that will take part in the dropna.
for col in dropna_cols:
    non_null_rows = data_with_selection[col].count()
    print(non_null_rows, '\t', col)

12344 	 Traffic Conrtol Functioning
12346 	 Body Category
12340 	 Surface Condition
12359 	 month
12359 	 year
12333 	 Surface Type
12306 	 Lighting Condition
12354 	 premovement before collision
12358 	 Number of lanes
12359 	 MAIS
12300 	 Climate
12359 	 Age
12359 	 Related to Intersection
11519 	 Crash Type
12304 	 Uphill or Downhill
9965 	 height
12354 	 Model Year
12177 	 Curb Weight
9632 	 Race
12359 	 Sex
12355 	 Traffic Condition
12359 	 Alignment of Road
9295 	 Clock-form Direction of force
12327 	 Pre-event Location
10163 	 weight
12221 	 Speed Limit
12359 	 Day in Week
12358 	 Traffic Flow Situation


In [37]:
# Drop every row that misses one of the required features and renumber the
# rows; dropna returns a new frame, so data_with_selection is left untouched.
_complete_rows = data_with_selection.dropna(subset=dropna_cols)
data_dropna = _complete_rows.reset_index(drop=True)
data_dropna

Unnamed: 0,ABELTAVL,ABELTUSE,ABELTYPE,ABLTFAIL,AGE,BAGAVAIL,BAGAVRPT,BAGDEPLY,BAGEVENT,BAGDAMAG,...,Traffic Condition,Pre-event Location,Traffic Conrtol Functioning,Crash Type,month,year,Day in Week,delta v,delta v confidence level,Body Category
0,0.0,0.0,0.0,0.0,38.0,1.0,1.0,7.0,97.0,97.0,...,0.0,1.0,0.0,1.0,1.0,2009,4.0,,0.0,3.0
1,0.0,0.0,0.0,0.0,65.0,1.0,2.0,1.0,1.0,1.0,...,1.0,1.0,2.0,4.0,2.0,2009,7.0,32.0,1.0,3.0
2,0.0,0.0,0.0,0.0,43.0,1.0,1.0,1.0,1.0,1.0,...,0.0,1.0,0.0,6.0,1.0,2009,1.0,22.0,3.0,0.0
3,0.0,0.0,0.0,0.0,49.0,1.0,2.0,1.0,1.0,1.0,...,0.0,1.0,0.0,4.0,1.0,2009,5.0,28.0,1.0,1.0
4,0.0,0.0,0.0,0.0,30.0,1.0,1.0,1.0,1.0,1.0,...,2.0,1.0,2.0,4.0,3.0,2009,7.0,23.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6741,,,,,32.0,1.0,3.0,7.0,97.0,97.0,...,2.0,1.0,2.0,4.0,8.0,2015,1.0,15.0,1.0,0.0
6742,,,,,26.0,1.0,3.0,7.0,97.0,97.0,...,1.0,1.0,2.0,1.0,8.0,2015,5.0,12.0,4.0,0.0
6743,,,,,62.0,1.0,3.0,7.0,97.0,97.0,...,2.0,1.0,2.0,4.0,10.0,2015,6.0,19.0,1.0,1.0
6744,,,,,43.0,1.0,3.0,7.0,97.0,97.0,...,0.0,1.0,0.0,4.0,11.0,2015,3.0,,0.0,0.0


In [38]:
# Share of non-null rows per wanted feature after the dropna step.
for col in wanted_features:
    filled_share = sum(data_dropna[col].value_counts()) / len(data_dropna)
    print('%s ___ %s' % (col, round(filled_share, 3)))

MAIS ___ 1.0
Body Category ___ 1.0
Model Year ___ 1.0
Curb Weight ___ 1.0
premovement before collision ___ 1.0
Maneuver before collision ___ 0.88
Clock-form Direction of force ___ 1.0
Alcohol Present ___ 0.959
Distracted in Driving ___ 0.805
Sex ___ 1.0
Age ___ 1.0
height ___ 1.0
weight ___ 1.0
Race ___ 1.0
Speed Limit ___ 1.0
Related to Intersection ___ 1.0
Traffic Flow Situation ___ 1.0
Number of lanes ___ 1.0
Alignment of Road ___ 1.0
Uphill or Downhill ___ 1.0
Surface Type ___ 1.0
Surface Condition ___ 1.0
Lighting Condition ___ 1.0
Climate ___ 1.0
Traffic Condition ___ 1.0
Pre-event Location ___ 1.0
Traffic Conrtol Functioning ___ 1.0
Crash Type ___ 1.0
month ___ 1.0
year ___ 1.0
Day in Week ___ 1.0
delta v ___ 0.751
delta v confidence level ___ 1.0


# 保存全特征以及提取所需特征数据

## 处理完的只剩需要特征的数据

In [39]:
# Write the dropna-filtered table to CreatedData/NASS/<OUTPUT_FILE>_Dropna.csv
# (relative to the working directory, as before; os.path.dirname(os.curdir)
# was just the empty string).  The directory is created when missing so the
# write does not fail on a fresh checkout.
# NOTE(review): the section heading describes this file as holding only the
# needed features, but every column of data_dropna is written -- confirm
# which one is intended.
_dropna_dir = os.path.join('CreatedData', 'NASS')
os.makedirs(_dropna_dir, exist_ok=True)
data_dropna.to_csv(os.path.join(_dropna_dir, OUTPUT_FILE + '_Dropna.csv'),
                   encoding='utf-8')

## 未dropna的全数据

In [40]:
# Write the filtered table (before the dropna step, all columns) to
# CreatedData/NASS/<OUTPUT_FILE>_NoDropAllFeatures.csv, relative to the
# working directory as before; the directory is created when missing.
_all_features_dir = os.path.join('CreatedData', 'NASS')
os.makedirs(_all_features_dir, exist_ok=True)
data_with_selection.to_csv(os.path.join(
    _all_features_dir, OUTPUT_FILE + '_NoDropAllFeatures.csv'),
                           encoding='utf-8')


# 对缺失比例不高的变量是否使用某些方法进行填补？

- Alcohol Present ___ 0.9609115605191669
- Distracted in Driving ___ 0.7879565348626623

In [None]:
# Display the dropna-filtered table.
data_dropna