# Combination & Feature Selection

In [1]:
import pandas as pd
import numpy as np
import os, datetime, math

In [2]:
nass_path = os.path.join(os.path.dirname(os.curdir), 'NASS/',
                         'NASS_CDS_20240612_Dropna.csv')

ciss_path = os.path.join(os.path.dirname(os.curdir), 'CISS/',
                         'CISS_20240612_dropna.csv')

nass, ciss = pd.read_csv(nass_path), pd.read_csv(ciss_path)

In [3]:
assert 'CASEWGT' in ciss.columns and 'RATWGT' in nass.columns
nass['CASEWGT'] = nass['RATWGT']

In [7]:
medical_records = {
    'CARDIOCOND',
    'SPINEDEGEN',
    'IMPAIREDCOAG',
    'IMPLANTFUS',
    'OSTEOCOND',
    'COMORBOTH'
}

for medical in medical_records:
    assert medical in ciss.columns

In [8]:
nass['source'] = 1
ciss['source'] = 2

for med in medical_records:
    nass[med] = 0

In [9]:
nass

Unnamed: 0.1,Unnamed: 0,ABELTAVL,ABELTUSE,ABELTYPE,ABLTFAIL,AGE,BAGAVAIL,BAGAVRPT,BAGDEPLY,BAGEVENT,...,delta v confidence level,Body Category,CASEWGT,source,COMORBOTH,OSTEOCOND,SPINEDEGEN,IMPAIREDCOAG,IMPLANTFUS,CARDIOCOND
0,0,0.0,0.0,0.0,0.0,38.0,1.0,1.0,7.0,97.0,...,0.0,3.0,45.291,1,0,0,0,0,0,0
1,1,0.0,0.0,0.0,0.0,65.0,1.0,2.0,1.0,1.0,...,1.0,3.0,29.968,1,0,0,0,0,0,0
2,2,0.0,0.0,0.0,0.0,43.0,1.0,1.0,1.0,1.0,...,3.0,0.0,42.901,1,0,0,0,0,0,0
3,3,0.0,0.0,0.0,0.0,49.0,1.0,2.0,1.0,1.0,...,1.0,1.0,24.948,1,0,0,0,0,0,0
4,4,0.0,0.0,0.0,0.0,30.0,1.0,1.0,1.0,1.0,...,1.0,1.0,24.948,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6741,6741,,,,,32.0,1.0,3.0,7.0,97.0,...,1.0,0.0,30.981,1,0,0,0,0,0,0
6742,6742,,,,,26.0,1.0,3.0,7.0,97.0,...,4.0,0.0,67.026,1,0,0,0,0,0,0
6743,6743,,,,,62.0,1.0,3.0,7.0,97.0,...,1.0,1.0,90.305,1,0,0,0,0,0,0
6744,6744,,,,,43.0,1.0,3.0,7.0,97.0,...,0.0,0.0,40.209,1,0,0,0,0,0,0


In [10]:
feas = set([
    'source', 'MAIS', 'Body Category', 'Model Year', 
    'premovement before collision', 'Curb Weight',
    'Maneuver before collision', 'Clock-form Direction of force',
    'Alcohol Present', 'Distracted in Driving', 'Sex', 'Age', 'height', 'Race', 
    'weight', 'Speed Limit', 'Related to Intersection',
    'Traffic Flow Situation', 'Number of lanes', 'Alignment of Road',
    'Uphill or Downhill', 'Surface Type', 'Surface Condition',
    'Lighting Condition', 'Climate', 'Traffic Condition', 'Pre-event Location',
    'Traffic Conrtol Functioning', 'Crash Type', 'month', 'year', 'Day in Week', 'CASEWGT',
    'delta v confidence level', 'delta v'
])

feas.update(medical_records)
feas

{'Age',
 'Alcohol Present',
 'Alignment of Road',
 'Body Category',
 'CARDIOCOND',
 'CASEWGT',
 'COMORBOTH',
 'Climate',
 'Clock-form Direction of force',
 'Crash Type',
 'Curb Weight',
 'Day in Week',
 'Distracted in Driving',
 'IMPAIREDCOAG',
 'IMPLANTFUS',
 'Lighting Condition',
 'MAIS',
 'Maneuver before collision',
 'Model Year',
 'Number of lanes',
 'OSTEOCOND',
 'Pre-event Location',
 'Race',
 'Related to Intersection',
 'SPINEDEGEN',
 'Sex',
 'Speed Limit',
 'Surface Condition',
 'Surface Type',
 'Traffic Condition',
 'Traffic Conrtol Functioning',
 'Traffic Flow Situation',
 'Uphill or Downhill',
 'delta v',
 'delta v confidence level',
 'height',
 'month',
 'premovement before collision',
 'source',
 'weight',
 'year'}

In [11]:
cmb = pd.concat([nass, ciss]).drop(labels=['Unnamed: 0'], axis=1)
cmb = cmb[list(feas)]
cmb = cmb.reset_index(drop=True)

In [12]:
cmb.columns.values

array(['Model Year', 'Uphill or Downhill', 'Related to Intersection',
       'Maneuver before collision', 'Clock-form Direction of force',
       'Day in Week', 'Race', 'IMPLANTFUS', 'year', 'SPINEDEGEN',
       'Number of lanes', 'Body Category', 'Alignment of Road',
       'OSTEOCOND', 'Curb Weight', 'IMPAIREDCOAG', 'source',
       'Distracted in Driving', 'Age', 'delta v confidence level',
       'Traffic Conrtol Functioning', 'weight', 'Pre-event Location',
       'Lighting Condition', 'Climate', 'COMORBOTH', 'Traffic Condition',
       'Traffic Flow Situation', 'premovement before collision', 'month',
       'delta v', 'MAIS', 'Alcohol Present', 'CARDIOCOND', 'Surface Type',
       'Crash Type', 'height', 'CASEWGT', 'Sex', 'Surface Condition',
       'Speed Limit'], dtype=object)

In [14]:
cmb['CASEWGT']

0          45.291000
1          29.968000
2          42.901000
3          24.948000
4          24.948000
            ...     
12021     321.810619
12022     644.040510
12023    4911.585968
12024     994.960746
12025    3380.364729
Name: CASEWGT, Length: 12026, dtype: float64

# 数据类型转换：整型int

处理： replace(np.nan, 65536) -- 换为无重和数！

In [15]:
cmb_change = cmb.replace(np.nan, 65536)
cmb_change = cmb_change.astype(int)

# MAIS三分为InjurySeverity：无伤（0），轻伤（1-2），重伤（3+）

In [16]:
def change_mais(data: pd.DataFrame):
    data['InjurySeverity'] = data['MAIS']
    
    for idx in data.index:
        if data.loc[idx, 'MAIS'] >= 3:
            data.loc[idx, 'InjurySeverity'] = 2  # 'Severe Injury'
        elif 0 < data.loc[idx, 'MAIS'] < 3:
            data.loc[idx, 'InjurySeverity'] = 1  # 'Slight Injury'
        else:
            data.loc[idx, 'InjurySeverity'] = 0  # 'No Injury'
            
    return data


In [17]:
cmb_change = change_mais(cmb_change)


In [18]:
print(cmb_change['InjurySeverity'].value_counts(), len(cmb_change[cmb_change['MAIS'] > 2]))
cmb_change = cmb_change.drop(columns=['MAIS'])

InjurySeverity
1    6582
0    4501
2     943
Name: count, dtype: int64 943


# 保存数据

In [19]:
cmb_change

Unnamed: 0,Model Year,Uphill or Downhill,Related to Intersection,Maneuver before collision,Clock-form Direction of force,Day in Week,Race,IMPLANTFUS,year,SPINEDEGEN,...,Alcohol Present,CARDIOCOND,Surface Type,Crash Type,height,CASEWGT,Sex,Surface Condition,Speed Limit,InjurySeverity
0,2008,1,3,1,170,4,1,0,2009,0,...,0,0,2,1,170,45,2,1,89,1
1,2003,1,2,1,350,7,1,0,2009,0,...,0,0,2,4,163,29,2,1,56,1
2,2005,4,0,2,350,1,1,0,2009,0,...,0,0,2,6,157,42,2,3,89,1
3,2005,2,3,1,30,5,1,0,2009,0,...,0,0,2,4,180,24,1,1,89,1
4,2004,1,2,65536,350,7,1,0,2009,0,...,0,0,2,4,168,24,2,1,72,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12021,2018,1,2,4,10,5,1,0,2022,0,...,0,0,2,4,178,321,2,2,97,1
12022,2019,4,2,2,350,4,3,65536,2022,65536,...,0,65536,2,4,188,644,1,1,56,0
12023,2020,1,2,9,30,7,1,65536,2022,65536,...,0,65536,2,4,157,4911,2,1,56,0
12024,2022,1,2,3,350,6,1,65536,2022,65536,...,0,65536,2,4,185,994,1,1,72,0


In [21]:
cmb_change.to_csv(os.path.join(os.curdir,
                          'Combined/',
                          'CombineNassCiss.csv'), encoding='utf-8')

# 特征值离散化

In [22]:
import pandas as pd
import numpy as np
import os, datetime, math

In [23]:
cmb_path = os.path.join(os.path.dirname(os.curdir), 'Combined/',
                         'CombineNassCiss.csv')

file = pd.read_csv(cmb_path)
copyfile = file.copy(deep=True)

In [24]:
file

Unnamed: 0.1,Unnamed: 0,Model Year,Uphill or Downhill,Related to Intersection,Maneuver before collision,Clock-form Direction of force,Day in Week,Race,IMPLANTFUS,year,...,Alcohol Present,CARDIOCOND,Surface Type,Crash Type,height,CASEWGT,Sex,Surface Condition,Speed Limit,InjurySeverity
0,0,2008,1,3,1,170,4,1,0,2009,...,0,0,2,1,170,45,2,1,89,1
1,1,2003,1,2,1,350,7,1,0,2009,...,0,0,2,4,163,29,2,1,56,1
2,2,2005,4,0,2,350,1,1,0,2009,...,0,0,2,6,157,42,2,3,89,1
3,3,2005,2,3,1,30,5,1,0,2009,...,0,0,2,4,180,24,1,1,89,1
4,4,2004,1,2,65536,350,7,1,0,2009,...,0,0,2,4,168,24,2,1,72,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12021,12021,2018,1,2,4,10,5,1,0,2022,...,0,0,2,4,178,321,2,2,97,1
12022,12022,2019,4,2,2,350,4,3,65536,2022,...,0,65536,2,4,188,644,1,1,56,0
12023,12023,2020,1,2,9,30,7,1,65536,2022,...,0,65536,2,4,157,4911,2,1,56,0
12024,12024,2022,1,2,3,350,6,1,65536,2022,...,0,65536,2,4,185,994,1,1,72,0


## Change Distracted

In [25]:
def change_distracted(data: pd.DataFrame):
    data['Distracted'] = data['Distracted in Driving']
    
    for idx in data.index:
        if data.loc[idx, 'Distracted in Driving'] not in (1, 65536):
            data.loc[idx, 'Distracted'] = 2  #  Distracted
            
    return data

change_distracted(copyfile)
copyfile['Distracted'].value_counts()

Distracted
1        7194
65536    2481
2        2351
Name: count, dtype: int64

## Change SPLIMIT 0 --> INF

In [26]:
copyfile.loc[copyfile['Speed Limit'] == 0, 'Speed Limit'] = max(copyfile['Speed Limit'])
copyfile['Speed Limit'].value_counts()

Speed Limit
56     2643
72     2427
40     1434
48     1376
64     1301
89     1214
80      622
105     373
97      240
113     223
129      81
24       32
32       29
121      22
16        5
8         4
Name: count, dtype: int64

## Change Climate

In [27]:
# nass 18: clear == ciss: 1
copyfile.loc[copyfile['Climate'] == 18, 'Climate'] = 1

# nass 19: cloudy == ciss: 8
copyfile.loc[copyfile['Climate'] == 19, 'Climate'] = 8

# nass 12: rain == ciss: 2
copyfile.loc[copyfile['Climate'] == 12, 'Climate'] = 2

# nass 14: snow == ciss: 4
copyfile.loc[copyfile['Climate'] == 14, 'Climate'] = 4

# nass 15: blowing snow == ciss: 9
copyfile.loc[copyfile['Climate'] == 15, 'Climate'] = 9

# nass 11: fog == ciss: 5
copyfile.loc[copyfile['Climate'] == 11, 'Climate'] = 5

# nass 13 & 20: sleet-hail == ciss: 3
copyfile.loc[copyfile['Climate'] == 13, 'Climate'] = 3
copyfile.loc[copyfile['Climate'] == 20, 'Climate'] = 3

# nass 16: crosswinds == ciss: 6
copyfile.loc[copyfile['Climate'] == 16, 'Climate'] = 6

# nass 17: boiling sand == ciss: 7
copyfile.loc[copyfile['Climate'] == 17, 'Climate'] = 7

# nass 21: Freezing Rain or Freezing Drizzle == ciss: 10
copyfile.loc[copyfile['Climate'] == 21, 'Climate'] = 10

copyfile['Climate'].value_counts()

Climate
1     9074
8     1537
2     1053
4      202
9       65
5       63
3       14
10      11
6        6
7        1
Name: count, dtype: int64

In [28]:
copyfile = copyfile.drop(columns=['Unnamed: 0', 'Distracted in Driving'])
copyfile = copyfile.rename(columns={'Distracted': 'Distracted in Driving'})

In [29]:
copyfile.columns

Index(['Model Year', 'Uphill or Downhill', 'Related to Intersection',
       'Maneuver before collision', 'Clock-form Direction of force',
       'Day in Week', 'Race', 'IMPLANTFUS', 'year', 'SPINEDEGEN',
       'Number of lanes', 'Body Category', 'Alignment of Road', 'OSTEOCOND',
       'Curb Weight', 'IMPAIREDCOAG', 'source', 'Age',
       'delta v confidence level', 'Traffic Conrtol Functioning', 'weight',
       'Pre-event Location', 'Lighting Condition', 'Climate', 'COMORBOTH',
       'Traffic Condition', 'Traffic Flow Situation',
       'premovement before collision', 'month', 'delta v', 'Alcohol Present',
       'CARDIOCOND', 'Surface Type', 'Crash Type', 'height', 'CASEWGT', 'Sex',
       'Surface Condition', 'Speed Limit', 'InjurySeverity',
       'Distracted in Driving'],
      dtype='object')

In [31]:
copyfile.to_csv(os.path.join(os.curdir,
                          'Combined/',
                          'CombineNassCiss.csv'), encoding='utf-8')