# Data preprocessing

Input: data/data_s1.csv

In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before each
# cell executes, so edits to local .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import json
from datetime import datetime

# The data folder sits two directory levels above the notebook's working directory.
project_root = Path(os.getcwd()).parent.parent
data_path = os.path.join(project_root, 'data/')

# value_mapping.txt is parsed as JSON; the resulting dict is used by later cells
# to recode raw values.
mapping_file = os.path.join(data_path, 'value_mapping.txt')
with open(mapping_file, encoding='utf-8') as fh:
    value_dict = json.load(fh)

In [3]:
# Read the input table from the data folder.
input_csv = os.path.join(data_path, "data_s1.csv")
df = pd.read_csv(input_csv)

# These four columns are not of interest for this analysis, so discard them up front.
unused_columns = ['road_name', 'distr', 'miss', 'acc_id']
df.drop(columns=unused_columns, inplace=True)

# Summarise the remaining columns, their dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 436412 entries, 0 to 436411
Data columns (total 19 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   id           436412 non-null  int64  
 1   time         436412 non-null  object 
 2   type         395731 non-null  float64
 3   weather      436412 non-null  int64  
 4   reason       432503 non-null  float64
 5   lat          431092 non-null  float64
 6   long         431092 non-null  float64
 7   road_type    396141 non-null  float64
 8   death        436412 non-null  int64  
 9   death_s      436412 non-null  int64  
 10  inj_b        436412 non-null  int64  
 11  inj_l        436412 non-null  int64  
 12  inj          436412 non-null  int64  
 13  gender       352636 non-null  object 
 14  age          388715 non-null  float64
 15  edu          87774 non-null   object 
 16  respon       358072 non-null  object 
 17  veh_type     392369 non-null  object 
 18  travel_mode  392369 non-

## 1 Process gender, edu, veh_type, deaths & injuries, and time
The externally defined mapping rules are loaded from data/value_mapping.txt.

### 1.1 Prepare for veh_type mapping rules

In [4]:
# One row per distinct raw veh_type label; the missing-value entry is discarded.
veh_type_labels = df['veh_type'].unique().tolist()
df_veh_type = pd.DataFrame({'veh_type': veh_type_labels})
df_veh_type.dropna(inplace=True)
def cate_func(x):
    """Map a raw vehicle-type label to a category code from 1 to 6.

    The keyword groups are tried in order and the first group with a keyword
    contained in ``x`` decides the code, so order matters (a label containing
    both '轿车' and '货车' yields 1). Labels matching no group yield 6.

    Args:
        x: Raw vehicle-type label (a string).

    Returns:
        int: The category code.
    """
    ordered_rules = (
        (('轿车',), 1),                         # sedan / passenger car
        (('货车', '牵引', '挂车', '厢车'), 3),  # truck, tractor unit, trailer, van
        (('客车',), 2),                         # bus / coach
        (('摩托车',), 4),                       # motorcycle
        (('F',), 5),                            # NOTE(review): meaning of the 'F' marker is not visible here
    )
    for keywords, code in ordered_rules:
        if any(keyword in x for keyword in keywords):
            return code
    return 6

# Categorise every distinct label. The previous guard `x != np.nan` was always True
# (NaN never compares equal to anything), so it protected nothing; pd.notna is the
# correct missing-value test and lets a missing label pass through untouched.
df_veh_type['cate'] = df_veh_type['veh_type'].apply(lambda x: cate_func(x) if pd.notna(x) else x)

# Lookup table: raw veh_type label -> category code (plain Python values).
veh_type_dict = dict(zip(df_veh_type['veh_type'], df_veh_type['cate'].tolist()))

### 1.2 Prepare for travel_mode mapping rules

In [5]:
# One row per distinct raw travel_mode label; the missing-value entry is discarded.
travel_mode_labels = df['travel_mode'].unique().tolist()
df_travel_mode = pd.DataFrame({'travel_mode': travel_mode_labels})
df_travel_mode.dropna(inplace=True)
def cate_func_m(x):
    """Map a raw travel-mode label to a category code from 1 to 4.

    Checks run in order and the first match wins:
      1 - label contains '步行' (walking)
      2 - label contains one of the non-motorised keywords listed below
      4 - label contains '其它' (other)
      3 - everything else

    Args:
        x: Raw travel-mode label (a string).

    Returns:
        int: The category code.
    """
    # tricycle, bicycle, handcart, electric bicycle, other non-motor vehicle
    non_motor_keywords = ('三轮车', '自行车', '手推车', '电动自行车', '其它非机动车')

    if '步行' in x:
        return 1
    if any(keyword in x for keyword in non_motor_keywords):
        return 2
    # '其它非机动车' also contains '其它' but was already caught by the check above.
    return 4 if '其它' in x else 3
# Categorise every distinct label. The previous guard `x != np.nan` was always True
# (NaN never compares equal to anything), so it protected nothing; pd.notna is the
# correct missing-value test and lets a missing label pass through untouched.
df_travel_mode['cate'] = df_travel_mode['travel_mode'].apply(lambda x: cate_func_m(x) if pd.notna(x) else x)

# Lookup table: raw travel_mode label -> category code (plain Python values).
travel_mode_dict = dict(zip(df_travel_mode['travel_mode'], df_travel_mode['cate'].tolist()))

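### 1.3 Process gender, edu, respon, veh_type, and travel_mode
gender, edu, and respon are recoded with the externally defined mapping rules in value_dict; veh_type and travel_mode are recoded with the rules prepared in 1.1 and 1.2. Values without a mapping, including missing values, are labelled Unknown.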

In [6]:
def _recode_with_mapping(series, mapping):
    """Recode ``series`` through ``mapping`` and return a new Series of strings.

    A value that is a key of ``mapping`` becomes ``str(int(mapping[value]))``.
    Any other value, including a missing one, becomes "Unknown".
    """
    return series.map(lambda x: str(int(mapping[x])) if x in mapping else "Unknown")


# Column -> lookup table. gender / edu / respon use the externally defined rules in
# value_dict; veh_type / travel_mode use the tables built from the category functions.
recode_rules = {
    'gender': value_dict['gender'],
    'edu': value_dict['edu'],
    'respon': value_dict['respon'],
    'veh_type': veh_type_dict,
    'travel_mode': travel_mode_dict,
}

# Assign the recoded Series back explicitly. The previous pattern
# `df.loc[:, col].fillna("Unknown", inplace=True)` filled a temporary object, which is
# not guaranteed to write through to df (and never does under pandas Copy-on-Write).
for column, mapping in recode_rules.items():
    df[column] = _recode_with_mapping(df[column], mapping)

# Show the resulting label set of each recoded column.
for var in ['gender', 'edu', 'veh_type', 'respon', 'travel_mode']:
    print(var, df[var].unique(), '\n')

gender ['1' '0' 'Unknown'] 

edu ['Unknown' '2' '3' '5' '1' '4' '0' '6'] 

veh_type ['2' '4' '3' '6' '1' 'Unknown' '5'] 

respon ['0' '1' '2' '3' '4' 'Unknown' '5'] 

travel_mode ['3' '2' '1' 'Unknown' '4'] 



### 1.4 Merge deaths and injuries

In [7]:
injury_columns = ['inj', 'inj_b', 'inj_l']
death_columns = ['death', 'death_s']

# Collapse the separate counters into one row-wise total each.
df['injs'] = df[injury_columns].sum(axis=1)
df['deaths'] = df[death_columns].sum(axis=1)

# The component counters are superseded by the totals.
df.drop(columns=injury_columns + death_columns, inplace=True)

# Preview the first three records, one column per record.
df.head(3).T

Unnamed: 0,0,1,2
id,1,1,2
time,2014-01-01 00:00:00,2014-01-01 00:00:00,2014-01-01 00:00:00
type,11,11,11
weather,1,1,1
reason,1094,1094,1043
lat,22.5931,22.5931,22.7825
long,114.016,114.016,113.839
road_type,21,21,11
gender,1,0,1
age,46,47,62


### 1.5 Process time to get weekday (1 for Monday through 7 for Sunday) and hour of day

In [8]:
# Parse all timestamps in one vectorised call instead of a Python-level strptime per row.
# The result is kept in a local Series rather than written back into the object-typed
# 'time' column, which is dropped below anyway.
timestamps = pd.to_datetime(df['time'].str.strip(), format='%Y-%m-%d %H:%M:%S')

# Day of week as 1 (Monday) .. 7 (Sunday): .dt.weekday is 0-based starting on Monday.
df['weekday'] = timestamps.dt.weekday + 1
# Hour of day, 0-23.
df['hour'] = timestamps.dt.hour

df.drop(columns=['time'], inplace=True)
df.iloc[0:3].transpose()

Unnamed: 0,0,1,2
id,1,1,2.0
type,11,11,11.0
weather,1,1,1.0
reason,1094,1094,1043.0
lat,22.5931,22.5931,22.7825
long,114.016,114.016,113.839
road_type,21,21,11.0
gender,1,0,1.0
age,46,47,62.0
edu,Unknown,Unknown,2.0


## 2 Clean up the categories for further analysis
### 2.1 age
Break it into age groups and label missing fields as Unknown.

In [9]:
# Right-closed age bands. The first and last bands get friendlier labels; the inner
# bands keep interval notation, e.g. '(18, 25]'.
age_bins = [0, 18, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 150]
inner_labels = [f'({lo}, {hi}]' for lo, hi in zip(age_bins[1:-2], age_bins[2:-1])]
age_labels = ['< 18'] + inner_labels + ['> 70']

# Missing ages, and ages outside (0, 150], fall in no band and come back as NaN.
age_groups = pd.cut(df['age'], bins=age_bins, labels=age_labels)

# add_categories() returns a new categorical (its `inplace` flag was removed in pandas 2.0),
# so chain the calls and assign the result back instead of mutating a temporary.
df['age'] = age_groups.cat.add_categories("Unknown").fillna("Unknown")
print('age', df['age'].unique(), '\n')

age [(45, 50], (60, 65], (25, 30], (18, 25], (30, 35], ..., Unknown, (55, 60], < 18, > 70, (65, 70]]
Length: 13
Categories (13, object): [< 18 < (18, 25] < (25, 30] < (30, 35] ... (60, 65] < (65, 70] < > 70 < Unknown] 



### 2.2 reason
Select top reasons, set the rest as Others, and label missing fields as Unknown.

In [10]:
## Merge two categories that are essentially the same (1301, 2004 -> 2004)
df.loc[:, 'reason'] = df.loc[:, 'reason'].apply(lambda x: 2004 if x == 1301 else x)

# Check the top 20 reasons
acc_reason = df.drop_duplicates(subset=["id"]).groupby('reason')[['id']].count()/len(df.drop_duplicates(subset=["id"]))*100
acc_reason = acc_reason.reset_index().sort_values(by='id', ascending=False).rename(columns={'id': 'freq'})
acc_reason.loc[:, 'cum_freq'] = acc_reason.loc[:, 'freq'].cumsum()

In [11]:
# Set the top 18 reasons and set nan to Unknown
reason_list = [int(x) for x in acc_reason.head(20).reason if x not in (9901, 9009)]
df.loc[:, 'reason'].fillna("Unknown", inplace=True)

# Convert the rest into str and set the rest as others
df.loc[df['reason'].isin(reason_list), 'reason'] = df.loc[df['reason'].isin(reason_list), 'reason'].apply(lambda x: str(int(x)))
reason_list = [str(int(x)) for x in reason_list] + ['Unknown']
df.loc[~df['reason'].isin(reason_list), 'reason'] = 'Others'

print('reason', df['reason'].unique(), '\n')

reason ['1094' '1043' '1225' '1103' 'Others' 'Unknown' '1074' '2006' '1313'
 '2005' '1205' '2024' '1046' '1316' '2009' '2007' '2004' '1302' '3026'
 '1042'] 



### 2.3 type
1. Merge type 31 and 32 into one, coded as 312 (Rollover)

2. Merge type 22 and 23 into one, coded as 223 (Crushing pedestrians)

3. Selected types: 11, 35, 19, 21, 12, 36, 312, 223, 38, 999, 29, 34, 33. Set the rest as Others.
Missing values set to Unknown.

In [12]:
# Check the top 20 types
acc_type = df.drop_duplicates(subset=["id"]).groupby('type')[['id']].count()/len(df)*100
acc_type = acc_type.reset_index().sort_values(by='id', ascending=False).rename(columns={'id': 'freq'})
acc_type.loc[:, 'cum_freq'] = acc_type.loc[:, 'freq'].cumsum()
acc_type.loc[:, 'type_name'] = acc_type.loc[:, 'type'].apply(lambda x: value_dict['acc_type'][str(int(x))] if str(int(x)) in value_dict['acc_type'] else 'unknown')
acc_type.head(30)


Unnamed: 0,type,freq,cum_freq,type_name
0,11.0,36.139015,36.139015,碰撞运动车辆
13,35.0,4.142874,40.28189,撞固定物
3,19.0,4.044343,44.326233,其他车辆间事故
4,21.0,3.683675,48.009908,刮撞行人
1,12.0,0.481884,48.491792,碰撞静止车辆
14,36.0,0.192479,48.684271,撞非固定物
9,31.0,0.148254,48.832525,侧翻
20,70.0,0.118008,48.950533,unknown
16,38.0,0.07218,49.022712,乘员跌落或抛出
24,999.0,0.066909,49.089622,其他车辆与人事故


In [13]:
# 1 Merge type 31 and 32 into 312
df.loc[:, 'type'] = df.loc[:, 'type'].apply(lambda x: 312 if x in (31, 32) else x)

# 2 Merge type 22 and 23 into 223
df.loc[:, 'type'] = df.loc[:, 'type'].apply(lambda x: 223 if x in (22, 23) else x)

# 3 Select a few types
df.loc[:, 'type'].fillna("Unknown", inplace=True)
type_list = [11, 35, 19, 21, 12, 36, 312, 223, 38, 999, 29, 34, 33]
df.loc[df.type != 'Unknown', 'type'] = df.loc[df.type != 'Unknown', 'type'].apply(lambda x: str(int(x)) if x in type_list else "Others")

print('type', df['type'].unique(), '\n')

type ['11' '35' '21' 'Others' '19' 'Unknown' '29' '12' '312' '36' '223' '38'
 '34' '33' '999'] 



### 2.4 injs
Break it into groups and label missing fields as Unknown.

In [14]:
# Keep a numeric copy of the injury totals under a separate column name.
df['injs_num'] = df['injs'].copy()

# Observed range of the totals. This is now the last expression of the cell so the
# notebook actually displays it; as the first statement its value was discarded.
df['injs'].min(), df['injs'].max()

In [15]:
# Right-closed bands over the injury totals: 0, 1-4, 5-9, 10-100.
injs_bins = [-1, 0, 4, 9, 100]
injs_labels = ['= 0', '(0, 4]', '(4, 9]', '> 9']

# Totals outside (-1, 100] fall in no band and come back as NaN.
injs_groups = pd.cut(df['injs'], bins=injs_bins, labels=injs_labels)

# add_categories() returns a new categorical (its `inplace` flag was removed in pandas 2.0),
# so chain the calls and assign the result back instead of mutating a temporary.
df['injs'] = injs_groups.cat.add_categories("Unknown").fillna("Unknown")

### 2.5 deaths
Break it into groups and label missing fields as Unknown.

In [16]:
# Keep a numeric copy of the death totals under a separate column name.
df['deaths_num'] = df['deaths'].copy()

# Observed range of the totals. This is now the last expression of the cell so the
# notebook actually displays it; as the first statement its value was discarded.
df['deaths'].min(), df['deaths'].max()

In [17]:
# Right-closed bands over the death totals: 0, 1-4, 5-9, 10-100.
deaths_bins = [-1, 0, 4, 9, 100]
deaths_labels = ['= 0', '(0, 4]', '(4, 9]', '> 9']

# Totals outside (-1, 100] fall in no band and come back as NaN.
deaths_groups = pd.cut(df['deaths'], bins=deaths_bins, labels=deaths_labels)

# add_categories() returns a new categorical (its `inplace` flag was removed in pandas 2.0),
# so chain the calls and assign the result back instead of mutating a temporary.
df['deaths'] = deaths_groups.cat.add_categories("Unknown").fillna("Unknown")

### 2.6 road_type

In [18]:
def _road_type_label(x):
    """Return the road-type code as an integer string, or "Unknown" when it is missing."""
    # An existing "Unknown" label is passed through, as the original cell did.
    if pd.isna(x) or x == "Unknown":
        return "Unknown"
    return str(int(x))


# Assign the recoded Series back explicitly; the previous chained
# `df.loc[:, 'road_type'].fillna(..., inplace=True)` acted on a temporary and is not
# guaranteed to write through to df.
df['road_type'] = df['road_type'].map(_road_type_label)

### 2.7 weather
1. Merge 8 and 9 into 89 (Haze or fog).

2. Relabel 5 and 6 as Unknown, because snow and blizzards are not plausible weather conditions in Shenzhen.

In [19]:
def _weather_label(code):
    """Recode one raw weather value.

    missing -> "Unknown"; 8 or 9 -> "89"; 5 or 6 -> "Unknown"; 1-4 -> the code as a
    string. Any other value is returned unchanged, as the original cell did.
    """
    if pd.isna(code):
        return "Unknown"
    if code in (8, 9):
        return "89"
    if code in (5, 6):
        return "Unknown"
    if code in (1, 2, 3, 4):
        return str(int(code))
    return code


# Single pass with an explicit assignment back to df; the previous chained
# `df.loc[:, 'weather'].fillna(..., inplace=True)` acted on a temporary and is not
# guaranteed to write through to df.
df['weather'] = df['weather'].map(_weather_label)

print('weather', df['weather'].unique(), '\n')

weather ['1' '2' '3' 'Unknown' '4' '89'] 



In [20]:
# Write the processed table to data_s2.csv in the data folder, without the index column.
# The 'utf-8-sig' codec prefixes the file with a UTF-8 byte-order mark.
output_csv = os.path.join(data_path, 'data_s2.csv')
df.to_csv(output_csv, encoding='utf-8-sig', index=False)