# Descriptive analysis

In [1]:
%load_ext autoreload
%autoreload 2

In [63]:
import pandas as pd
import os
from pathlib import Path
import json
import numpy as np

# Project folders sit two levels above the current working directory.
project_root = Path(os.getcwd()).parent.parent
figure_path = os.path.join(project_root, 'figures/')
data_path = os.path.join(project_root, 'data/')

# Parse the JSON content of value_mapping.txt from the data folder.
mapping_file = os.path.join(data_path, 'value_mapping.txt')
with open(mapping_file, encoding='utf-8') as json_file:
    value_dict = json.load(json_file)

In [3]:
# Load data
crash_csv = os.path.join(data_path, 'data_s3.csv')
df = pd.read_csv(crash_csv, low_memory=False)

# Print the distinct values found in every column.
for var, column in df.items():
    print(var, column.unique(), '\n')

id [     1      2      3 ... 237253 237254 237255] 

type ['11' '35' '21' 'Others' '19' 'Unknown' '29' '12' '312' '36' '223' '38'
 '34' '33' '999'] 

weather ['1' '2' '3' 'Unknown' '4' '89'] 

reason ['1094' '1043' '1225' '1103' 'Others' 'Unknown' '1074' '2006' '1313'
 '2005' '1205' '2024' '1046' '1316' '2009' '2007' '2004' '1302' '3026'
 '1042'] 

road_type ['21' '11' '29' '10' '22' 'Unknown' '27' '25' '12' '26' '14' '13'] 

gender ['1' '0' 'Unknown'] 

age ['(45, 50]' '(60, 65]' '(25, 30]' '(18, 25]' '(30, 35]' '(35, 40]'
 '(50, 55]' '(40, 45]' 'Unknown' '(55, 60]' '< 18' '> 70' '(65, 70]'] 

edu ['Unknown' '2' '3' '5' '1' '4' '0' '6'] 

respon ['0' '1' '2' '3' '4' 'Unknown' '5'] 

veh_type ['2' '4' '3' '6' '1' 'Unknown' '5'] 

travel_mode ['3' '2' '1' 'Unknown' '4'] 

injs ['= 0' '(0, 4]' '(4, 9]' '> 9'] 

deaths ['= 0' '(0, 4]' '(4, 9]'] 

weekday [3 4 5 6 7 1 2] 

hour [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23] 

injs_num [ 0  2  4  1  8  6 10  3  5 

## 1 Data summary: variables and levels
### 1.1 Summarize variables and their levels

In [4]:
# Rename columns
# Raw column name -> readable name used in the rest of the notebook.
var_mapping = {
    'weekday': 'Day of week',
    'hour': 'Time of day',
    'weather': 'Weather',
    'cluster': 'Land-use cluster',
    'type': 'Crash type',
    'reason': 'Crash causation',
    'road_type': 'Road type',
    'veh_type': 'Vehicle type',
    'injs': 'Injuries (bin)',
    'deaths': 'Deaths (bin)',
    'injs_num': 'Injuries',
    'deaths_num': 'Deaths',
    'age': 'Age',
    'edu': 'Education level',
    'gender': 'Gender',
    'respon': 'Responsibility',
    'travel_mode': 'Travel method',
}

df = df.rename(columns=var_mapping)

In [5]:
# Extract levels
col_list = df.columns.tolist()
col_list.remove('id')

# One (var, level) row per distinct value of each remaining column.
level_df_list = []
for var in col_list:
    distinct_levels = df[var].unique()
    level_df_list.append(pd.DataFrame({'var': var, 'level': distinct_levels},
                                      columns=['var', 'level']))

df_level = pd.concat(level_df_list)
df_level.to_clipboard(index=False)
df.iloc[0]

id                         1
Crash type                11
Weather                    1
Crash causation         1094
Road type                 21
Gender                     1
Age                 (45, 50]
Education level      Unknown
Responsibility             0
Vehicle type               2
Travel method              3
Injuries (bin)           = 0
Deaths (bin)             = 0
Day of week                3
Time of day                0
Injuries                   0
Deaths                     0
Land-use cluster           2
Name: 0, dtype: object

### 1.2 Manually define the translation of levels for each variable
The translations are stored in docs/var_level_translate.xlsx.


In [6]:
## Prepare table for write-up
translate_file = os.path.join(Path(os.getcwd()).parent.parent, 'docs/var_level_translate.xlsx')
df_lvs = pd.read_excel(translate_file)

# One row per variable: its name, its number of levels, and the levels written
# as "index) English label" in level_index order, joined with '; '.
df_var_list = []
for var, frame in df_lvs.groupby('var'):
    ordered = frame.sort_values(by='level_index')
    entries = [str(idx) + ') ' + str(label)
               for idx, label in zip(ordered['level_index'], ordered['en'])]
    ld = '; '.join(entries)
    summary_row = pd.DataFrame([(var, str(len(frame)), ld)],
                               columns=('Variable', 'Level number', 'Level description'))
    df_var_list.append(summary_row)

df_var_tb = pd.concat(df_var_list)
df_var_tb.to_clipboard(index=False)

## 2 Replace numerical levels with English abbreviations for easy description

In [7]:
# Cast every column of the translation table to str so that its levels can be
# compared with the string-cast data columns.
df_lvs = df_lvs.astype(dtype=str)

In [8]:
# Replace each coded level with its abbreviation from the translation table.
# 'id', 'Injuries' and 'Deaths' are left untouched.
for var in df.columns:
    if var in ('id', 'Injuries', 'Deaths'):
        continue
    var_levels = df_lvs.loc[df_lvs['var'] == var, :]
    lvs_dict = dict(zip(var_levels['level'], var_levels['abbr']))
    codes = df[var].astype(str)
    # Fail with the offending variable and levels instead of a bare KeyError.
    unmapped = sorted(set(codes.unique()) - set(lvs_dict))
    if unmapped:
        raise KeyError(f'No translation for levels {unmapped} of variable {var!r}')
    # Plain column assignment replaces the column outright; `.loc[:, var] = ...`
    # tries to write into the existing (possibly integer) column instead.
    df[var] = codes.map(lvs_dict)
df.head()

Unnamed: 0,id,Crash type,Weather,Crash causation,Road type,Gender,Age,Education level,Responsibility,Vehicle type,Travel method,Injuries (bin),Deaths (bin),Day of week,Time of day,Injuries,Deaths,Land-use cluster
0,1,CT3,Sunny,CC8,RT6,Male,"(45, 50]",Unknown,No,Bus,Driving motor vehicle,= 0,= 0,Wed,0,0,0,LUC2
1,1,CT3,Sunny,CC8,RT6,Female,"(45, 50]",Unknown,Full,Bus,Driving motor vehicle,= 0,= 0,Wed,0,0,0,LUC2
2,2,CT3,Sunny,CC2,RT2,Male,"(60, 65]",EL3,No,Motorcycle,Driving motor vehicle,= 0,= 0,Wed,0,0,0,LUC5
3,2,CT3,Sunny,CC2,RT2,Male,"(25, 30]",EL3,Full,Truck,Driving motor vehicle,= 0,= 0,Wed,0,0,0,LUC5
4,3,CT3,Sunny,CC9,RT11,Female,"(18, 25]",Unknown,Equal,Others,Driving non-motor vehicle,"(0, 4]",= 0,Wed,0,2,0,LUC2


In [9]:
# Keep only the first record of every crash id, so that the crash attributes
# can be examined with one row per crash.
is_repeat = df.duplicated(subset=['id'])
df_rec = df.loc[~is_repeat]


## 3 Statistics of different crash attributes
These results are summarised in docs/article_tables.xlsx.

In [10]:
# Crashes are counted on the de-duplicated frame, participants on the full one.
n_crashes = len(df_rec)
n_participants = len(df)
print(f'Total number of crashes: {n_crashes}')
print(f'Total number of the involved traffic participants: {n_participants}')

Total number of crashes: 237255
Total number of the involved traffic participants: 436412


In [40]:
def basic(data):
    """Summarise crashes, deaths and injuries for one level of the grouping variable.

    Meant to be used as ``df_rec.groupby(var).apply(basic)``. Besides ``data``
    it reads three notebook-level names: ``var`` (the grouping column),
    ``df_lvs`` (translation table with columns 'var', 'en' and 'abbr') and
    ``df_rec`` (its length and its 'Deaths'/'Injuries' totals are the
    denominators of the percentages).

    Parameters
    ----------
    data : pandas.DataFrame
        Rows of one group; must contain the ``var`` column, 'Injuries' and
        'Deaths'.

    Returns
    -------
    pandas.Series
        'Var en' (English label of the level), 'Crash number', 'Crash %',
        'Deaths number', 'Deaths %', 'Injuries number', 'Injuries %'.

    Raises
    ------
    IndexError
        If no row of ``data`` matches an abbreviation of ``var`` in ``df_lvs``.
    """
    # The translation table may hold several raw levels with the same
    # abbreviation. Keep one row per abbreviation so that the join below cannot
    # duplicate records and inflate the counts and sums.
    labels = (df_lvs.loc[df_lvs['var'] == var, ['en', 'abbr']]
              .drop_duplicates(subset='abbr'))
    data = pd.merge(data, labels, left_on=var, right_on='abbr')
    num = len(data)
    injs = data['Injuries'].sum()
    deaths = data['Deaths'].sum()
    return pd.Series({'Var en': data.iloc[0]['en'],
                      'Crash number': num, 'Crash %': num / len(df_rec) * 100,
                      'Deaths number': deaths, 'Deaths %': deaths / df_rec['Deaths'].sum() * 100,
                      'Injuries number': injs, 'Injuries %': injs / df_rec['Injuries'].sum() * 100
                      })

### 3.1 Travel method (Travel method)
Impact on: number of crashes, deaths, injuries.

In [42]:
# Per-level crash, death and injury summary by travel method, sent to the clipboard.
var = 'Travel method'
by_level = df_rec.groupby(var).apply(basic)
by_level.to_clipboard()

### 3.2 Road type (Road type)
Impact on: number of crashes, deaths, injuries.

In [43]:
# Per-level crash, death and injury summary by road type, sent to the clipboard.
var = 'Road type'
by_level = df_rec.groupby(var).apply(basic)
by_level.to_clipboard()


### 3.3 Crash type (Crash type)
Impact on: number of crashes, deaths, injuries.

In [36]:
# Per-level crash, death and injury summary by crash type, sent to the clipboard.
var = 'Crash type'
by_level = df_rec.groupby(var).apply(basic)
by_level.to_clipboard()


### 3.4 Weather (Weather)
Impact on: number of crashes, deaths, injuries.

In [37]:
# Per-level crash, death and injury summary by weather, sent to the clipboard.
var = 'Weather'
by_level = df_rec.groupby(var).apply(basic)
by_level.to_clipboard()

#### 3.4.1 Compare with the overall weather
Look at the weather records downloaded from OpenWeather.

In [51]:
# Read only the timestamp and the main weather label of each weather record.
weather_csv = os.path.join(data_path, 'weather_shenzhen_2014-2016.csv')
df_weather = pd.read_csv(weather_csv, usecols=['dt_iso', 'weather_main'])

# Keep the part of the timestamp before the first space.
df_weather['dt_iso'] = df_weather['dt_iso'].map(lambda stamp: stamp.split(' ')[0])

# Number of records per weather label.
df_weather.groupby('weather_main').size()

weather_main
Clear            3100
Clouds          16244
Haze                5
Mist               22
Rain             6936
Thunderstorm        5
dtype: int64

### 3.5 Crash causation (Crash causation)
Impact on: number of crashes, deaths, injuries.

In [38]:
# Per-level crash, death and injury summary by crash causation, sent to the clipboard.
var = 'Crash causation'
by_level = df_rec.groupby(var).apply(basic)
by_level.to_clipboard()

### 3.6 Impact of month of year, day of week, and time of day
Impact on: number of crashes.

Running scr/plot_crash_by_time.R produces the plots.

### 3.7 Land-use cluster
Impact on: number of crashes.

In [45]:
# Per-level crash, death and injury summary by land-use cluster, sent to the clipboard.
var = 'Land-use cluster'
by_level = df_rec.groupby(var).apply(basic)
by_level.to_clipboard()

## 4 Statistics of different crash attributes: traffic participants
These results are summarised in docs/article_tables.xlsx.
This part focuses only on the traffic participants who were driving a motor vehicle.

In [47]:
# Select motor-vehicle drivers
# From here on `df` holds only the participants who were driving a motor vehicle.
is_driver = df['Travel method'].eq('Driving motor vehicle')
df = df.loc[is_driver, :]

n_drivers = len(df)
print(f'Number of drivers involved in the crashes: {n_drivers}')

Number of drivers involved in the crashes: 309194


In [56]:
# Column order of the responsibility table returned by `res`.
tbl_head = ['Total', 'Full', 'Full%', 'Major', 'Major%', 'Equal', 'Equal%',
            'Minor', 'Minor%', 'No', 'No%', 'Unknown', 'Unknown%']

def res(data):
    """Responsibility breakdown for one group of traffic participants.

    Meant to be used as ``df.groupby(<column>).apply(res)``. Besides ``data``
    it reads the notebook-level frame ``df`` (its length is the denominator of
    'Total') and the list ``tbl_head`` (the returned index).

    Parameters
    ----------
    data : pandas.DataFrame
        Rows of one group; must contain a 'Responsibility' column.

    Returns
    -------
    pandas.Series
        Indexed by ``tbl_head``. 'Total' is the group's share of ``df`` in
        percent. Each responsibility label holds the number of rows with that
        label and '<label>%' its share within the group in percent. Labels that
        do not occur in the group are 0.
    """
    # Count straight from the group. The earlier join against the translation
    # table added nothing to the result and duplicated rows whenever two raw
    # levels shared an abbreviation.
    counts = data['Responsibility'].value_counts()
    group_size = len(data)
    r = {'Total': group_size / len(df) * 100}
    for level, count in counts.items():
        r[level] = count
        r[level + '%'] = count / group_size * 100
    # Keep only the table columns, in table order; absent labels become 0.
    return pd.Series(r).reindex(tbl_head, fill_value=0)

### 4.1 Gender of involved traffic participants
Impact on: responsibility.

In [54]:
# Responsibility breakdown of the selected drivers by gender, sent to the clipboard.
by_gender = df.groupby('Gender').apply(res)
by_gender.to_clipboard()

### 4.2 Age of involved traffic participants
Impact on: responsibility.

In [58]:
# Responsibility breakdown of the selected drivers by age band, sent to the clipboard.
by_age = df.groupby('Age').apply(res)
by_age.to_clipboard()

### 4.3 Education level of involved traffic participants
Impact on: responsibility.

In [59]:
# Responsibility breakdown of the selected drivers by education level, sent to the clipboard.
by_education = df.groupby('Education level').apply(res)
by_education.to_clipboard()

### 4.4 Vehicle type of involved traffic participants
Impact on: responsibility.

In [61]:
# Responsibility breakdown of the selected drivers by vehicle type, sent to the clipboard.
by_vehicle = df.groupby('Vehicle type').apply(res)
by_vehicle.to_clipboard()

## 5 Prepare data for Bayesian network modelling
1 Focus on motor-vehicle drivers and their crashes, i.e. the same data as in Section 4 of this notebook.

2 Drop Education level because of too many missing values.

3 Merge injuries and deaths for the ease of interpretation.

4 Keep only the crash records with complete information, i.e., no "Unknown" or "Others" values in any field.

In [69]:
# Drop Education level
# `drop` returns a new frame, so `df` itself keeps the column.
df2bn = df.drop(columns='Education level')

In [70]:
# Merge injuries and deaths
var = 'Injuries and deaths'
casualties = df2bn['Injuries'] + df2bn['Deaths']

# Bin the total and label the bins directly: (-1, 0] -> '= 0', (0, 4] and
# (4, 9] keep their interval text, (9, 1000] -> '> 9'.
binned = pd.cut(casualties, bins=[-1, 0, 4, 9, 1000],
                labels=['= 0', '(0, 4]', '(4, 9]', '> 9'])

# Totals outside (-1, 1000] fall into no bin and come back as NaN; mark them
# 'Unknown' explicitly. The result is assigned as a whole column of plain
# strings: `Categorical.add_categories(..., inplace=True)` is no longer
# available in pandas 2.x, and an in-place fill on the result of
# `df2bn.loc[:, var]` is not guaranteed to reach `df2bn`.
df2bn[var] = binned.astype(object).fillna('Unknown')

In [71]:
# Keep the crash records with complete information
# 'Unknown' and 'Others' are turned into missing values, then every row that
# has a missing value in any column is dropped.
df2bn = (
    df2bn
    .replace(to_replace=['Unknown', 'Others'], value=np.nan)
    .dropna()
)

In [72]:
# Save it
n_records = len(df2bn)
print(f'Number of crashes/drivers to construct the Bayesian network: {n_records}')

out_file = os.path.join(data_path, 'data_s4.csv')
df2bn.to_csv(out_file, index=False)

Number of crashes/drivers to construct the Bayesian network: 235901
