# Feature Engineering
In this notebook, we engineer count-based and indicator features from the existing features.

In [1]:
# Suppress all warnings (the filter below is not limited to deprecation warnings)
import warnings
warnings.filterwarnings("ignore")

# Data manipulation
%matplotlib inline
import pandas as pd
import numpy as np
import time

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Set font scale and style
plt.rcParams.update({'font.size': 15})

# Load data

In [2]:
# Read the cleaned dataset, report its dimensions and preview the first rows
df = pd.read_csv('../data/clean_data.csv')
print(f'Data size: {df.shape}')
df.head()

Data size: (6771768, 23)


Unnamed: 0,c_year,c_mnth,c_wday,c_hour,class,c_vehs,c_conf,c_rcfg,c_wthr,c_rsur,...,v_type,v_year,p_id,p_sex,p_age,p_psn,p_isev,p_safe,p_user,c_case
0,1999,January,Monday,20.0,0,2.0,Right turn,,Clear and sunny,Icy,...,Other trucks and vans,1990.0,1.0,M,41.0,Driver,No Injury,,Motor Vehicle Driver,752
1,1999,January,Monday,20.0,0,2.0,Right turn,,Clear and sunny,Icy,...,Light Duty Vehicle,1987.0,1.0,M,19.0,Driver,No Injury,,Motor Vehicle Driver,752
2,1999,January,Monday,20.0,0,2.0,Right turn,,Clear and sunny,Icy,...,Light Duty Vehicle,1987.0,2.0,F,20.0,"Front row, right outboard",Injury,Safety device used,Motor Vehicle Passenger,752
3,1999,January,Monday,8.0,0,1.0,Hit a moving object,,"Freezing rain, sleet, hail",Snow,...,Light Duty Vehicle,1986.0,1.0,M,46.0,Driver,No Injury,,Motor Vehicle Driver,753
4,1999,January,Monday,8.0,0,1.0,Hit a moving object,,"Freezing rain, sleet, hail",Snow,...,,,1.0,M,5.0,Pedestrian,Injury,,Pedestrian,753


In [3]:
# Summarize each column's dtype and the frame's memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6771768 entries, 0 to 6771767
Data columns (total 23 columns):
 #   Column  Dtype  
---  ------  -----  
 0   c_year  int64  
 1   c_mnth  object 
 2   c_wday  object 
 3   c_hour  float64
 4   class   int64  
 5   c_vehs  float64
 6   c_conf  object 
 7   c_rcfg  object 
 8   c_wthr  object 
 9   c_rsur  object 
 10  c_raln  object 
 11  c_traf  object 
 12  v_id    float64
 13  v_type  object 
 14  v_year  float64
 15  p_id    float64
 16  p_sex   object 
 17  p_age   float64
 18  p_psn   object 
 19  p_isev  object 
 20  p_safe  object 
 21  p_user  object 
 22  c_case  int64  
dtypes: float64(6), int64(3), object(14)
memory usage: 1.2+ GB


# 1. Day, month, and year count

In [4]:
# Frequency tables: one row per category with the number of records in it.
# rename_axis(...) + reset_index(name=...) pins both column names explicitly,
# so the result does not depend on how value_counts() labels its output
# (pandas >= 2.0 names the values 'count' and puts the column name on the
# index, which breaks a rename keyed on 'index' / the original column name).

# Day count
df_day = df['c_wday'].value_counts().rename_axis('c_wday').reset_index(name='wday_ct')

# Month count
df_mnth = df['c_mnth'].value_counts().rename_axis('c_mnth').reset_index(name='mnth_ct')

# Year count
df_year = df['c_year'].value_counts().rename_axis('c_year').reset_index(name='year_ct')

In [5]:
# Attach each count to the main dataframe by looking up the row's category.
# This yields the same values as a left join on the key, but assigning a
# column is idempotent (re-running the cell does not create *_x / *_y
# duplicates) and does not build a new copy of the whole frame per feature.
df['wday_ct'] = df['c_wday'].map(df_day.set_index('c_wday')['wday_ct'])
df['mnth_ct'] = df['c_mnth'].map(df_mnth.set_index('c_mnth')['mnth_ct'])
df['year_ct'] = df['c_year'].map(df_year.set_index('c_year')['year_ct'])

# 2. Hour count

In [6]:
# Hour count: number of records per hour of day.
# rename_axis(...) + reset_index(name=...) fixes the column names regardless of
# how value_counts() labels its output in the installed pandas version.
df_hour = df['c_hour'].value_counts().rename_axis('c_hour').reset_index(name='hour_ct')

# Attach the count to the main dataframe (same values as a left join on
# c_hour). value_counts() skips missing values by default, so rows with a
# missing hour get NaN. Column assignment keeps the cell idempotent.
df['hour_ct'] = df['c_hour'].map(df_hour.set_index('c_hour')['hour_ct'])

# 3. Other count based features

In [7]:
# Frequency tables for the remaining categorical features.
# rename_axis(...) + reset_index(name=...) pins both column names explicitly,
# so the result does not depend on how value_counts() labels its output
# (pandas >= 2.0 names the values 'count' and puts the column name on the
# index, which breaks a rename keyed on 'index' / the original column name).

# Injury severity (p_isev)
df_isev = df['p_isev'].value_counts().rename_axis('p_isev').reset_index(name='p_isev_ct')

# Collision configuration (c_conf)
df_conf = df['c_conf'].value_counts().rename_axis('c_conf').reset_index(name='conf_ct')

# Safety device used (p_safe)
df_safe = df['p_safe'].value_counts().rename_axis('p_safe').reset_index(name='p_safe_ct')

# Weather (c_wthr)
df_wthr = df['c_wthr'].value_counts().rename_axis('c_wthr').reset_index(name='wthr_ct')

# Vehicle type (v_type)
df_vtype = df['v_type'].value_counts().rename_axis('v_type').reset_index(name='v_type_ct')

# Roadway configuration (c_rcfg)
df_rcfg = df['c_rcfg'].value_counts().rename_axis('c_rcfg').reset_index(name='c_rcfg_ct')

In [8]:
# Attach each count to the main dataframe by looking up the row's category.
# This yields the same values as a left join on the key (categories absent
# from the count table, including missing values, get NaN), but assigning a
# column is idempotent (re-running the cell does not create *_x / *_y
# duplicates) and does not build a new copy of the whole frame per feature.
df['p_isev_ct'] = df['p_isev'].map(df_isev.set_index('p_isev')['p_isev_ct'])
df['conf_ct'] = df['c_conf'].map(df_conf.set_index('c_conf')['conf_ct'])
df['p_safe_ct'] = df['p_safe'].map(df_safe.set_index('p_safe')['p_safe_ct'])
df['wthr_ct'] = df['c_wthr'].map(df_wthr.set_index('c_wthr')['wthr_ct'])
df['v_type_ct'] = df['v_type'].map(df_vtype.set_index('v_type')['v_type_ct'])
df['c_rcfg_ct'] = df['c_rcfg'].map(df_rcfg.set_index('c_rcfg')['c_rcfg_ct'])

In [9]:
# Ratios between pairs of count features, listed as
# (new column, numerator column, denominator column)
ratio_specs = [
    ('hour_ct_per_day_ct', 'hour_ct', 'wday_ct'),
    ('hour_ct_per_month_ct', 'hour_ct', 'mnth_ct'),
    ('pisev_ct_per_hour_ct', 'p_isev_ct', 'hour_ct'),
    ('conf_ct_per_hour_ct', 'conf_ct', 'hour_ct'),
    ('psafe_ct_per_hour_ct', 'p_safe_ct', 'hour_ct'),
    ('wthr_ct_per_hour_ct', 'wthr_ct', 'hour_ct'),
    ('vtype_ct_per_hour_ct', 'v_type_ct', 'hour_ct'),
    ('rcfg_ct_per_hour_ct', 'c_rcfg_ct', 'hour_ct'),
]
for ratio_name, numerator, denominator in ratio_specs:
    df[ratio_name] = df[numerator] / df[denominator]

# Indicators and odds ratio

In [10]:
def _odd(df, col1, col2):
    """
    calculate the fatality rate - that is the ratio of the probability 
    of a fatal collision over the probability of a non-fatal collision

    Parameters
    ----------
    df: pandas dataframe
    col1: categorical column with two levels only
    col2: class label with two classes only

    Returns
    ----------
    pivoted dataframe
    """
    pvt = pd.crosstab(df[col1], df[col2])
    pvt['odds'] = pvt[1]/pvt[0]
    pvt['odds ratio'] = pvt.odds['Y']/pvt.odds['N']
    pvt['odds ratio'].loc['N'] = ''
    return pvt

## Head-on collision or not
This indicator suggests that the odds of a fatal outcome are roughly 9.5 times higher for head-on collisions than for all other collision configurations combined.

In [11]:
# Flag head-on collisions ('Y') versus every other configuration ('N');
# rows with a missing c_conf compare unequal and therefore fall under 'N'.
head_on = df['c_conf'].eq('Head-on collision')
df['c_conf_ind'] = np.where(head_on, 'Y', 'N')
_odd(df, 'c_conf_ind', 'class')

class,0,1,odds,odds ratio
c_conf_ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
N,6462783,87313,0.01351,
Y,196510,25162,0.128044,9.47766


# Save as csv

In [12]:
# Write the engineered dataset to disk without the row index
df.to_csv('../data/feat_engr_data.csv', index = False)