# Feature Engineering
In this notebook, we engineer count-based features, ratio features, and binary indicator features from the existing features.

In [1]:
# Silence warnings for the rest of the session. Note: with no `category`
# argument this filter hides every warning type, not only deprecation warnings.
import warnings
warnings.filterwarnings("ignore")

# Data manipulation
%matplotlib inline
import pandas as pd
import numpy as np
import time

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Use a default font size of 15 for all matplotlib figures
plt.rcParams['font.size'] = 15

# Load data

In [2]:
# Read the cleaned dataset from disk
clean_path = '../data/clean_data.csv'
df = pd.read_csv(clean_path)

# Report (rows, columns), then preview the first records
print('Data size:', df.shape)
df.head()

Data size (3817613, 23)


Unnamed: 0,c_year,c_mnth,c_wday,c_hour,class,c_vehs,c_conf,c_rcfg,c_wthr,c_rsur,...,v_type,v_year,p_id,p_sex,p_age,p_psn,p_isev,p_safe,p_user,c_case
0,1999,January,Monday,9.0,0,2.0,Right turn,At an intersection,Clear and sunny,"Dry, normal",...,Light Duty Vehicle,1992.0,1.0,F,33.0,Driver,Injury,Safety device used,Motor Vehicle Driver,2890
1,1999,January,Monday,9.0,0,2.0,Right turn,At an intersection,Clear and sunny,"Dry, normal",...,Light Duty Vehicle,1992.0,1.0,F,70.0,Driver,No Injury,Safety device used,Motor Vehicle Driver,2890
2,1999,January,Monday,20.0,0,1.0,Ran off left shoulder,Intersection with parking lot entrance,Clear and sunny,"Dry, normal",...,Light Duty Vehicle,1988.0,1.0,F,38.0,Driver,Injury,Safety device used,Motor Vehicle Driver,4332
3,1999,January,Monday,5.0,0,2.0,Hit a moving object,At an intersection,Raining,Wet,...,Other trucks and vans,1995.0,1.0,M,34.0,Driver,No Injury,Safety device used,Motor Vehicle Driver,5053
4,1999,January,Monday,5.0,0,2.0,Hit a moving object,At an intersection,Raining,Wet,...,Other trucks and vans,1995.0,2.0,M,30.0,"Front row, right outboard",No Injury,Safety device used,Motor Vehicle Passenger,5053


In [3]:
# Summarize the column dtypes and the memory footprint of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3817613 entries, 0 to 3817612
Data columns (total 23 columns):
 #   Column  Dtype  
---  ------  -----  
 0   c_year  int64  
 1   c_mnth  object 
 2   c_wday  object 
 3   c_hour  float64
 4   class   int64  
 5   c_vehs  float64
 6   c_conf  object 
 7   c_rcfg  object 
 8   c_wthr  object 
 9   c_rsur  object 
 10  c_raln  object 
 11  c_traf  object 
 12  v_id    float64
 13  v_type  object 
 14  v_year  float64
 15  p_id    float64
 16  p_sex   object 
 17  p_age   float64
 18  p_psn   object 
 19  p_isev  object 
 20  p_safe  object 
 21  p_user  object 
 22  c_case  int64  
dtypes: float64(6), int64(3), object(14)
memory usage: 669.9+ MB


# 1. Day, month, and year counts

In [4]:
# Frequency tables: one row per distinct value, with the number of records
# that carry that value.
#
# rename_axis(...) + reset_index(name=...) names BOTH output columns explicitly,
# so the layout is always [<key>, <key>_ct]. The previous approach renamed the
# columns produced by value_counts()/reset_index() ('index' and the series name),
# but those default names changed in pandas 2.0, which left the key column
# missing and broke the merges on these tables.

# Day count
df_day = df.c_wday.value_counts().rename_axis('c_wday').reset_index(name='wday_ct')

# Month count
df_mnth = df.c_mnth.value_counts().rename_axis('c_mnth').reset_index(name='mnth_ct')

# Year count
df_year = df.c_year.value_counts().rename_axis('c_year').reset_index(name='year_ct')

In [5]:
# Attach each count table to the main dataframe.
# A left join keeps every record of df and adds the matching count column.
for count_table, join_key in ((df_day, 'c_wday'), (df_mnth, 'c_mnth'), (df_year, 'c_year')):
    df = df.merge(count_table, how='left', on=join_key)

# 2. Hour counts

In [6]:
# Hour count: one row per distinct hour, with the number of records at that hour.
# Both output columns are named explicitly so the table has the same layout on
# every pandas version (the default names produced by value_counts() changed
# in pandas 2.0, which broke the old rename-by-'index' approach).
df_hour = df.c_hour.value_counts().rename_axis('c_hour').reset_index(name='hour_ct')

# left join to the main dataframe
df = df.merge(df_hour, how = 'left', on = 'c_hour')

# Ratio feature: count for the record's hour divided by count for its weekday
df['hr_per_day'] =  df.hour_ct/df.wday_ct

# 3. Treatment and configuration counts

In [7]:
# Frequency tables with explicitly named columns ([<key>, <key>_ct]).
# rename_axis(...) + reset_index(name=...) does not depend on the default names
# produced by value_counts(), which changed in pandas 2.0 and broke the old
# rename-by-'index' approach.

# Treatment required count
df_isev = df.p_isev.value_counts().rename_axis('p_isev').reset_index(name='p_isev_ct')

# Collision configuration count
df_conf = df.c_conf.value_counts().rename_axis('c_conf').reset_index(name='conf_ct')

In [8]:
# Attach each count table to the main dataframe.
# A left join keeps every record of df and adds the matching count column.
for count_table, join_key in ((df_isev, 'p_isev'), (df_conf, 'c_conf')):
    df = df.merge(count_table, how='left', on=join_key)

In [9]:
# Ratio features: each category count relative to the weekday and hour counts
day_total = df['wday_ct']
hour_total = df['hour_ct']

df['p_isev_per_day'] = df['p_isev_ct'] / day_total
df['p_isev_per_hour'] = df['p_isev_ct'] / hour_total
df['conf_per_day'] = df['conf_ct'] / day_total
df['conf_per_hour'] = df['conf_ct'] / hour_total

# Indicators and odds ratio

In [10]:
def _odd(df, col1, col2):
    """
    calculate the fatality rate - that is the ratio of the probability 
    of a fatal collision over the probability of a non-fatal collision

    Parameters
    ----------
    df: pandas dataframe
    col1: categorical column with two levels only
    col2: class label with two classes only

    Returns
    ----------
    pivoted dataframe
    """
    pvt = pd.crosstab(df[col1], df[col2])
    pvt['odds'] = pvt[1]/pvt[0]
    pvt['odds ratio'] = pvt.odds['Y']/pvt.odds['N']
    pvt['odds ratio'].loc['N'] = ''
    return pvt

In [11]:
# Flag head-on collisions ('Y'/'N'), then tabulate the odds for each group
df['c_conf_ind'] = np.where(df['c_conf'].eq('Head-on collision'), 'Y', 'N')
_odd(df, 'c_conf_ind', 'class')

class,0,1,odds,odds ratio
c_conf_ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
N,3629016,44399,0.012234,
Y,127288,16910,0.132848,10.8586


In [19]:
# Flag records in a reduced speed zone ('Y'/'N'), then tabulate the odds for each group
df['c_traf_ind'] = np.where(df['c_traf'].eq('Reduced speed zone'), 'Y', 'N')
_odd(df, 'c_traf_ind', 'class')

class,0,1,odds,odds ratio
c_traf_ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
N,3755057,61186,0.016294,
Y,1247,123,0.098637,6.05345


In [12]:
# Flag records on a passing or climbing lane ('Y'/'N'), then tabulate the odds for each group
df['c_rcfg_ind'] = np.where(df['c_rcfg'].eq('Passing or climbing lane'), 'Y', 'N')
_odd(df, 'c_rcfg_ind', 'class')

class,0,1,odds,odds ratio
c_rcfg_ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
N,3755986,61263,0.016311,
Y,318,46,0.144654,8.86863


In [13]:
# Flag records whose vehicle type is a road tractor ('Y'/'N'), then tabulate the odds for each group
df['v_type_ind'] = np.where(df['v_type'].eq('Road tractor'), 'Y', 'N')
_odd(df, 'v_type_ind', 'class')

class,0,1,odds,odds ratio
v_type_ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
N,3717265,58258,0.015672,
Y,39039,3051,0.078153,4.98668


In [14]:
# Flag records with an unknown person position ('Y'/'N'), then tabulate the odds for each group
df['p_psn_ind'] = np.where(df['p_psn'].eq('Position unknown'), 'Y', 'N')
_odd(df, 'p_psn_ind', 'class')

class,0,1,odds,odds ratio
p_psn_ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
N,3742497,60392,0.016137,
Y,13807,917,0.066416,4.11578


In [15]:
# Flag records with a visibility limitation ('Y'/'N'), then tabulate the odds for each group
df['c_wthr_ind'] = np.where(df['c_wthr'].eq('Visibility limitation'), 'Y', 'N')
_odd(df, 'c_wthr_ind', 'class')

class,0,1,odds,odds ratio
c_wthr_ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
N,3702922,59083,0.015956,
Y,53382,2226,0.041699,2.61344


In [16]:
# Flag records on a curved road with gradient ('Y'/'N'), then tabulate the odds for each group
df['c_raln_ind'] = np.where(df['c_raln'].eq('Curved with gradient'), 'Y', 'N')
_odd(df, 'c_raln_ind', 'class')

class,0,1,odds,odds ratio
c_raln_ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
N,3617373,55641,0.015382,
Y,138931,5668,0.040797,2.65234


In [17]:
# Flag records on a sand/gravel/dirt surface ('Y'/'N'), then tabulate the odds for each group
df['c_rsur_ind'] = np.where(df['c_rsur'].eq('Sand/gravel/dirt'), 'Y', 'N')
_odd(df, 'c_rsur_ind', 'class')

class,0,1,odds,odds ratio
c_rsur_ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
N,3740610,60579,0.016195,
Y,15694,730,0.046515,2.87217


# Save as CSV

In [23]:
# Write the engineered dataset to disk; the row index is not written
feat_path = '../data/feat_engr_data.csv'
df.to_csv(feat_path, index=False)