# Feature Engineering
In this notebook, we engineer count-based and indicator features from the existing features.

In [1]:
%matplotlib inline
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including pandas deprecation and chained-assignment warnings
# raised by later cells. Consider narrowing the filter while developing.
warnings.filterwarnings("ignore")

import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 15})

# Load data

In [2]:
# Read the cleaned records produced upstream, report the frame's
# dimensions, and preview the first rows.
df = pd.read_csv(filepath_or_buffer='../data/clean_data.csv')
print('Data size:', df.shape)
df.head()

Data size: (6771768, 23)


Unnamed: 0,c_case,c_year,c_mnth,c_wday,c_hour,c_vehs,c_conf,c_rcfg,c_wthr,c_rsur,...,v_type,v_year,p_id,p_sex,p_age,p_psn,p_isev,p_safe,p_user,class
0,752,1999,January,Monday,20.0,2.0,Right turn,,Clear and sunny,Icy,...,Other trucks and vans,1990.0,1.0,M,41.0,Driver,No Injury,,Motor Vehicle Driver,0
1,752,1999,January,Monday,20.0,2.0,Right turn,,Clear and sunny,Icy,...,Light Duty Vehicle,1987.0,1.0,M,19.0,Driver,No Injury,,Motor Vehicle Driver,0
2,752,1999,January,Monday,20.0,2.0,Right turn,,Clear and sunny,Icy,...,Light Duty Vehicle,1987.0,2.0,F,20.0,"Front row, right outboard",Injury,Safety device used,Motor Vehicle Passenger,0
3,753,1999,January,Monday,8.0,1.0,Hit a moving object,,"Freezing rain, sleet, hail",Snow,...,Light Duty Vehicle,1986.0,1.0,M,46.0,Driver,No Injury,,Motor Vehicle Driver,0
4,753,1999,January,Monday,8.0,1.0,Hit a moving object,,"Freezing rain, sleet, hail",Snow,...,,,1.0,M,5.0,Pedestrian,Injury,,Pedestrian,0


In [3]:
# Print the column names, dtypes and memory footprint of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6771768 entries, 0 to 6771767
Data columns (total 23 columns):
 #   Column  Dtype  
---  ------  -----  
 0   c_case  int64  
 1   c_year  int64  
 2   c_mnth  object 
 3   c_wday  object 
 4   c_hour  float64
 5   c_vehs  float64
 6   c_conf  object 
 7   c_rcfg  object 
 8   c_wthr  object 
 9   c_rsur  object 
 10  c_raln  object 
 11  c_traf  object 
 12  v_id    float64
 13  v_type  object 
 14  v_year  float64
 15  p_id    float64
 16  p_sex   object 
 17  p_age   float64
 18  p_psn   object 
 19  p_isev  object 
 20  p_safe  object 
 21  p_user  object 
 22  class   int64  
dtypes: float64(6), int64(3), object(14)
memory usage: 1.2+ GB


# Count-based features

In [4]:
# Per-collision aggregates, one row per c_case:
#   count_p_isev - number of non-null p_isev entries in the case
#   avg_p_age    - mean of p_age over the case's rows (NaN ages are skipped)
# Named aggregation yields the final column names directly, and
# as_index=False keeps c_case as an ordinary column for the later merge.
df_count = df.groupby('c_case', as_index=False).agg(
    count_p_isev=('p_isev', 'count'),
    avg_p_age=('p_age', 'mean'),
)
df_count.head()

Unnamed: 0,c_case,count_p_isev,avg_p_age
0,1,2,48.5
1,2,2,31.5
2,3,1,27.0
3,4,2,42.0
4,5,2,16.0


In [5]:
# Join the per-collision aggregates back onto the main dataframe.
# Any previously merged copies of the feature columns are dropped first so
# that re-running this cell does not create '_x'/'_y' suffixed duplicates.
feature_cols = [col for col in df_count.columns if col != 'c_case']
df = (
    df.drop(columns=feature_cols, errors='ignore')
      # how='left' keeps every original row; validate asserts that df_count
      # holds exactly one row per c_case, so the join cannot multiply rows.
      .merge(df_count, on='c_case', how='left', validate='many_to_one')
)

# Indicators and odds ratio

In [6]:
def odds_ratio(df, col1, col2):
    """
    Cross-tabulate an indicator against a binary class label and compute
    the odds of class 1 per indicator level, plus the odds ratio of the
    'Y' level relative to the 'N' level.

    Parameters
    ----------
    df: pandas dataframe
    col1: name of an indicator column whose levels include 'Y' and 'N'
    col2: name of the class-label column with values 0 and 1

    Returns
    ----------
    pandas dataframe indexed by the levels of col1 with columns
        0, 1         : row counts per class
        'odds'       : count of class 1 divided by count of class 0
        'odds ratio' : odds('Y') / odds('N'); left as an empty string on
                       the reference row 'N'

    Raises
    ----------
    KeyError if col1 lacks a 'Y' or 'N' level, or col2 lacks class 0 or 1
    """
    pvt = pd.crosstab(df[col1], df[col2])
    pvt['odds'] = pvt[1] / pvt[0]
    ratio = pvt.loc['Y', 'odds'] / pvt.loc['N', 'odds']
    # Build the column in one step as mixed (object) values. The previous
    # chained form `pvt['odds ratio'].loc['N'] = ''` does not write through
    # to `pvt` under pandas Copy-on-Write, and it relied on implicitly
    # upcasting a float column to hold a string.
    pvt['odds ratio'] = ['' if level == 'N' else ratio for level in pvt.index]
    return pvt

## Head-on collision or not
This indicator suggests that the odds of a fatal outcome for head-on collisions are about 9.5 times the odds for all other collision configurations combined (odds ratio ≈ 9.48).

In [7]:
# Flag head-on collisions as 'Y' and every other configuration as 'N'.
# A missing c_conf compares unequal to the label and therefore maps to 'N'.
is_head_on = df['c_conf'] == 'Head-on collision'
df['c_conf_ind'] = np.where(is_head_on, 'Y', 'N')
odds_ratio(df, 'c_conf_ind', 'class')

class,0,1,odds,odds ratio
c_conf_ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
N,6462783,87313,0.01351,
Y,196510,25162,0.128044,9.477661


# Save as csv

In [8]:
# Persist the engineered dataset; the row index is not written to the file.
df.to_csv(path_or_buf='../data/feat_engr_data.csv', index=False)