# Feature Engineering
In this notebook, we engineer count-based features from the existing features. Note that the number of features that can be engineered through counts is very large.

In [1]:
# Ignore deprecated warning
import warnings
warnings.filterwarnings("ignore")

# Data manipulation
%matplotlib inline
import pandas as pd
import numpy as np
import time

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Set font scale and style
plt.rcParams.update({'font.size': 15})

# Load data

In [2]:
# Read ../data/clean_data.csv and discard the longitude/latitude columns in one chain
df = pd.read_csv('../data/clean_data.csv').drop(columns=['longitude', 'latitude'])
print('Data size:', df.shape)
df.head()

Data size: (89990, 14)


Unnamed: 0,x,y,environment,light,surface_condition,traffic_control,traffic_control_condition,class,impact_type,no_of_pedestrians,c_year,c_month,c_day,c_hour
0,357144.875,5020503.5,01 - Clear,07 - Dark,01 - Dry,10 - No control,,Non-fatal injury,02 - Angle,0,2013,January,Friday,23
1,356860.1875,5013034.5,05 - Drifting Snow,01 - Daylight,06 - Ice,10 - No control,,P.D. only,01 - Approaching,0,2013,January,Sunday,18
2,368589.71875,5029516.5,03 - Snow,01 - Daylight,06 - Ice,02 - Stop sign,01 - Functioning,P.D. only,02 - Angle,0,2013,January,Thursday,21
3,370292.90625,5035187.0,01 - Clear,01 - Daylight,02 - Wet,02 - Stop sign,01 - Functioning,P.D. only,02 - Angle,0,2013,January,Saturday,21
4,372133.1875,5032130.5,02 - Rain,07 - Dark,02 - Wet,10 - No control,,P.D. only,06 - SMV unattended vehicle,0,2013,January,Friday,4


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89990 entries, 0 to 89989
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   x                          89990 non-null  float64
 1   y                          89990 non-null  float64
 2   environment                89989 non-null  object 
 3   light                      89990 non-null  object 
 4   surface_condition          89990 non-null  object 
 5   traffic_control            89954 non-null  object 
 6   traffic_control_condition  46084 non-null  object 
 7   class                      89990 non-null  object 
 8   impact_type                89990 non-null  object 
 9   no_of_pedestrians          89990 non-null  int64  
 10  c_year                     89990 non-null  int64  
 11  c_month                    89990 non-null  object 
 12  c_day                      89990 non-null  object 
 13  c_hour                     89990 non-null  int

In [4]:
# Create binary class: 1 for 'Fatal injury', 0 for the other two outcomes.
# The result is assigned back to the column rather than calling an in-place
# method on the `df['class']` selection: that chained form acts on a temporary
# object and does not update `df` when pandas Copy-on-Write is active.
map_dict = {'Non-fatal injury':0, 'P.D. only':0, 'Fatal injury':1}
df['class'] = df['class'].replace(map_dict)

# 1. Day, month, and year counts

In [5]:
# Each count table has two columns: the category value and the number of rows of
# `df` holding that value. `rename_axis(...).reset_index(name=...)` names both
# columns explicitly, so the result does not depend on how `value_counts()` labels
# its output (pandas < 2.0: 'index' / <column name>; pandas >= 2.0: <column name> /
# 'count'). Renaming via a fixed {'<column>': ..., 'index': ...} mapping only works
# with the older labelling and otherwise renames the key column to the count name.

# Day count
df_day = df['c_day'].value_counts().rename_axis('c_day').reset_index(name='day_ct')

# Month count
df_mnth = df['c_month'].value_counts().rename_axis('c_month').reset_index(name='month_ct')

# Year count
df_year = df['c_year'].value_counts().rename_axis('c_year').reset_index(name='year_ct')

In [6]:
# Attach the day, month and year counts to every row of the main dataframe
# (three successive left joins, each on its own key column)
df = (
    df.merge(df_day, on='c_day', how='left')
      .merge(df_mnth, on='c_month', how='left')
      .merge(df_year, on='c_year', how='left')
)

# 2. Hour, impact type, environment, surface condition counts

In [7]:
# Each count table has two columns: the category value and the number of rows of
# `df` holding that value. `rename_axis(...).reset_index(name=...)` names both
# columns explicitly, so the result does not depend on how `value_counts()` labels
# its output (pandas < 2.0: 'index' / <column name>; pandas >= 2.0: <column name> /
# 'count'). Renaming via a fixed {'<column>': ..., 'index': ...} mapping only works
# with the older labelling and otherwise renames the key column to the count name.

# Hour count
df_hour = df['c_hour'].value_counts().rename_axis('c_hour').reset_index(name='hour_ct')

# Impact type count
df_impact = (
    df['impact_type'].value_counts()
    .rename_axis('impact_type')
    .reset_index(name='impact_type_ct')
)

# Environment count (value_counts skips missing values, so they get no count row)
df_env = df['environment'].value_counts().rename_axis('environment').reset_index(name='envmt_ct')

# Surface condition count
df_scond = (
    df['surface_condition'].value_counts()
    .rename_axis('surface_condition')
    .reset_index(name='scond_ct')
)

In [8]:
# Attach the hour, impact type, environment and surface condition counts to every
# row of the main dataframe (successive left joins, each on its own key column)
df = (
    df.merge(df_hour, on='c_hour', how='left')
      .merge(df_impact, on='impact_type', how='left')
      .merge(df_env, on='environment', how='left')
      .merge(df_scond, on='surface_condition', how='left')
)

In [9]:
# Ratio features: element-wise quotients of the count columns
df['hr_per_day'] = df['hour_ct'].div(df['day_ct'])
df['impact_per_hour'] = df['impact_type_ct'].div(df['hour_ct'])
df['impact_per_day'] = df['impact_type_ct'].div(df['day_ct'])
df['envmt_per_hour'] = df['envmt_ct'].div(df['hour_ct'])
df['surcond_per_hour'] = df['scond_ct'].div(df['hour_ct'])

# Indicators and odds ratio

In [10]:
def _odd(df, col1, col2):
    """
    calability of a non-fatal collision

    Parameters
    ----------
    df: pandas dataframe
    col1: categorical column with two levels only
    col2: class label with two classes only

    Returns
    ----------
    pivoted dataframe
    """
    pvt = pd.crosstab(df[col1], df[col2])
    pvt['odds'] = pvt[1]/pvt[0]
    pvt['odds ratio'] = pvt.odds['Y']/pvt.odds['N']
    pvt['odds ratio'].loc['N'] = ''
    return pvt

In [11]:
# 'Y' where the environment is '99 - Other', 'N' otherwise; then tabulate against class
df['enviro_ind'] = np.where(df['environment'].eq('99 - Other'), 'Y', 'N')
_odd(df, 'enviro_ind', 'class')

class,0,1,odds,odds ratio
enviro_ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
N,89829,143,0.001592,
Y,17,1,0.058824,36.95146


In [12]:
# 'Y' where the impact type is '01 - Approaching', 'N' otherwise; then tabulate against class
df['impact_ind'] = np.where(df['impact_type'].eq('01 - Approaching'), 'Y', 'N')
_odd(df, 'impact_ind', 'class')

class,0,1,odds,odds ratio
impact_ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
N,88535,133,0.001502,
Y,1311,11,0.008391,5.585388


In [13]:
# 'Y' where the traffic control is '08 - Traffic gate', 'N' otherwise; then tabulate against class
df['traf_cont_ind'] = np.where(df['traffic_control'].eq('08 - Traffic gate'), 'Y', 'N')
_odd(df, 'traf_cont_ind', 'class')

class,0,1,odds,odds ratio
traf_cont_ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
N,89816,143,0.001592,
Y,30,1,0.033333,20.936131


In [14]:
# 'Y' where the surface condition is '00 - Unknown', 'N' otherwise; then tabulate against class
df['sur_cont_ind'] = np.where(df['surface_condition'].eq('00 - Unknown'), 'Y', 'N')
_odd(df, 'sur_cont_ind', 'class')

class,0,1,odds,odds ratio
sur_cont_ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
N,89741,142,0.001582,
Y,105,2,0.019048,12.037693


In [15]:
# 'Y' where the light condition is '03 - Dawn' or '07 - Dark', 'N' otherwise;
# a single membership test stands in for the two nested equality checks
df['light_ind'] = np.where(df['light'].isin(['03 - Dawn', '07 - Dark']), 'Y', 'N')
_odd(df, 'light_ind', 'class')

class,0,1,odds,odds ratio
light_ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
N,67501,88,0.001304,
Y,22345,56,0.002506,1.922362


# Save as csv

In [16]:
# Write the engineered dataframe to disk without the row index
df.to_csv(path_or_buf='../data/feat_engr_data.csv', index=False)