# Feature Engineering
In this notebook, we engineer count-based features from the existing features. Note that the number of features that can be engineered through counts is very large.

In [1]:
# Ignore deprecated warning
import warnings
warnings.filterwarnings("ignore")

# Data manipulation
%matplotlib inline
import pandas as pd
import numpy as np
import time

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the cleaned dataset from ../data/clean_data.csv into the working frame.
df = pd.read_csv(filepath_or_buffer='../data/clean_data.csv')

# Report (rows, columns) and preview the first records.
n_rows, n_cols = df.shape
print('Data size', (n_rows, n_cols))
df.head()

Data size (3817613, 23)


Unnamed: 0,c_year,c_mnth,c_wday,c_hour,class,c_vehs,c_conf,c_rcfg,c_wthr,c_rsur,...,v_type,v_year,p_id,p_sex,p_age,p_psn,p_isev,p_safe,p_user,c_case
0,1999,January,Monday,9.0,0,2.0,Right turn,At an intersection,Clear and sunny,"Dry, normal",...,Light Duty Vehicle,1992.0,1.0,F,33.0,Driver,Injury,Safety device used,Motor Vehicle Driver,2890
1,1999,January,Monday,9.0,0,2.0,Right turn,At an intersection,Clear and sunny,"Dry, normal",...,Light Duty Vehicle,1992.0,1.0,F,70.0,Driver,No Injury,Safety device used,Motor Vehicle Driver,2890
2,1999,January,Monday,20.0,0,1.0,Ran off left shoulder,Intersection with parking lot entrance,Clear and sunny,"Dry, normal",...,Light Duty Vehicle,1988.0,1.0,F,38.0,Driver,Injury,Safety device used,Motor Vehicle Driver,4332
3,1999,January,Monday,5.0,0,2.0,Hit a moving object,At an intersection,Raining,Wet,...,Other trucks and vans,1995.0,1.0,M,34.0,Driver,No Injury,Safety device used,Motor Vehicle Driver,5053
4,1999,January,Monday,5.0,0,2.0,Hit a moving object,At an intersection,Raining,Wet,...,Other trucks and vans,1995.0,2.0,M,30.0,"Front row, right outboard",No Injury,Safety device used,Motor Vehicle Passenger,5053


# 1. Day, month, and year counts

In [3]:
# Each lookup table below has two columns: the category key and the number of
# rows in `df` carrying that key (NaN keys are excluded by value_counts()).
#
# The key column is named with rename_axis() and the count column with
# reset_index(name=...), so the result does not depend on the default labels
# that value_counts()/reset_index() produce. Those defaults changed in
# pandas 2.0 ('index'/<column> became <column>/'count'), which made a
# rename-by-label approach silently rename the key column instead.

# Day count -> columns: c_wday, wday_ct
df_day = df['c_wday'].value_counts().rename_axis('c_wday').reset_index(name='wday_ct')

# Month count -> columns: c_mnth, mnth_ct
df_mnth = df['c_mnth'].value_counts().rename_axis('c_mnth').reset_index(name='mnth_ct')

# Year count -> columns: c_year, year_ct
df_year = df['c_year'].value_counts().rename_axis('c_year').reset_index(name='year_ct')

In [4]:
# Attach the weekday, month and year counts to every row of the main frame.
# Left joins keep all rows of `df`; the joins are applied in the order
# weekday -> month -> year.
df = (
    df.merge(df_day, how='left', on='c_wday')
      .merge(df_mnth, how='left', on='c_mnth')
      .merge(df_year, how='left', on='c_year')
)

In [5]:
# Ratio features built from the count columns joined above.
# avg_day_per_mnth: weekday count divided by month count.
df['avg_day_per_mnth'] = df['wday_ct'].div(df['mnth_ct'])
# avg_mnth_per_year: month count divided by year count.
df['avg_mnth_per_year'] = df['mnth_ct'].div(df['year_ct'])

# 2. Hour, weather, and configuration counts

In [6]:
# Each lookup table below has two columns: the category key and the number of
# rows in `df` carrying that key (NaN keys are excluded by value_counts()).
#
# rename_axis() names the key column and reset_index(name=...) names the count
# column explicitly, so the tables come out the same on pandas 1.x and 2.x
# (the default labels of value_counts()/reset_index() changed in pandas 2.0).

# Hour count -> columns: c_hour, hour_ct
df_hour = df['c_hour'].value_counts().rename_axis('c_hour').reset_index(name='hour_ct')

# Weather count -> columns: c_wthr, wthr_ct
df_wthr = df['c_wthr'].value_counts().rename_axis('c_wthr').reset_index(name='wthr_ct')

# Configuration count -> columns: c_conf, conf_ct
df_conf = df['c_conf'].value_counts().rename_axis('c_conf').reset_index(name='conf_ct')

# Road way configuration count -> columns: c_rcfg, rcfg_ct
df_rcfg = df['c_rcfg'].value_counts().rename_axis('c_rcfg').reset_index(name='rcfg_ct')

In [7]:
# Attach the hour, weather, configuration and roadway-configuration counts
# to the main frame. Left joins keep every row of `df`.
df = (
    df.merge(df_hour, how='left', on='c_hour')
      .merge(df_wthr, how='left', on='c_wthr')
      .merge(df_conf, how='left', on='c_conf')
      .merge(df_rcfg, how='left', on='c_rcfg')
)

In [8]:
# Ratio features: each count column divided by the weekday count (wday_ct).
# Maps new feature name -> count column used as the numerator; insertion
# order determines the order in which the columns are appended.
per_day_numerators = {
    'avg_hr_per_day': 'hour_ct',
    'avg_wthr_per_day': 'wthr_ct',
    'avg_conf_per_day': 'conf_ct',
    'avg_rcfg_per_day': 'rcfg_ct',
}
for feature_name, count_col in per_day_numerators.items():
    df[feature_name] = df[count_col] / df['wday_ct']

# 3. Surface, alignment and position counts

In [9]:
# Each lookup table below has two columns: the category key and the number of
# rows in `df` carrying that key (NaN keys are excluded by value_counts()).
#
# rename_axis() names the key column and reset_index(name=...) names the count
# column explicitly, so the tables come out the same on pandas 1.x and 2.x
# (the default labels of value_counts()/reset_index() changed in pandas 2.0).

# Road surface count -> columns: c_rsur, rsur_ct
df_rsur = df['c_rsur'].value_counts().rename_axis('c_rsur').reset_index(name='rsur_ct')

# Road alignment count -> columns: c_raln, raln_ct
df_raln = df['c_raln'].value_counts().rename_axis('c_raln').reset_index(name='raln_ct')

# Person position count -> columns: p_psn, psn_ct
df_psn = df['p_psn'].value_counts().rename_axis('p_psn').reset_index(name='psn_ct')

In [10]:
# Attach the road-surface, road-alignment and person-position counts to the
# main frame. Left joins keep every row of `df`.
df = (
    df.merge(df_rsur, how='left', on='c_rsur')
      .merge(df_raln, how='left', on='c_raln')
      .merge(df_psn, how='left', on='p_psn')
)

In [11]:
# Ratio features: surface, alignment and position counts, each divided by the
# weekday count (wday_ct).
weekday_count = df['wday_ct']
df['avg_rsur_per_day'] = df['rsur_ct'].div(weekday_count)
df['avg_raln_per_day'] = df['raln_ct'].div(weekday_count)
df['avg_psn_per_day'] = df['psn_ct'].div(weekday_count)

# 4. Treatment, vehicle type, and traffic control counts

In [12]:
# Each lookup table below has two columns: the category key and the number of
# rows in `df` carrying that key (NaN keys are excluded by value_counts()).
#
# rename_axis() names the key column and reset_index(name=...) names the count
# column explicitly, so the tables come out the same on pandas 1.x and 2.x
# (the default labels of value_counts()/reset_index() changed in pandas 2.0).

# Treatment required count -> columns: p_isev, isev_ct
df_isev = df['p_isev'].value_counts().rename_axis('p_isev').reset_index(name='isev_ct')

# Vehicle type count -> columns: v_type, vtype_ct
df_vtype = df['v_type'].value_counts().rename_axis('v_type').reset_index(name='vtype_ct')

# Traffic control type count -> columns: c_traf, traf_ct
df_traf = df['c_traf'].value_counts().rename_axis('c_traf').reset_index(name='traf_ct')

In [13]:
# Attach the treatment, vehicle-type and traffic-control counts to the main
# frame. Left joins keep every row of `df`.
df = (
    df.merge(df_isev, how='left', on='p_isev')
      .merge(df_vtype, how='left', on='v_type')
      .merge(df_traf, how='left', on='c_traf')
)

In [14]:
# Ratio features: each count column divided by the weekday count (wday_ct).
# Pairs are (new feature name, numerator count column), applied in order.
for feature_name, count_col in (
    ('avg_isev_per_day', 'isev_ct'),
    ('avg_vtype_per_day', 'vtype_ct'),
    ('avg_traf_per_day', 'traf_ct'),
):
    df[feature_name] = df[count_col] / df['wday_ct']

# Save as csv

In [15]:
# Write the frame, including the engineered columns, to CSV without the row index.
df.to_csv(path_or_buf='../data/feat_engr_data.csv', index=False)