# Feature Engineering
In this notebook, we engineer count-based features from the existing features. Note that the number of features that can be engineered through counts is very large.

In [12]:
# Silence warnings for the whole notebook.
# NOTE(review): filterwarnings("ignore") hides every warning category, not only deprecation warnings.
import warnings
warnings.filterwarnings("ignore")

# Inline plotting backend, followed by the data manipulation libraries
%matplotlib inline
import pandas as pd
import numpy as np
import time

# Data visualization
# NOTE(review): numpy, time and seaborn are not referenced in the cells visible here -- confirm before removing.
import matplotlib.pyplot as plt
import seaborn as sns

# Set the default matplotlib font size
plt.rcParams.update({'font.size': 15})

In [13]:
# Read the cleaned data from disk, report its shape and preview the first rows
input_path = '../data/clean_data.csv'
df = pd.read_csv(input_path)
print(f'Data size {df.shape}')
df.head()

Data size (89990, 19)


Unnamed: 0,collision_id,location,x,y,longitude,latitude,environment,light,surface_condition,traffic_control,traffic_control_condition,class,impact_type,no_of_pedestrians,fid,c_year,c_month,c_day,c_hour
0,13-704,ROBERTSON RD btwn MOODIE DR & VANIER RD,357144.875,5020503.5,-75.832298,45.322533,01 - Clear,07 - Dark,01 - Dry,10 - No control,,Non-fatal injury,02 - Angle,0.0,1,2013,January,Friday,23
1,13-806,FALLOWFIELD RD btwn EAGLESON RD & RICHMOND RD,356860.1875,5013034.5,-75.836708,45.255344,05 - Drifting Snow,01 - Daylight,06 - Ice,10 - No control,,P.D. only,01 - Approaching,0.0,2,2013,January,Sunday,18
2,13-101,FIFTH AVE @ RUPERT ST,368589.71875,5029516.5,-75.685158,45.402679,03 - Snow,01 - Daylight,06 - Ice,02 - Stop sign,01 - Functioning,P.D. only,02 - Angle,0.0,3,2013,January,Thursday,21
3,13-502,BIRCH AVE @ FARNHAM CRES,370292.90625,5035187.0,-75.662651,45.453541,01 - Clear,01 - Daylight,02 - Wet,02 - Stop sign,01 - Functioning,P.D. only,02 - Angle,0.0,4,2013,January,Saturday,21
4,13-503,ALESTHER ST btwn QUEEN MARY ST & PRINCE ALBERT ST,372133.1875,5032130.5,-75.639542,45.425865,02 - Rain,07 - Dark,02 - Wet,10 - No control,,P.D. only,06 - SMV unattended vehicle,0.0,5,2013,January,Friday,4


In [14]:
# Summarise the loaded frame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89990 entries, 0 to 89989
Data columns (total 19 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   collision_id               89990 non-null  object 
 1   location                   89990 non-null  object 
 2   x                          89990 non-null  float64
 3   y                          89990 non-null  float64
 4   longitude                  89990 non-null  float64
 5   latitude                   89990 non-null  float64
 6   environment                89989 non-null  object 
 7   light                      89990 non-null  object 
 8   surface_condition          89990 non-null  object 
 9   traffic_control            89954 non-null  object 
 10  traffic_control_condition  46084 non-null  object 
 11  class                      89990 non-null  object 
 12  impact_type                89990 non-null  object 
 13  no_of_pedestrians          89990 non-null  flo

# 1. Day, month, and year counts

In [15]:
# Frequency tables: one row per distinct value together with the number of
# rows in df that carry it.
#
# rename_axis(...) + reset_index(name=...) names both output columns
# explicitly. The previous rename of the 'index' column only worked on
# pandas < 2.0; from 2.0 on, value_counts() names its result 'count' and puts
# the column name on the index, so that rename mislabelled the key column and
# the later merge on it failed.

# Day count -> columns ['c_day', 'day_ct']
df_day = df['c_day'].value_counts().rename_axis('c_day').reset_index(name='day_ct')

# Month count -> columns ['c_month', 'month_ct']
df_mnth = df['c_month'].value_counts().rename_axis('c_month').reset_index(name='month_ct')

# Year count -> columns ['c_year', 'year_ct']
df_year = df['c_year'].value_counts().rename_axis('c_year').reset_index(name='year_ct')

In [16]:
# Left join each count table onto the main dataframe, keyed on the column it
# was counted from.
count_tables = {'c_day': df_day, 'c_month': df_mnth, 'c_year': df_year}
n_rows = len(df)

for key, counts in count_tables.items():
    # Drop count columns left behind by an earlier run of this cell, so that
    # re-running it does not produce suffixed duplicates (e.g. day_ct_x / day_ct_y).
    stale = [col for col in counts.columns if col != key and col in df.columns]
    # validate='many_to_one' raises if a count table has duplicate keys,
    # which would otherwise silently duplicate rows of df.
    df = df.drop(columns=stale).merge(counts, how='left', on=key, validate='many_to_one')

assert len(df) == n_rows, 'left join on a count table changed the number of rows'

In [17]:
# Ratio features derived from the merged counts:
#   avg_day_per_mnth  = day count / month count
#   avg_mnth_per_year = month count / year count
df['avg_day_per_mnth'] = df['day_ct'].div(df['month_ct'])
df['avg_mnth_per_year'] = df['month_ct'].div(df['year_ct'])

# 2. Hour, impact type, environment, and surface condition counts

In [18]:
# Frequency tables: one row per distinct value together with the number of
# rows in df that carry it.
#
# rename_axis(...) + reset_index(name=...) names both output columns
# explicitly. The previous rename of the 'index' column only worked on
# pandas < 2.0; from 2.0 on, value_counts() names its result 'count' and puts
# the column name on the index, so that rename mislabelled the key column and
# the later merge on it failed.
#
# value_counts() skips missing values by default, so a row whose value is
# missing (environment has one such row) has no entry in its count table.

# Hour count -> columns ['c_hour', 'hour_ct']
df_hour = df['c_hour'].value_counts().rename_axis('c_hour').reset_index(name='hour_ct')

# Impact type count -> columns ['impact_type', 'impact_type_ct']
df_impact = df['impact_type'].value_counts().rename_axis('impact_type').reset_index(name='impact_type_ct')

# Environment count -> columns ['environment', 'envmt_ct']
df_env = df['environment'].value_counts().rename_axis('environment').reset_index(name='envmt_ct')

# Surface condition count -> columns ['surface_condition', 'scond_ct']
df_scond = df['surface_condition'].value_counts().rename_axis('surface_condition').reset_index(name='scond_ct')

In [19]:
# Left join each count table onto the main dataframe, keyed on the column it
# was counted from. Rows whose key has no entry in a count table get NaN for
# that count.
count_tables = {
    'c_hour': df_hour,
    'impact_type': df_impact,
    'environment': df_env,
    'surface_condition': df_scond,
}
n_rows = len(df)

for key, counts in count_tables.items():
    # Drop count columns left behind by an earlier run of this cell, so that
    # re-running it does not produce suffixed duplicates (e.g. hour_ct_x / hour_ct_y).
    stale = [col for col in counts.columns if col != key and col in df.columns]
    # validate='many_to_one' raises if a count table has duplicate keys,
    # which would otherwise silently duplicate rows of df.
    df = df.drop(columns=stale).merge(counts, how='left', on=key, validate='many_to_one')

assert len(df) == n_rows, 'left join on a count table changed the number of rows'

In [20]:
# Ratio feature derived from the merged counts: hour count / day count
df['avg_hr_per_day'] = df['hour_ct'].div(df['day_ct'])

In [23]:
# Summarise the frame again now that the count and ratio columns have been added
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 89990 entries, 0 to 89989
Data columns (total 29 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   collision_id               89990 non-null  object 
 1   location                   89990 non-null  object 
 2   x                          89990 non-null  float64
 3   y                          89990 non-null  float64
 4   longitude                  89990 non-null  float64
 5   latitude                   89990 non-null  float64
 6   environment                89989 non-null  object 
 7   light                      89990 non-null  object 
 8   surface_condition          89990 non-null  object 
 9   traffic_control            89954 non-null  object 
 10  traffic_control_condition  46084 non-null  object 
 11  class                      89990 non-null  object 
 12  impact_type                89990 non-null  object 
 13  no_of_pedestrians          89990 non-null  flo

# Save as CSV

In [22]:
# Write the engineered feature table to disk, leaving out the row index
output_path = '../data/feat_engr_data.csv'
df.to_csv(output_path, index=False)