# Feature Engineering (Core)

*Christina Brockway*

## Task:
Engineer new features to try to improve a model's ability to predict the total number of bike share rentals during a given hour of the day.


In [1]:
# Imports
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np


### Import the data

In [3]:
# Load the bike-share training data from the local CSV export and preview it.
df = pd.read_csv(filepath_or_buffer='data/bikeshare_train - bikeshare_train.csv')
df.head(n=5)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 0:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 1:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 2:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 3:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 4:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


### Drop the 'casual' and 'registered' columns. 
These are the two components of the target ('count' = 'casual' + 'registered'), so keeping them as features would leak the answer to the model.

In [4]:
# In the preview rows 'casual' + 'registered' add up to 'count', so they are
# removed before any modelling.
target_components = ['casual', 'registered']
df = df.drop(target_components, axis='columns')
df.head(n=2)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count
0,2011-01-01 0:00:00,1,0,0,1,9.84,14.395,81,0.0,16
1,2011-01-01 1:00:00,1,0,0,1,9.02,13.635,80,0.0,40


### Transform the 'datetime' column into a datetime type 

In [5]:
# Parse the timestamp strings into datetime64 values so the .dt accessor
# can be used on the column in the cells below.
df = df.assign(datetime=pd.to_datetime(df['datetime']))

# Show the row index after the conversion.
df.index

RangeIndex(start=0, stop=10886, step=1)

In [6]:
# Preview the first two rows to confirm how 'datetime' now renders.
df.head(2)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,40


#### Create 3 new columns in the data frame containing the:
- Name of the Month
- Name of the Day of the Week
- Hour of the Day

In [8]:
# Derive the calendar features from the parsed timestamp via the .dt accessor.
stamp = df['datetime'].dt
df = df.assign(
    Month=stamp.month_name(),  # month name, e.g. 'January'
    Day=stamp.day_name(),      # weekday name, e.g. 'Saturday'
    Hour=stamp.hour,           # integer hour of the day
)

df.head(n=2)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,Month,Day,Hour
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,16,January,Saturday,0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,40,January,Saturday,1


#### Make sure all 3 new columns are 'object' datatype so they can be one-hot encoded later.

In [9]:
# List every column's dtype and non-null count to check the types of the new columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   datetime    10886 non-null  datetime64[ns]
 1   season      10886 non-null  int64         
 2   holiday     10886 non-null  int64         
 3   workingday  10886 non-null  int64         
 4   weather     10886 non-null  int64         
 5   temp        10886 non-null  float64       
 6   atemp       10886 non-null  float64       
 7   humidity    10886 non-null  int64         
 8   windspeed   10886 non-null  float64       
 9   count       10886 non-null  int64         
 10  Month       10886 non-null  object        
 11  Day         10886 non-null  object        
 12  Hour        10886 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int64(7), object(2)
memory usage: 1.1+ MB


In [10]:
# 'Hour' was extracted as an integer; store it as 'object' so it matches
# 'Month' and 'Day' and can be one-hot encoded with them later.
df = df.astype({'Hour': 'object'})
df.dtypes['Hour']

dtype('O')

#### Drop the 'datetime' and 'season' columns. These are now redundant.

In [12]:
# Remove the raw timestamp and the season code now that Month/Day/Hour exist.
drop_cols = ['datetime', 'season']
df = df.drop(drop_cols, axis=1)
df.head(n=5)

Unnamed: 0,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,Month,Day,Hour
0,0,0,1,9.84,14.395,81,0.0,16,January,Saturday,0
1,0,0,1,9.02,13.635,80,0.0,40,January,Saturday,1
2,0,0,1,9.02,13.635,80,0.0,32,January,Saturday,2
3,0,0,1,9.84,14.395,75,0.0,13,January,Saturday,3
4,0,0,1,9.84,14.395,75,0.0,1,January,Saturday,4


####  Use `.apply()` and a Lambda function
The temperatures in the 'temp' and 'atemp' columns are in Celsius; convert them to Fahrenheit.

In [17]:
# Celsius -> Fahrenheit, applied element by element.
# Note: this overwrites 'temp', so running the cell a second time would
# convert the already-converted values again.
df = df.assign(temp=df['temp'].apply(lambda celsius: celsius * 9 / 5 + 32))
df['temp']

0        49.712
1        48.236
2        48.236
3        49.712
4        49.712
          ...  
10881    60.044
10882    58.568
10883    57.092
10884    57.092
10885    55.616
Name: temp, Length: 10886, dtype: float64

In [18]:
# Celsius -> Fahrenheit, applied element by element.
# Note: this overwrites 'atemp', so running the cell a second time would
# convert the already-converted values again.
df = df.assign(atemp=df['atemp'].apply(lambda celsius: celsius * 9 / 5 + 32))
df['atemp']

0        57.911
1        56.543
2        56.543
3        57.911
4        57.911
          ...  
10881    67.451
10882    63.365
10883    60.638
10884    63.365
10885    61.997
Name: atemp, Length: 10886, dtype: float64

#### Create a new column, 'temp_variance'

* 'temp_variance' shows how much warmer or colder the current temperature ('temp') is than the average temperature ('atemp') for that day of the year.

*  If the current temperature is warmer than average ('atemp'), the value in 'temp_variance' should be positive.

In [20]:
# Positive where 'temp' is above 'atemp', negative where it is below.
df = df.assign(temp_variance=df['temp'].sub(df['atemp']))
df['temp_variance']

0       -8.199
1       -8.307
2       -8.307
3       -8.199
4       -8.199
         ...  
10881   -7.407
10882   -4.797
10883   -3.546
10884   -6.273
10885   -6.381
Name: temp_variance, Length: 10886, dtype: float64

#### Drop the 'atemp' column.

In [21]:
# 'atemp' has been used to build 'temp_variance'; remove the raw column.
df = df.drop('atemp', axis='columns')

In [22]:
# Preview the first two rows of the engineered frame ('atemp' gone, 'temp_variance' present).
df.head(2)

Unnamed: 0,holiday,workingday,weather,temp,humidity,windspeed,count,Month,Day,Hour,temp_variance
0,0,0,1,49.712,81,0.0,16,January,Saturday,0,-8.199
1,0,0,1,48.236,80,0.0,40,January,Saturday,1,-8.307
