In [2]:
# Imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler,OrdinalEncoder
from sklearn.decomposition import PCA
from sklearn import set_config
# Raise pandas' display limits: show up to 200 columns when rendering a frame,
# and lift the row/column thresholds DataFrame.info() uses to 800.
pd.options.display.max_columns = 200
pd.options.display.max_info_rows = 800
pd.options.display.max_info_columns = 800
# Ask scikit-learn transformers to emit pandas objects from transform()
set_config(transform_output='pandas')

In [3]:
# Load the bikeshare training data and list its columns and dtypes
df = pd.read_csv(filepath_or_buffer="Data/bikeshare_train - bikeshare_train.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
 #   Column      Dtype  
---  ------      -----  
 0   datetime    object 
 1   season      int64  
 2   holiday     int64  
 3   workingday  int64  
 4   weather     int64  
 5   temp        float64
 6   atemp       float64
 7   humidity    int64  
 8   windspeed   float64
 9   casual      int64  
 10  registered  int64  
 11  count       int64  
dtypes: float64(3), int64(8), object(1)
memory usage: 1020.7+ KB


In [4]:
# Preview the first five rows of the freshly loaded frame
df.head(n=5)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 0:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 1:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 2:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 3:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 4:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [5]:
# 'casual' + 'registered' add up to 'count' in the rows previewed above,
# so both are redundant with 'count' and are removed here.
df = df.drop(labels=['casual', 'registered'], axis='columns')
df.head(n=5)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count
0,2011-01-01 0:00:00,1,0,0,1,9.84,14.395,81,0.0,16
1,2011-01-01 1:00:00,1,0,0,1,9.02,13.635,80,0.0,40
2,2011-01-01 2:00:00,1,0,0,1,9.02,13.635,80,0.0,32
3,2011-01-01 3:00:00,1,0,0,1,9.84,14.395,75,0.0,13
4,2011-01-01 4:00:00,1,0,0,1,9.84,14.395,75,0.0,1


In [6]:
# 'datetime' was read in as object (see the info() listing above);
# parse it into real timestamps so the .dt accessor can be used on it.
df['datetime'] = df['datetime'].pipe(pd.to_datetime)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 10 columns):
 #   Column      Dtype         
---  ------      -----         
 0   datetime    datetime64[ns]
 1   season      int64         
 2   holiday     int64         
 3   workingday  int64         
 4   weather     int64         
 5   temp        float64       
 6   atemp       float64       
 7   humidity    int64         
 8   windspeed   float64       
 9   count       int64         
dtypes: datetime64[ns](1), float64(3), int64(6)
memory usage: 850.6 KB


In [9]:
# Derive three categorical features from 'datetime': month name, weekday name
# and hour of day. Each one is cast to pandas' 'string' dtype in the same step
# (hour included) so they can all be one hot encoded later.
df['month'] = df['datetime'].dt.month_name().astype('string')
df['day'] = df['datetime'].dt.day_name().astype('string')
df['hour'] = df['datetime'].dt.hour.astype('string')
df.head(n=3)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,month,day,hour
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,16,January,Saturday,0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,40,January,Saturday,1
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,32,January,Saturday,2


In [10]:
# 'datetime' has been broken out into month/day/hour, so it is no longer needed.
# 'season' is dropped alongside it (presumably because 'month' already carries
# that information - confirm).
df = df.drop(labels=["datetime", "season"], axis='columns')
df.head(n=3)

Unnamed: 0,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,month,day,hour
0,0,0,1,9.84,14.395,81,0.0,16,January,Saturday,0
1,0,0,1,9.02,13.635,80,0.0,40,January,Saturday,1
2,0,0,1,9.02,13.635,80,0.0,32,January,Saturday,2


In [11]:
# The 'temp' & 'atemp' features have celsius measurements when we want fahrenheit.
# Names of the temperature columns, plus a reusable conversion helper.
temps = ['temp','atemp']
def c_to_f(temps):
    """Convert Celsius temperature value(s) to Fahrenheit.

    Parameters
    ----------
    temps : scalar, list, tuple, numpy.ndarray, pandas.Series or pandas.DataFrame
        Numeric Celsius value(s). Note that this parameter shadows the
        module-level ``temps`` list of column names inside the function.

    Returns
    -------
    The Fahrenheit equivalent, ``temps * 1.8 + 32``. Scalars, arrays, Series
    and DataFrames keep their type; lists and tuples come back as a
    ``numpy.ndarray``.

    Raises
    ------
    ValueError
        If a list/tuple holds values that cannot be interpreted as floats
        (for example column names instead of measurements).

    Example
    -------
    ``c_to_f(df[['temp', 'atemp']])`` converts both columns at once.
    """
    # Plain Python sequences do not support element-wise arithmetic,
    # so promote them to a float array first.
    if isinstance(temps, (list, tuple)):
        temps = np.asarray(temps, dtype=float)
    # Arithmetic is vectorised for numpy and pandas inputs.
    return temps * 1.8 + 32

In [15]:
# Convert 'temp' from Celsius to Fahrenheit with vectorised Series arithmetic
# (same result as a per-element lambda, without the Python-level loop).
# NOTE: this overwrites 'temp' in place, so re-running the cell converts the
# already converted values a second time - run it once per fresh load.
df['temp'] = df['temp'] * 1.8 + 32

In [16]:
# Spot-check the first five converted 'temp' values
df['temp'].head(n=5)

0    49.712
1    48.236
2    48.236
3    49.712
4    49.712
Name: temp, dtype: float64

In [17]:
# Convert 'atemp' from Celsius to Fahrenheit with vectorised Series arithmetic
# (same result as a per-element lambda, without the Python-level loop).
# NOTE: this overwrites 'atemp' in place, so re-running the cell converts the
# already converted values a second time - run it once per fresh load.
df['atemp'] = df['atemp'] * 1.8 + 32
df['atemp'].head()

0    57.911
1    56.543
2    56.543
3    57.911
4    57.911
Name: atemp, dtype: float64

In [19]:
# With both columns converted, add a feature holding the element-wise
# difference 'temp' - 'atemp'.
# NOTE(review): despite its name, 'temp_variance' is a plain difference, not a
# statistical variance; whether 'atemp' represents an average temperature is
# not shown in this notebook - confirm against the dataset documentation.
df['temp_variance'] = df['temp'].sub(df['atemp'])
df['temp_variance']

0       -8.199
1       -8.307
2       -8.307
3       -8.199
4       -8.199
         ...  
10881   -7.407
10882   -4.797
10883   -3.546
10884   -6.273
10885   -6.381
Name: temp_variance, Length: 10886, dtype: float64

In [20]:
# 'atemp' can be rebuilt from 'temp' and 'temp_variance', so remove it
df = df.drop(labels="atemp", axis='columns')
df.head(n=1)

Unnamed: 0,holiday,workingday,weather,temp,humidity,windspeed,count,month,day,hour,temp_variance
0,0,0,1,49.712,81,0.0,16,January,Saturday,0,-8.199
