# Seoul Bike Sharing Demand - Prepare Data for Modeling

### Contents
1. Load Data
2. Feature Transformation
3. Save Data

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

## 1. Load Data

In [13]:
# Read the pickled bike-sharing DataFrame from the processed-data folder
df_bike = pd.read_pickle("../input/processed/df_bike.pkl")
# Show the first five rows as a quick sanity check of the load
df_bike.head(n=5)

Unnamed: 0,date,rented_bike_count,hour,temperature,humidity,wind_speed,visibility,dew_point_temperature,solar_radiation,rainfall,...,holiday,functioning_day,year,month,day,dayofweek,day_of_week,day_of_year,week_of_year,date_index
0,2017-12-01,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,...,No Holiday,Yes,2017,12,1,4,4,335,48,0
1,2017-12-01,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,...,No Holiday,Yes,2017,12,1,4,4,335,48,0
2,2017-12-01,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,...,No Holiday,Yes,2017,12,1,4,4,335,48,0
3,2017-12-01,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,...,No Holiday,Yes,2017,12,1,4,4,335,48,0
4,2017-12-01,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,...,No Holiday,Yes,2017,12,1,4,4,335,48,0


In [14]:
# Inspect column names, non-null counts and dtypes of the loaded frame
df_bike.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8760 entries, 0 to 8759
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   date                   8760 non-null   datetime64[ns]
 1   rented_bike_count      8760 non-null   int64         
 2   hour                   8760 non-null   int64         
 3   temperature            8760 non-null   float64       
 4   humidity               8760 non-null   int64         
 5   wind_speed             8760 non-null   float64       
 6   visibility             8760 non-null   int64         
 7   dew_point_temperature  8760 non-null   float64       
 8   solar_radiation        8760 non-null   float64       
 9   rainfall               8760 non-null   float64       
 10  snowfall               8760 non-null   float64       
 11  seasons                8760 non-null   category      
 12  holiday                8760 non-null   category      
 13  fun

## 2. Feature Transformation

In [15]:
# Let's convert 'hour' to a categorical variable so that it is selected together
# with the other category-dtype columns when they are encoded below
df_bike['hour'] = df_bike['hour'].astype('category')

In [16]:
# Let's create a function to one-hot encode categorical variables using pd.get_dummies
def encode_categorical(df, cols):
    """One-hot encode the given columns of ``df`` with ``pd.get_dummies``.

    Every column named in ``cols`` is removed from the result and replaced by
    one ``uint8`` indicator column per category. Indicator columns are named
    ``<prefix>_<category>``, where ``<prefix>`` is the column name lower-cased
    with spaces replaced by underscores. They are appended after the remaining
    columns, grouped in the order the columns appear in ``cols``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to encode. It is not modified.
    cols : str or iterable of column labels
        Column(s) to encode. A single string is treated as one column name
        rather than being iterated character by character. Repeated labels
        are encoded once.

    Returns
    -------
    pandas.DataFrame
        New frame with the encoded columns replaced by their indicators.
        When ``cols`` is empty, ``df`` itself is returned unchanged.

    Raises
    ------
    KeyError
        If a label in ``cols`` is not a column of ``df``.
    """
    # A bare string would otherwise be looped over one character at a time.
    if isinstance(cols, str):
        cols = [cols]
    # Materialise (cols may be a pandas Index) and drop repeats, keeping order.
    cols = list(dict.fromkeys(cols))
    if not cols:
        return df

    # Build every indicator block first so the frame is dropped/concatenated
    # only once instead of once per column.
    dummy_frames = [
        pd.get_dummies(
            df[col],
            # str() lets non-string labels (e.g. integers) get a prefix too
            prefix=str(col).lower().replace(" ", "_"),
            dtype=np.uint8,
        )
        for col in cols
    ]
    return pd.concat([df.drop(columns=cols), *dummy_frames], axis=1)

In [17]:
# Collect the labels of every column whose dtype is 'category'
cat_cols = df_bike.select_dtypes(include=['category']).columns
# Swap each of those columns for its dummy/indicator columns
df_bike_encoded = encode_categorical(df=df_bike, cols=cat_cols)

In [20]:
# Preview the encoded frame to confirm the category columns became indicator columns
df_bike_encoded.head()

Unnamed: 0,date,rented_bike_count,temperature,humidity,wind_speed,visibility,dew_point_temperature,solar_radiation,rainfall,snowfall,...,hour_22,hour_23,seasons_Autumn,seasons_Spring,seasons_Summer,seasons_Winter,holiday_Holiday,holiday_No Holiday,functioning_day_No,functioning_day_Yes
0,2017-12-01,254,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,...,0,0,0,0,0,1,0,1,0,1
1,2017-12-01,204,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,...,0,0,0,0,0,1,0,1,0,1
2,2017-12-01,173,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,...,0,0,0,0,0,1,0,1,0,1
3,2017-12-01,107,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,...,0,0,0,0,0,1,0,1,0,1
4,2017-12-01,78,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,...,0,0,0,0,0,1,0,1,0,1


In [25]:
# Let's drop the date column, since we have already extracted the time features.
# errors='ignore' keeps this cell idempotent: re-running it after 'date' is
# already gone is a no-op instead of a KeyError.
df_bike_encoded = df_bike_encoded.drop(columns='date', errors='ignore')

In [26]:
# Check the (rows, columns) dimensions of the encoded frame after dropping 'date'
df_bike_encoded.shape

(8760, 49)

## 3. Save Data

In [27]:
# Persist the encoded frame as a pickle in the same processed-data folder the input was read from
df_bike_encoded.to_pickle("../input/processed/df_bike_encoded.pkl")