In [26]:
import pandas as pd
import numpy as np
import os
import tqdm 
import logging
import matplotlib.pyplot as plt

# Data Preprocessing
1. Please convert the categorical variables into dummy variables (e.g., season, weathersit, month, hour, weekday).
2. Normalize continuous variables using z-score (mean=0,sd=1).
3. Exclude the useless features in your training and modeling.
4. Separate the training and validation data. Use the last 21 days’ data for
validation. Note that the target column is “cnt”. The other two columns, “casual” and
“registered”, can be ignored and should not be used as variables in your
code. 

In [31]:
class DataPreProcess(object):
    """Preprocess the bike-ridership CSV into per-variable feature files.

    On construction the source CSV is read into ``RawData``. Depending on the
    flags, one CSV per categorical variable (dummy columns) and one CSV per
    continuous variable (z-scored values) is written below ``DataSaveFolder``.
    Every written file also carries the ``instant`` column as the row key.

    ``IfExclude`` is stored on the instance but no method of this class acts on
    it, and ``IfSeparate`` only causes the ``Separated`` folder to be created.
    """
    SourceDataPath='bikeRidershipPredictionDataHour.csv'
    DataSaveFolder='DataPreProcessed'
    # The categorical variables
    CategoryVariables=['season','yr','mnth','hr','holiday','weekday','workingday','weathersit']
    # The continuous variables
    ContinuousVariables=['temp','atemp','hum','windspeed']
    # The target variable
    TargetVariable='cnt'

    def __init__(self,IfCheckInfo=False,IfDummy=True,IfNormalize=True,IfExclude=True,IfSeparate=True):
        """Load the source data and run the enabled preprocessing steps.

        Args:
            IfCheckInfo: print ``RawData.info()`` after loading.
            IfDummy: create the ``Dummy`` folder and write the dummy-variable files.
            IfNormalize: create the ``Normalize`` folder and write the normalized files.
            IfExclude: stored as ``self.IfExclude``; not used by this class.
            IfSeparate: create the ``Separated`` folder (nothing is written to it here).
        """
        # Read the source data
        self.RawData=pd.read_csv(self.SourceDataPath)
        # Check the basic information of the source data
        if IfCheckInfo:
            self.RawData.info()
        # Set the parameters for preprocessing
        self.IfDummy=IfDummy
        self.IfNormalize=IfNormalize
        self.IfExclude=IfExclude
        self.IfSeparate=IfSeparate
        # Create the folders for saving the preprocessed data
        self.FileSystemMaker()
        if self.IfDummy:
            self.DummyVariables()
        if self.IfNormalize:
            self.NormalizeVariables()

    def FileSystemMaker(self):
        """Define the output folder paths and create the enabled folders.

        The three sub-folder attributes are always set, so the processing
        methods can be called later even when their flag was off at
        construction time. Only the folders whose flag is true are created.
        """
        self.DummyFolder=os.path.join(self.DataSaveFolder,'Dummy')
        self.NormalizeFolder=os.path.join(self.DataSaveFolder,'Normalize')
        self.SeparateFolder=os.path.join(self.DataSaveFolder,'Separated')
        # exist_ok=True avoids the check-then-create race of os.path.exists + makedirs
        os.makedirs(self.DataSaveFolder,exist_ok=True)
        for Enabled,Folder in (
            (self.IfDummy,self.DummyFolder),
            (self.IfNormalize,self.NormalizeFolder),
            (self.IfSeparate,self.SeparateFolder),
        ):
            if Enabled:
                os.makedirs(Folder,exist_ok=True)

    def DummyVariables(self):
        """Write one CSV of dummy columns per categorical variable.

        Each file ``<DummyFolder>/<variable>.csv`` contains the ``instant`` key
        column followed by one 0/1 column per distinct value of the variable,
        named ``<variable>_<value>``.
        """
        # Make sure the target folder exists when this method is called directly
        os.makedirs(self.DummyFolder,exist_ok=True)
        for i in tqdm.tqdm(self.CategoryVariables,desc='Creating Dummy Variables'):
            # dtype=int keeps the indicators numeric (0/1) regardless of the
            # pandas version's default dtype for get_dummies
            DummyData=pd.get_dummies(self.RawData[i],drop_first=False,prefix=i,dtype=int)
            # Keep only the key variable next to the dummy variables
            DummyResult=pd.concat([self.RawData[['instant']],DummyData],axis=1)
            DummyResult.to_csv(os.path.join(self.DummyFolder,i+'.csv'),index=False)

    def NormalizeVariables(self,target_mean=0,target_sd=1):
        """Write one CSV of standardized values per continuous variable.

        Each column is z-scored with its own mean and standard deviation
        (``Series.std()``, i.e. pandas' default sample standard deviation) and
        then rescaled to ``target_mean`` / ``target_sd``.

        Args:
            target_mean: mean of the written column.
            target_sd: standard deviation of the written column.
        """
        # Make sure the target folder exists when this method is called directly
        os.makedirs(self.NormalizeFolder,exist_ok=True)
        for i in tqdm.tqdm(self.ContinuousVariables,desc='Normalizing Variables'):
            Mean=self.RawData[i].mean()
            SD=self.RawData[i].std()
            # A constant column (SD == 0) or a single-row column (SD is NaN)
            # would otherwise turn into NaN; centre it without rescaling instead
            if pd.isna(SD) or SD==0:
                SD=1.0
            NormalizedData=(self.RawData[i]-Mean)/SD
            NormalizedData=NormalizedData*target_sd+target_mean
            NormalizedData=pd.concat([self.RawData['instant'],NormalizedData],axis=1)
            NormalizedData.to_csv(os.path.join(self.NormalizeFolder,i+'.csv'),index=False)

In [34]:
# Load the source data with every preprocessing step switched off; only print its basic information
RawData=DataPreProcess(
    IfCheckInfo=True,
    IfDummy=False,
    IfNormalize=False,
    IfExclude=False,
    IfSeparate=False,
).RawData

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     17379 non-null  int64  
 1   dteday      17379 non-null  object 
 2   season      17379 non-null  int64  
 3   yr          17379 non-null  int64  
 4   mnth        17379 non-null  int64  
 5   hr          17379 non-null  int64  
 6   holiday     17379 non-null  int64  
 7   weekday     17379 non-null  int64  
 8   workingday  17379 non-null  int64  
 9   weathersit  17379 non-null  int64  
 10  temp        17379 non-null  float64
 11  atemp       17379 non-null  float64
 12  hum         17379 non-null  float64
 13  windspeed   17379 non-null  float64
 14  casual      17379 non-null  int64  
 15  registered  17379 non-null  int64  
 16  cnt         17379 non-null  int64  
dtypes: float64(4), int64(12), object(1)
memory usage: 2.3+ MB


In [30]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
RawData.describe()

Unnamed: 0,instant,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
count,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0
mean,8690.0,2.50164,0.502561,6.537775,11.546752,0.02877,3.003683,0.682721,1.425283,0.496987,0.475775,0.627229,0.190098,35.676218,153.786869,189.463088
std,5017.0295,1.106918,0.500008,3.438776,6.914405,0.167165,2.005771,0.465431,0.639357,0.192556,0.17185,0.19293,0.12234,49.30503,151.357286,181.387599
min,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.02,0.0,0.0,0.0,0.0,0.0,1.0
25%,4345.5,2.0,0.0,4.0,6.0,0.0,1.0,0.0,1.0,0.34,0.3333,0.48,0.1045,4.0,34.0,40.0
50%,8690.0,3.0,1.0,7.0,12.0,0.0,3.0,1.0,1.0,0.5,0.4848,0.63,0.194,17.0,115.0,142.0
75%,13034.5,3.0,1.0,10.0,18.0,0.0,5.0,1.0,2.0,0.66,0.6212,0.78,0.2537,48.0,220.0,281.0
max,17379.0,4.0,1.0,12.0,23.0,1.0,6.0,1.0,4.0,1.0,1.0,1.0,0.8507,367.0,886.0,977.0
