In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _subfolders, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/bike-sharing-demand/sampleSubmission.csv
/kaggle/input/bike-sharing-demand/train.csv
/kaggle/input/bike-sharing-demand/test.csv


In [2]:
# Directory that holds the competition CSV files (trailing slash is required,
# because file names are appended to it directly).
data_path = '/kaggle/input/bike-sharing-demand/'

# Read the training set, the test set and the sample submission template.
train, test, submission = (
    pd.read_csv(f'{data_path}{csv_name}')
    for csv_name in ('train.csv', 'test.csv', 'sampleSubmission.csv')
)

Tutorial Link -> https://www.kaggle.com/code/werooring/ch6-baseline

## Feature Engineering

**Outlier Removal**

In [3]:
# Keep only the training rows whose weather code is not 4.
# Code 4 (heavy rain / ice pellets / thunderstorm / mist, snow / fog) is
# treated as an outlier category and excluded from training.
is_not_weather_4 = train['weather'].ne(4)
train = train.loc[is_not_weather_4]

**Combine Data**

- pd.concat()'s parameter: ignore_index (boolean)
    - Without ignore_index=True, the original indices of train and test are preserved.
    - With ignore_index=True, the indices are reset, and the new DataFrame has a continuous index starting from 0.

In [4]:
# Stack train on top of test with the default ignore_index=False:
# each frame's original row labels are carried over unchanged.
all_data_temp = pd.concat([train, test], ignore_index=False)
all_data_temp

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0000,3.0,13.0,16.0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0000,8.0,32.0,40.0
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0000,5.0,27.0,32.0
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0000,3.0,10.0,13.0
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0000,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
6488,2012-12-31 19:00:00,1,0,1,2,10.66,12.880,60,11.0014,,,
6489,2012-12-31 20:00:00,1,0,1,2,10.66,12.880,60,11.0014,,,
6490,2012-12-31 21:00:00,1,0,1,1,10.66,12.880,60,11.0014,,,
6491,2012-12-31 22:00:00,1,0,1,1,10.66,13.635,56,8.9981,,,


In [5]:
# Same row-wise concatenation, but discard the original row labels so the
# combined frame gets one continuous index starting at 0.
all_data = pd.concat(objs=[train, test], axis=0, ignore_index=True)
all_data

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0000,3.0,13.0,16.0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0000,8.0,32.0,40.0
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0000,5.0,27.0,32.0
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0000,3.0,10.0,13.0
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0000,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
17373,2012-12-31 19:00:00,1,0,1,2,10.66,12.880,60,11.0014,,,
17374,2012-12-31 20:00:00,1,0,1,2,10.66,12.880,60,11.0014,,,
17375,2012-12-31 21:00:00,1,0,1,1,10.66,12.880,60,11.0014,,,
17376,2012-12-31 22:00:00,1,0,1,1,10.66,13.635,56,8.9981,,,


**Add sub features**

In [6]:
from datetime import datetime

# Parse the timestamp strings once and derive every calendar feature from the
# parsed values with vectorized `.dt` accessors. This replaces five per-row
# Python lambdas that each re-split the same string, and it stores year /
# month / hour as integers instead of strings such as '2011' or '00', so the
# model no longer relies on implicit string-to-number coercion.
# NOTE(review): the format below matches the timestamps shown in the data
# preview ('2011-01-01 00:00:00') -- confirm it holds for every row.
parsed_datetime = pd.to_datetime(all_data['datetime'], format='%Y-%m-%d %H:%M:%S')

# date feature (kept as a 'YYYY-MM-DD' string)
all_data['date'] = parsed_datetime.dt.strftime('%Y-%m-%d')
# year feature
all_data['year'] = parsed_datetime.dt.year
# month feature
all_data['month'] = parsed_datetime.dt.month
# hour feature
all_data['hour'] = parsed_datetime.dt.hour
# weekday feature (Monday=0 ... Sunday=6, same convention as datetime.weekday())
all_data['weekday'] = parsed_datetime.dt.weekday

**Delete unnecessary features**

- `axis` in `DataFrame.drop()`
    - axis = 0: the labels refer to the row index, so the matching **rows** are removed
    - axis = 1: the labels refer to column names, so the matching **columns** are removed (this is what we need here; `drop(columns=...)` is equivalent)

In [7]:
# Columns that are not used as model inputs.
drop_features = ['casual', 'registered', 'datetime', 'date', 'month', 'windspeed']

# `columns=` is the keyword form of `axis=1`: remove these columns by name.
all_data = all_data.drop(columns=drop_features)

**Split Data**

In [8]:
# split training and test data
X_train = all_data[~pd.isnull(all_data['count'])]
X_test = all_data[pd.isnull(all_data['count'])]

# remove target value's 'count' feature
X_train = X_train.drop(['count'], axis=1)
X_test = X_test.drop(['count'], axis=1)

y = train['count'] #target val

In [9]:
# Preview the first five rows of the training features.
X_train.head(n=5)

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,year,hour,weekday
0,1,0,0,1,9.84,14.395,81,2011,0,5
1,1,0,0,1,9.02,13.635,80,2011,1,5
2,1,0,0,1,9.02,13.635,80,2011,2,5
3,1,0,0,1,9.84,14.395,75,2011,3,5
4,1,0,0,1,9.84,14.395,75,2011,4,5


## Write a function to calculate the evaluation metric (RMSLE)

In [10]:
import numpy as np

def rmsle(y_true, y_pred, convertExp=True):
    """Compute the Root Mean Squared Logarithmic Error (RMSLE).

    RMSLE = sqrt(mean((log(y_true + 1) - log(y_pred + 1)) ** 2))

    Parameters
    ----------
    y_true : array-like
        Ground-truth values (list, NumPy array, pandas Series, ...).
    y_pred : array-like
        Predicted values, compared element by element (positionally)
        with ``y_true``.
    convertExp : bool, default True
        If True, both inputs are assumed to be on the natural-log scale and
        are passed through ``np.exp`` before the metric is computed.

    Returns
    -------
    float
        The RMSLE of the two inputs.

    Notes
    -----
    Any NaN produced by the logarithm (i.e. a value below -1) is replaced by
    0 via ``np.nan_to_num`` rather than propagated, and infinities are
    replaced by very large finite numbers.
    """
    # Coerce up front so plain Python lists work in both modes; previously
    # `y_true + 1` raised TypeError for lists when convertExp was False.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Undo the log transform if the inputs are on the log scale.
    if convertExp:
        y_true = np.exp(y_true)
        y_pred = np.exp(y_pred)

    # log1p(x) == log(x + 1), but stays accurate when x is close to zero.
    # nan_to_num maps NaN results to 0 so they do not poison the mean.
    log_true = np.nan_to_num(np.log1p(y_true))
    log_pred = np.nan_to_num(np.log1p(y_pred))

    # Root of the mean squared difference on the log scale.
    return np.sqrt(np.mean((log_true - log_pred) ** 2))

## Train Model

In [11]:
from sklearn.linear_model import LinearRegression

# Create an ordinary least-squares linear regression estimator with
# scikit-learn's default settings; it is fitted in the next cell.
linear_reg_model = LinearRegression()

In [12]:
# Train on the natural log of the target rather than on the raw counts.
# Predictions therefore come out on the log scale and must be passed through
# np.exp before they are used as counts (done in the submission cell).
log_y = np.log(y) 
linear_reg_model.fit(X_train, log_y) # fit the regression on the training features

## Evaluate Model Performance

In [13]:
# In-sample predictions (log scale): the model is evaluated on the same rows
# it was trained on, so the score below is a training error, not a
# validation error.
preds = linear_reg_model.predict(X_train)

In [14]:
# Both arguments are on the log scale, so let rmsle() exponentiate them first.
train_rmsle = rmsle(log_y, preds, True)
print(f'RMSLE val of linear regression : {train_rmsle:.4f}')

RMSLE val of linear regression : 1.0205


## Submit Result

In [15]:
# Predict on the test features; the model outputs values on the log scale.
linearreg_preds = linear_reg_model.predict(X_test)

# Undo the log transform to get counts, then fill the submission template.
predicted_counts = np.exp(linearreg_preds)
submission['count'] = predicted_counts

# Write the submission file without the DataFrame index column.
submission.to_csv('submission.csv', index=False)