# **Creating the Machine Learning or Deep Learning Model**

Import packages - Will be focusing on the Pytorch implementation for further customizability in model architecture and training

In [1]:
# Pytorch
import torch                        # for all things Pytorch
import torch.nn as nn               # for torch.nn Module, the Parent object for Pytorch models
import torch.nn.functional as F     # for the activation function
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset, random_split
print(f"Torch version: {torch.__version__}")
# Check CUDA availability
print(f"Is there CUDA available = {torch.cuda.is_available()}")
gpu_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Numpy
import numpy as np
print(f"NumPy version: {np.__version__}")

# Matplotlib
import matplotlib as mpl
import matplotlib.pyplot as plt
print(f"Matplotlib version: {mpl.__version__}")

# Pandas
import pandas as pd
print(f"Pandas version: {pd.__version__}")

# Scikit-Learn
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso
print(f"Scikit-learn version: {sk.__version__}")


Torch version: 2.4.1+cu118
Is there CUDA available = True
NumPy version: 1.26.4
Matplotlib version: 3.9.2
Pandas version: 2.2.3
Scikit-learn version: 1.5.2


Load in Data

In [2]:
data = pd.read_csv("SeoulBikeData.csv", encoding='ISO-8859-1')
print(f"Dataframe Columns: {data.columns.values}")
data.head(2)

Dataframe Columns: ['Date' 'Rented Bike Count' 'Hour' 'Temperature(C)' 'Humidity(%)'
 'Wind speed (m/s)' 'Visibility (10m)' 'Dew point temperature(ï¿½C)'
 'Solar Radiation (MJ/m2)' 'Rainfall(mm)' 'Snowfall (cm)' 'Seasons'
 'Holiday' 'Functioning Day']


Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(ï¿½C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,01/12/2017,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
1,01/12/2017,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes


# Model Creation
Goal: Create a Model that is able to predict the total number of bike rentals in Seoul given 24 hours, which results in 24 different predictions where we kinda model the curve that reprsents Rentals per hour.
Additionally, we might consider an LSTM network to consider past hour rentals. Additionally, we have to answer the question if the data is dependent on previous hours or not.

Additional Models:
- One model for predicting the amount of bikes for a specific HOUR
- One model for predicting the amount of bikes for a specific DAY


Considerations:

- *LSTM Network possibly* ------ MAIN GOAL for us to predict the curve of bike rentals from a week of data beforehand
- Deep Regression Netowrk(MLP) for Regression
- Linear Regression ML Model

Most Important Features Ranked 1(Most Important) - 3(Least Important): These are Assumptions
- Rented Bike Count (TARGET VARIABLE)
- Hour                      1
- Temperature               1
- Humidity                  2
- Wind Speed                3
- Visibility                3
- Dew Point Temperature     1
- Solar Radiation           1
- Rainfall                  1
- Snowfall                  1
- Seasons                   3
- Holiday                   3
- Functioning Day           2

# *Preprocess and set-up data*

One Hot encoding the seasons, and binary encoding the holiday and functional day

THE HOURLY BASED DATAFRAME

In [3]:
########## RUN ONLY ONCE ######################

# One-hot encode column 'SEASON'
one_hot_encoded = pd.get_dummies(data['Seasons'], prefix='Season', drop_first=False)
print(data['Seasons'].unique())
# Concatenate the original DataFrame with the one-hot encoded columns of SEASONS
data = pd.concat([data, one_hot_encoded], axis=1)

# Binary encoding of the Functioning day YES or NO
data['Functioning Day'] = (data['Functioning Day'] == 'Yes').astype(int)

# Binary encoding of the Holiday day NO HOLIDAY or HOLIDAY
data['Holiday'] = (data['Holiday'] == 'No Holiday').astype(int)

['Winter' 'Spring' 'Summer' 'Autumn']


In [4]:
# COLUMNS TO CONVERT TO BINARY: 0 or 1
bool_cols = ['Season_Autumn', 'Season_Spring','Season_Summer', 'Season_Winter']

# Actually changing to Binary
data[bool_cols] = data[bool_cols].astype(int)

data.head(2)

Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(ï¿½C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day,Season_Autumn,Season_Spring,Season_Summer,Season_Winter
0,01/12/2017,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,1,1,0,0,0,1
1,01/12/2017,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,1,1,0,0,0,1


Create Features and Target units

In [10]:
X = data.drop(columns=['Rented Bike Count', 'Date', 'Seasons'])
y = data['Rented Bike Count']

Setting up Threshold Value

In [11]:
threshold = 350

Create Dataset Loader Class

In [12]:
class BikeRentalData(Dataset):
    def __init__(self, features, targets):
        self.features = torch.tensor(features.values, dtype=torch.float32)
        self.targets = torch.tensor(targets.values, dtype=torch.float32)

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        return self.features[idx], self.targets[idx]
    
# Create Dataset and DataLoader
dataset = BikeRentalData(X, y)

# Define split sizes
train_size = int(0.7 * len(dataset))  # 70% for training
valid_size = int(0.15 * len(dataset))  # 15% for validation
test_size = len(dataset) - train_size - valid_size  # 15% for testing

# Split the dataset
train_dataset, valid_dataset, test_dataset = random_split(dataset, [train_size, valid_size, test_size])

# Create DataLoaders for each dataset
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=16, shuffle=False) # Want consistent results, not random
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False) # Want consistent results, not random

Create Training Loop

In [13]:
def train_model(model, dataloader, criterion, optimizer, num_epochs=35, device='cuda'):
    model.to(device)  # Move model to the specified device
    model.train()  # Set the model to training mode

    for epoch in range(num_epochs):
        running_loss = 0.0
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)  # Move data to the specified device

            # Zero the gradients and prevent accumulation of gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            # Backward pass
            loss.backward()
            optimizer.step()

            # Accumulate loss
            running_loss += loss.item()

        # Print average loss for this epoch
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(dataloader):.4f}")
    
# Validation (optional)
def validate_model(model, dataloader, criterion, device='cuda'):
    model.eval()  # Set the model to evaluation mode

    total_loss = 0.0
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)  # Move data to the specified device
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            total_loss += loss.item()
    print(f'Validation Loss: {total_loss / len(dataloader):.4f}')

# Testing (optional)
def test_model(model, dataloader, device='cuda'):
    model.eval()  # Set the model to evaluation mode
    all_outputs = []  # List to store outputs
    all_targets = []  # List to store targets

    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs = inputs.to(device)  # Move data to the specified device
            outputs = model(inputs)

            # Store outputs and targets
            all_outputs.append(outputs.cpu())  # Move outputs to CPU and append
            all_targets.append(targets.cpu())  # Move targets to CPU and append

            print(f'Inputs: {inputs.cpu().numpy()}, Predicted: {outputs.cpu().numpy()}, Targets: {targets.cpu().numpy()}')

    # Concatenate all outputs and targets
    all_outputs = torch.cat(all_outputs).numpy()  # Convert to NumPy array
    all_targets = torch.cat(all_targets).numpy()  # Convert to NumPy array

    # Here you can add code to evaluate your model's predictions

    print(f"Accuracy: {np.mean(np.abs(all_outputs - all_targets) <= threshold)}")


# *Deep Neural Network Regression Model Architecture*


Flow:

    - Train on many hours worth of data(Amount of bike rentals) given inputs like hour

    - Test how regression works when given a certain hour

In [14]:
BATCH_SIZE = 32
DIM_IN = 15
DIM_OUT = 1

class RentalDNNRegression(nn.Module):
    
    def __init__(self):
        super(RentalDNNRegression, self).__init__()
        # Creating Layers
        self.lin1 = nn.Linear(DIM_IN, 512)
        self.lin2 = nn.Linear(512, 256)
        self.lin3 = nn.Linear(256, 128)
        self.lin4 = nn.Linear(128, 128)
        self.lin5 = nn.Linear(128,32)
        self.out = nn.Linear(32, DIM_OUT)
        
    def forward(self, x):
        # Calculation Portion
        x = F.relu(self.lin1(x))
        x = F.relu(self.lin2(x))
        x = F.relu(self.lin3(x))
        x = F.relu(self.lin4(x))
        x = F.relu(self.lin5(x))
        x = self.out(x)  # Call the final layer

        return x
    
# Instantiating Model "object"
rentalMLPReg = RentalDNNRegression()

# Moving model to GPU memory if available
gpu_device = 'cuda' if torch.cuda.is_available() else 'cpu'
rentalMLPReg.to(gpu_device)

# Loss Function Criterion
criterion = nn.MSELoss()

# Optimizer Instantiation
optimizer = optim.Adam(params=rentalMLPReg.parameters(), lr=0.01)

print(rentalMLPReg)

RentalDNNRegression(
  (lin1): Linear(in_features=15, out_features=512, bias=True)
  (lin2): Linear(in_features=512, out_features=256, bias=True)
  (lin3): Linear(in_features=256, out_features=128, bias=True)
  (lin4): Linear(in_features=128, out_features=128, bias=True)
  (lin5): Linear(in_features=128, out_features=32, bias=True)
  (out): Linear(in_features=32, out_features=1, bias=True)
)


Train the Model??

In [15]:
train_model(rentalMLPReg, train_loader, criterion, optimizer, num_epochs=25, device=gpu_device)

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [1/25], Loss: 479292.4312
Epoch [2/25], Loss: 440222.6045
Epoch [3/25], Loss: 438913.3854
Epoch [4/25], Loss: 435729.9003
Epoch [5/25], Loss: 434929.6739
Epoch [6/25], Loss: 432471.7427
Epoch [7/25], Loss: 429937.4566
Epoch [8/25], Loss: 427467.1617
Epoch [9/25], Loss: 426996.8974
Epoch [10/25], Loss: 428687.2632
Epoch [11/25], Loss: 426806.7814
Epoch [12/25], Loss: 429277.1354
Epoch [13/25], Loss: 423633.3737
Epoch [14/25], Loss: 428793.3584
Epoch [15/25], Loss: 424091.4339
Epoch [16/25], Loss: 428224.6099
Epoch [17/25], Loss: 427780.9224
Epoch [18/25], Loss: 424347.9705
Epoch [19/25], Loss: 429419.3119
Epoch [20/25], Loss: 422919.9510
Epoch [21/25], Loss: 423049.1474
Epoch [22/25], Loss: 427023.4978
Epoch [23/25], Loss: 425540.0646
Epoch [24/25], Loss: 422921.7491
Epoch [25/25], Loss: 425712.5195


In [16]:
test_model(rentalMLPReg, test_loader, device=gpu_device)
validate_model(rentalMLPReg, valid_loader, criterion, device=gpu_device)

Inputs: [[ 1.700e+01  2.500e+01  4.400e+01  1.300e+00  1.883e+03  1.180e+01
   3.600e-01  0.000e+00  0.000e+00  1.000e+00  0.000e+00  1.000e+00
   0.000e+00  0.000e+00  0.000e+00]
 [ 2.100e+01  2.530e+01  5.500e+01  1.400e+00  2.000e+03  1.560e+01
   0.000e+00  0.000e+00  0.000e+00  1.000e+00  1.000e+00  0.000e+00
   0.000e+00  1.000e+00  0.000e+00]
 [ 1.400e+01 -7.000e-01  3.900e+01  8.000e-01  8.260e+02 -1.290e+01
   3.500e-01  0.000e+00  0.000e+00  1.000e+00  1.000e+00  0.000e+00
   0.000e+00  0.000e+00  1.000e+00]
 [ 1.500e+01  7.600e+00  4.100e+01  2.300e+00  1.953e+03 -4.800e+00
   7.400e-01  0.000e+00  0.000e+00  1.000e+00  1.000e+00  0.000e+00
   0.000e+00  0.000e+00  1.000e+00]
 [ 8.000e+00 -1.140e+01  4.700e+01  3.300e+00  2.000e+03 -2.040e+01
   0.000e+00  0.000e+00  0.000e+00  1.000e+00  1.000e+00  0.000e+00
   0.000e+00  0.000e+00  1.000e+00]
 [ 1.100e+01  2.020e+01  4.500e+01  1.100e+00  1.894e+03  7.800e+00
   2.330e+00  0.000e+00  0.000e+00  1.000e+00  1.000e+00  0.000e

  return F.mse_loss(input, target, reduction=self.reduction)


# *Linear Regression Model Architecture*

Creating Threshold Value for margin of error in Predictions

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
lin_reg = LinearRegression()

lin_reg.fit(X_train, y_train)

y_pred_linreg = lin_reg.predict(X_test)

matchingAvgLinreg = np.mean(np.abs(y_pred_linreg - y_test) <= threshold)

print(f"Average accuracy with a threshold value for LINEAR REGRESSION: {matchingAvgLinreg}")

lasso = Lasso()

lasso.fit(X_train, y_train)

y_pred_lasso = lasso.predict(X_test)

matchingAvgLasso = np.mean(np.abs(y_pred_lasso - y_test) <= threshold)

print(f"Average accuracy with a threshold value for LASSO REGRESSION: {matchingAvgLasso}")

Can't capture the complexities of the features obviously as many factors can cause shift in bike demand such as previous hour's bike demand.

# *LSTM Deep Neural Network Architecture*