<a href="https://colab.research.google.com/github/Sebbemars/MSCI546Project/blob/main/546_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Install PyTorch Lightning**

In [None]:
!pip install pytorch_lightning matplotlib

# **Import Packages**

In [None]:
import numpy as np
import pandas as pd 
from pandas import DataFrame
import matplotlib.pyplot as plt
import argparse
import datetime
import os
import random
import time
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
#from pytorch_lightning.profiler import SimpleProfiler
from pytorch_lightning.loggers import TensorBoardLogger
import torch.nn as nn
import torch
import pytorch_lightning as pl
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Subset
from torch.utils.data.sampler import SubsetRandomSampler

In [None]:
#Create RMSLE

class RMSLELoss(nn.Module):
    """Root mean squared logarithmic error (RMSLE).

    Computes ``sqrt(mean((log(1 + pred) - log(1 + actual)) ** 2))`` over all
    elements of the inputs. Both inputs must be greater than -1 element-wise
    for the logarithm to be defined.
    """

    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()

    def forward(self, pred, actual):
        """Return the scalar RMSLE between ``pred`` and ``actual``."""
        # log1p(x) evaluates log(1 + x) without first rounding 1 + x, so
        # inputs much smaller than 1 are not flattened to log(1) == 0.
        return torch.sqrt(self.mse(torch.log1p(pred), torch.log1p(actual)))

# **Read Data**

In [None]:
# Mount Google Drive (Colab only) so the CSV files under drive/MyDrive can be
# read and written by the cells below.
from google.colab import drive
drive.mount('/content/drive')
# NOTE(review): `path` is not referenced by any other cell visible in this
# notebook -- confirm whether it is still needed.
path = 'drive/MyDrive/output.csv'

In [None]:
# Load the two CSV files; the trailing numbers are the author's row counts.
raw_data_train1 = pd.read_csv('drive/MyDrive/train.csv') # 10886 rows
raw_data_test = pd.read_csv('drive/MyDrive/test.csv') # 6493 rows

# NOTE: despite its name, raw_data_train holds the train rows followed by the
# test rows. pd.concat keeps each frame's own index by default, so index
# labels repeat; rows are selected by position (.iloc) later in the notebook.
raw_data_train = pd.concat([raw_data_train1,raw_data_test])


#raw_data_train.describe()



In [None]:
# Write the combined (train + test) frame to Drive as UTF-8 with a BOM.
with open('drive/MyDrive/totaloutput.csv', 'w', encoding = 'utf-8-sig') as f:
  raw_data_train.to_csv(f)

In [None]:
# Write only the first rows (DataFrame.head()) of the combined frame to Drive,
# again as UTF-8 with a BOM.
with open('drive/MyDrive/head.csv', 'w', encoding = 'utf-8-sig') as f:
  raw_data_train.head().to_csv(f)


# **Preprocess Data**

In [None]:
# The data mixes categorical and numerical columns; only the numerical
# (int64 first, then float64) columns are kept here.
column_dtypes = raw_data_train.dtypes
numeric_columns = [
    name
    for wanted_dtype in (np.int64, np.float64)
    for name in column_dtypes[column_dtypes == wanted_dtype].index
]

# atemp is dropped: the author considers it redundant with temp.
numeric_columns.remove('atemp')

numeric_data_tr = DataFrame(raw_data_train, columns=numeric_columns)

# Names of the columns that contain at least one NaN.
has_nan = pd.isna(numeric_data_tr).any(axis=0)
nan_columns_tr = [name for name, flag in has_nan.items() if flag]

# Replace every NaN in those columns with 0.
for col in nan_columns_tr:
    numeric_data_tr[col] = numeric_data_tr[col].fillna(0)



In [None]:
# Features are every numeric column except the target ('count') and the three
# columns removed alongside it. (Author's note on these removals: "Removing
# these ended up reducing performance".)
numeric_x_columns = list(numeric_data_tr.columns)
for excluded in ('count', 'windspeed', 'registered', 'casual'):
    numeric_x_columns.remove(excluded)
numeric_y_columns = ['count']

# Feature frame and target frame, both built from the NaN-filled numeric data.
numeric_tr_x_df, numeric_tr_y_df = (
    DataFrame(numeric_data_tr, columns=cols)
    for cols in (numeric_x_columns, numeric_y_columns)
)


In [None]:
# NOTE! Running this cell will reduce performance.
# Normalize the data! Surprisingly, this did not improve performance.
#
# Running this cell overwrites numeric_x_columns, numeric_tr_x_df and
# numeric_tr_y_df from the previous cell.
# NOTE(review): only 'count' is removed below, so windspeed, registered and
# casual stay in the feature list, while the Regression model's first layer
# expects 6 input features -- confirm the feature count matches before use.
# NOTE(review): mean/std are computed over every row of numeric_data_tr, i.e.
# train and test rows together, including the NaN-filled zeros.

# Standardize each column to zero mean and unit standard deviation.
numeric_data_tr_norm = (numeric_data_tr - numeric_data_tr.mean()) / numeric_data_tr.std()
numeric_x_columns = list(numeric_data_tr_norm.columns)
numeric_x_columns.remove('count')
#numeric_x_columns.remove('windspeed')
#numeric_x_columns.remove('registered')
#numeric_x_columns.remove('casual')

numeric_tr_x_df = DataFrame(numeric_data_tr_norm, columns=numeric_x_columns)
# The result of head() is discarded here (not the cell's last expression).
numeric_tr_x_df.head()
numeric_y_columns = ['count']
# The target is taken from the un-normalized frame, so 'count' keeps its
# original scale.
numeric_tr_y_df = DataFrame(numeric_data_tr, columns=numeric_y_columns)


# **Define DataModule for PL**

In [None]:
class BikeSharingDemandDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing rows of a feature frame and a target frame.

    Both frames are converted to float tensors once, at construction time;
    item ``idx`` is the pair of rows found at position ``idx``.
    """

    def __init__(self, csv_file_x, csv_file_y):
        # The attribute names are historical: they hold tensors, not files.
        features, targets = (
            torch.tensor(frame.values, dtype=torch.float)
            for frame in (csv_file_x, csv_file_y)
        )
        self.csv_file_x = features
        self.csv_file_y = targets

    def __len__(self):
        """Return the number of rows in the feature tensor."""
        return self.csv_file_x.shape[0]

    def __getitem__(self, idx):
        """Return ``{'x': features, 'y': target}`` for position ``idx``."""
        return dict(x=self.csv_file_x[idx], y=self.csv_file_y[idx])


class BikeSharingDemandDataModule(pl.LightningDataModule):
    """Lightning data module serving the train / validation / test splits.

    Rows are taken by position from the module-level ``numeric_tr_x_df`` and
    ``numeric_tr_y_df`` frames: the first ``num_train`` rows are shuffled and
    divided into a training and a validation set, and every remaining row
    forms the test set.

    Keyword Args:
        batch_size: Batch size handed to every ``DataLoader``.
        num_workers: Worker count handed to every ``DataLoader`` (default 0).
        val_ratio: Fraction of the first ``num_train`` rows held out for
            validation; must lie in [0, 1].
        num_train: Number of leading rows that come from the training file
            (default 10886).

    Any other keyword arguments are ignored.
    """

    def __init__(self, **kwargs):
        super().__init__()

        self.batch_size = kwargs.get('batch_size')
        self.num_workers = kwargs.get('num_workers', 0)
        self.val_ratio = kwargs.get('val_ratio')

        error_msg = "[!] val_ratio should be in the range [0, 1]."
        assert 0 <= self.val_ratio <= 1, error_msg

        num_train = kwargs.get('num_train', 10886)
        num_total = len(numeric_tr_x_df)

        # Shuffle the training positions with NumPy's global RNG, then carve
        # the first `split` of them off as the validation set.
        indices = list(range(num_train))
        split = int(np.floor(self.val_ratio * num_train))
        np.random.shuffle(indices)
        train_idx, valid_idx = indices[split:], indices[:split]

        # Training rows occupy positions 0 .. num_train - 1, so the test rows
        # start at position num_train. (Starting at num_train + 1 would
        # silently drop the first test row.)
        test_idx = list(range(num_train, num_total))

        self.dataset_tr = BikeSharingDemandDataset(
            numeric_tr_x_df.iloc[train_idx], numeric_tr_y_df.iloc[train_idx])
        self.dataset_val = BikeSharingDemandDataset(
            numeric_tr_x_df.iloc[valid_idx], numeric_tr_y_df.iloc[valid_idx])
        self.dataset_te = BikeSharingDemandDataset(
            numeric_tr_x_df.iloc[test_idx], numeric_tr_y_df.iloc[test_idx])

    def train_dataloader(self):
        """Return the loader over the training split."""
        return DataLoader(self.dataset_tr, batch_size=self.batch_size, num_workers=self.num_workers)

    def val_dataloader(self):
        """Return the loader over the validation split (never shuffled)."""
        return DataLoader(self.dataset_val, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False)

    def test_dataloader(self):
        """Return the loader over the test split (never shuffled)."""
        return DataLoader(self.dataset_te, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False)

# **Define Model**

In [None]:
class Regression(pl.LightningModule):
    """Fully connected regressor trained with the RMSLE loss.

    The network is four linear layers (input -> 100 -> 200 -> 200 -> 1), each
    followed by a ReLU. Validation and test report the RMSLE computed over
    the whole epoch rather than a mean of per-batch values.

    Keyword Args:
        lr: Learning rate for the Adam optimizer.
        input_dim: Number of input features (default 6).

    Any other keyword arguments are ignored by the model itself; all keyword
    arguments are recorded via ``save_hyperparameters()``.
    """

    def __init__(self, **kwargs):
        super().__init__()
        self.lr = kwargs.get('lr')

        self.linear1 = nn.Linear(kwargs.get('input_dim', 6), 100)
        self.linear2 = nn.Linear(100, 200)
        self.linear3 = nn.Linear(200, 200)
        self.linear4 = nn.Linear(200, 1)
        self.relu = nn.ReLU()
        self.loss = RMSLELoss()

        self.save_hyperparameters()

    def forward(self, x):
        """Return one prediction per input row."""
        y_pred = self.relu(self.linear1(x))
        y_pred = self.relu(self.linear2(y_pred))
        y_pred = self.relu(self.linear3(y_pred))
        # The ReLU on the output keeps predictions >= 0, so the logarithm in
        # the loss is always defined.
        y_pred = self.relu(self.linear4(y_pred))
        return y_pred

    def training_step(self, batch, batch_idx):
        """Compute and log the RMSLE of one training batch."""
        x, y = batch['x'], batch['y']
        y_hat = self(x)
        loss = self.loss(y_hat, y)
        self.log('Training loss', loss.item())
        return loss

    def _reset_epoch_stats(self):
        """Clear the accumulators used for the epoch-level RMSLE."""
        self.losses = []
        self.num_samples = 0

    def _accumulate_squared_log_error(self, batch):
        """Record one batch's summed squared log error and sample count.

        Returns the per-element squared log error tensor.
        """
        x, y = batch['x'], batch['y']
        y_hat = self(x)
        loss = torch.pow(torch.log1p(y_hat) - torch.log1p(y), 2)
        self.num_samples += x.size(0)
        self.losses.append(loss.sum().item())
        return loss

    def _epoch_rmsle(self):
        """Return the RMSLE over every sample accumulated this epoch."""
        return np.sqrt(np.sum(self.losses) / self.num_samples)

    def on_validation_start(self):
        self._reset_epoch_stats()

    def validation_step(self, batch, batch_idx):
        """Accumulate the squared log error of one validation batch."""
        return self._accumulate_squared_log_error(batch)

    def on_validation_epoch_end(self):
        self.log('Validation loss', self._epoch_rmsle())

    def on_test_start(self):
        self._reset_epoch_stats()

    def test_step(self, batch, batch_idx):
        """Accumulate the squared log error of one test batch."""
        return self._accumulate_squared_log_error(batch)

    def on_test_epoch_end(self):
        self.log('Test loss', self._epoch_rmsle())

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters."""
        return torch.optim.Adam(self.parameters(), lr=self.lr)

# **Define Training Configuration**

In [None]:
# Single configuration dict; it is unpacked as keyword arguments into both the
# model and the data module, and read directly when building the Trainer.
dict_args = {
    'dataloader': BikeSharingDemandDataModule,  # class instantiated as the data module
    'load': None,  # checkpoint path for Regression.load_from_checkpoint, or None
    'resume_from_checkpoint': None,  # only referenced by a commented-out Trainer argument
    'batch_size': 32,  # DataLoader batch size
    'epoch': 50,  # Trainer max_epochs
    'num_workers': 0, # DataLoader num_workers
    'val_freq': 0.5, # Trainer val_check_interval
    'logdir': './logs',  # root directory for TensorBoard logs and checkpoints
    'lr': 0.001, # Adam learning rate
    'display_freq': 64,  # Trainer log_every_n_steps
    'seed': 42, # passed to pl.seed_everything
    'clip_grad_norm': 0, # Trainer gradient_clip_val
    'val_ratio': 0.2  # fraction of the training rows held out for validation
}

In [None]:
# Seed first: model initialisation and the data split below both draw random
# numbers.
pl.seed_everything(dict_args['seed'])

# Model: restore from a checkpoint when one is configured, else start fresh.
model = (
    Regression.load_from_checkpoint(dict_args['load'], **dict_args)
    if dict_args['load'] is not None
    else Regression(**dict_args)
)

# Logging paths: a month-day-hour-minute-second stamp names this run.
now = datetime.datetime.now().strftime('%m%d%H%M%S')
print(now)
weight_save_dir = os.path.join(dict_args["logdir"], 'models', 'state_dict', now)
os.makedirs(weight_save_dir, exist_ok=True)

# Callback: keep the five checkpoints with the lowest "Validation loss".
checkpoint_callback = ModelCheckpoint(
    dirpath=weight_save_dir,
    monitor="Validation loss",
    mode="min",
    save_top_k=5,
    verbose=True,
)

# Data: build the data module class named in the configuration.
data_module = dict_args['dataloader'](**dict_args)

# Trainer: TensorBoard logging plus the schedule taken from dict_args.
logger = TensorBoardLogger(
    save_dir=dict_args['logdir'],
    name='lightning_logs',
    version=now,
    log_graph=True,
)
trainer = pl.Trainer(
    logger=logger,
    callbacks=[checkpoint_callback],
    max_epochs=dict_args["epoch"],
    val_check_interval=dict_args['val_freq'],
    log_every_n_steps=dict_args["display_freq"],
    gradient_clip_val=dict_args['clip_grad_norm'],
    deterministic=True,
    #resume_from_checkpoint=dict_args['resume_from_checkpoint']
)

# **Train the model**

In [None]:
# Train the model on the data module's training split, validating at the
# interval configured on the trainer.
trainer.fit(model, data_module)

# **Test the model**

In [None]:
# Evaluate on the data module's test split, loading the 'best' checkpoint.
# NOTE(review): test rows get their 'count' from the NaN -> 0 fill in the
# preprocessing cell if the test CSV has no 'count' column -- confirm the
# reported test loss is meaningful.
trainer.test(model, ckpt_path='best', datamodule=data_module)