In [1]:
import os
import pandas as pd
import pytorch_lightning as pl
import torch
import torch.nn.functional as F
import torch.optim as optim

from argparse import Namespace
from torch import nn
from torch.utils.data import Dataset, DataLoader, random_split, Subset

# Model

In [2]:
class TestDataset(Dataset):
    def __init__(self, data_path):
        self.targets_df = pd.read_feather('data/f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_text_norm.feather')
        
    def __len__(self):
        return len(self.targets_df)
    
    def __getitem__(self, idx):
        line = self.targets_df.iloc[idx]
        
        # t
        car_0_30 = line.car_0_30
        car_0_30_norm = line.car_0_30_norm
        
        # x
        sue = line.sue
        sest = line.sest
        alpha = line.alpha
        volatility = line.volatility
        mcap = line.mcap/1e6
        bm = line.bm
        roa = line.roa
        debt_asset = line.debt_asset
        numest = line.numest
        smedest = line.smedest
        sstdest = line.sstdest
        car_m1_m1 = line.car_m1_m1
        car_m2_m2 = line.car_m2_m2
        car_m30_m3 = line.car_m30_m3
        volume = line.volume

        return torch.tensor(car_0_30,dtype=torch.float32), \
               torch.tensor(car_0_30_norm,dtype=torch.float32), \
               torch.tensor([alpha, car_m1_m1, car_m2_m2, car_m30_m3, sest, sue, numest, sstdest, smedest, mcap, roa, bm, debt_asset, volatility, volume], dtype=torch.float32)

In [3]:
ds = TestDataset(data_path='data/f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_text_norm.feather')

In [4]:
class Test(pl.LightningModule):
    def __init__(self, hparams):
        super().__init__()
        
        hparams = Namespace(**hparams)
        
        self.hparams = hparams
        
        # only one layer
        self.hidden_1 = nn.Linear(15, 64)
        self.hidden_2 = nn.Linear(64, 64)
        self.output = nn.Linear(64, 1)
        
    def prepare_data(self):
        full_dataset = TestDataset(self.hparams.data_path)
        n_dataset = len(full_dataset)
        n_train = int(n_dataset*0.8)
        n_val = n_dataset - n_train
        
        train_idx = range(n_train)
        val_idx = range(n_train, len(full_dataset))
        
        assert len(train_idx)+len(val_idx)==len(full_dataset)
        
        print(f'n_train={n_train}, n_val={n_val}')
        
        self.train_dataset = Subset(full_dataset, train_idx)
        self.val_dataset = Subset(full_dataset, val_idx)
        self.test_dataset = self.train_dataset
        
    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.hparams.batch_size, shuffle=True, drop_last=True, num_workers=0, pin_memory=True)
        
    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.hparams.val_batch_size, shuffle=False, drop_last=True, num_workers=0, pin_memory=True)
    
    def test_dataloader(self):
        return DataLoader(self.test_dataset, num_workers=0, pin_memory=True)
    
    def forward(self, fin_ratio):
        x = self.hidden_1(fin_ratio)
        x = self.hidden_2(x)
        y = self.output(x)
        
        return y
    
    def loss(self, y, t):
        return F.mse_loss(y, t)
    
    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=self.hparams.learning_rate)
        return optimizer   
    
    def training_step(self, batch, idx):
        car, car_norm, fin_ratio = batch
        
        y_car = self.forward(fin_ratio)
        loss = self.loss(y_car, car.unsqueeze(-1))
        
        return {'loss': loss, 'log': {'train_loss': loss}}
    
    def validation_step(self, batch, idx):
        car, car_norm, fin_ratio = batch
        
        y_car = self.forward(fin_ratio)
        loss = self.loss(y_car, car.unsqueeze(-1))
        
        return {'val_loss': loss}
    
    def test_step(self, batch, idx):
        car, car_norm, fin_ratio = batch
        
        y_car = self.forward(fin_ratio)
        loss = self.loss(y_car, car.unsqueeze(-1))
        
        return {'test_loss': loss}
    
    
    def validation_epoch_end(self, outputs):
        mse = torch.stack([x['val_loss'] for x in outputs]).mean()
        rmse = torch.sqrt(mse)
        return {'val_loss': mse, 'log': {'val_rmse': rmse}}   
    
    def test_epoch_end(self, outputs):
        mse = torch.stack([x['test_loss'] for x in outputs]).mean()
        rmse = torch.sqrt(mse)

        return {'test_loss': mse, 'log': {'test_rmse': rmse}, 'progress_bar':{'test_rmse': rmse}}

# Trainer

In [19]:
# hparams
model_hparams = {
    'data_path': 'data/f_sue_keydevid_car_finratio_vol_transcriptid_sim_inflow_revision_text_norm.feather',
    'learning_rate': 1e-3,
    'val_batch_size': 16, 
    'batch_size': 128
}

train_hparams = {
    'max_epochs': 2,
    'note': 'car',
    'save_top_k': 5,
    'ckpt_period': 1, # save every epoch
    'ckpt_path': 'd:/Checkpoints/earnings-call/test_ckpt'
}

# logger
logger = pl.loggers.CometLogger(
    api_key=os.getenv('COMET_API_KEY'),
    project_name='earnings-call',
    workspace='amiao',
    display_summary_level=0)

# checkpoint
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    verbose=True,
    mode='min',
    monitor='val_loss',
    filepath=train_hparams['ckpt_path'],
    prefix=f"{train_hparams['note']}_",
    save_top_k=train_hparams['save_top_k'],
    period=train_hparams['ckpt_period'])

    # logger

# trainner
trainer = pl.Trainer(
    checkpoint_callback=checkpoint_callback,
    gpus=-1, 
    progress_bar_refresh_rate=2, 
    distributed_backend='dp', 
    max_epochs=train_hparams['max_epochs'], 
    logger=logger)

# init model
model = Test(model_hparams)

# upload logger
logger.experiment.log_parameters(model_hparams)
logger.experiment.log_parameters(train_hparams)

# fit
trainer.fit(model)

CometLogger will be initialized in online mode
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0,1]
COMET INFO: Experiment is live on comet.ml https://www.comet.ml/amiao/earnings-call/36ddc6b9ef1c4fd6a63c187f587cb21e


  | Name     | Type   | Params
------------------------------------
0 | hidden_1 | Linear | 1 K   
1 | hidden_2 | Linear | 4 K   
2 | output   | Linear | 65    


n_train=17410, n_val=4353


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…


Epoch 00000: val_loss reached 156.67194 (best 156.67194), saving model to d:/Checkpoints/earnings-call/test_ckpt\car_epoch=0.ckpt as top 5


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…


Epoch 00001: val_loss reached 153.81688 (best 153.81688), saving model to d:/Checkpoints/earnings-call/test_ckpt\car_epoch=1.ckpt as top 5





COMET INFO: Uploading stats to Comet before program termination (may take several seconds)


1

In [23]:
trainer.test(ckpt_path='best')

FileNotFoundError: [Errno 2] No such file or directory: 'd:/Checkpoints/earnings-call/test_ckpt\\car_epoch=1.ckpt'