# Training

In [93]:
# Import our model
from model import MVP

# Libraries for the data
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

# Get the sector tickers
from sector import *

# Process the data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
tks = get_ticker_dict(sectors)
descs = get_desc_dict(sectors)

Get our data

In [3]:
sectors_data = {}
for k in tks.keys():
    sectors_data[k] = pd.read_csv('data/sectors/TA/{}_7yr_daily.csv'.format(k))
    sectors_data[k].index = pd.to_datetime(sectors_data[k]["date"])
    sectors_data[k].drop(['ticker', 'descr', 'date'], axis=1, inplace=True)

In [4]:
sp5 = pd.read_csv('data/sectors/SP500_7yr_daily.csv')
sp5.index = pd.to_datetime(sp5["date"])
sp5.drop(['ticker', 'descr', 'date'], axis=1, inplace=True)
labels = pd.read_csv('data/sectors/sector_labels.csv')
labels.index = pd.to_datetime(labels["date"])
labels.drop('date', axis=1, inplace=True)
labels.drop('Unnamed: 0', axis=1, inplace=True)

In [5]:
ordering = []
for k in sectors_data.keys():
    ordering += [k]

In [63]:
'''
converts a dataframe to be valid input data and labels by lining up dates and adding a row of data for each sector (and the S&P)
'''
def data_to_XY(sectors, sp5, labels):
    closest_date = None
    
    # 1 entry for every month date
    stacks = len(labels)
    # Number of stocks (sectors & the SP500)
    stack_height = len(sectors.keys()) + 1 
    # 1 col for every feature (subtract ticker column and description column)
    stack_width = len(sectors["XLB"].columns)
    
    X = torch.zeros((stacks, stack_height, stack_width), dtype=torch.float64)
    
    # Go through dates and gather rows of each sector & sp5
    for i, date in enumerate(labels.index):
        newdf = sp5.index[sp5.index <= date]

        ### Add SP5 data to row
        # Get the last date before the new month
        closest_date = newdf.values[-1]
        
        # Do i need to deep copy here?
        row = torch.zeros((stack_width))
        row[0:2] = torch.tensor(sp5.iloc[sp5.index == closest_date].values[0])
        X[i,0] = row
        
        row_counter = 0
        ### Add sectors' data to row
        for k in sectors.keys():
            sector = sectors[k]
            row_counter += 1
            sector_row = torch.zeros((stack_width))
            
            # Already have the last date, make sure the sectors align with that date (have a row of information for each date)
            # If the sector started at a date beyond our last date, ignore it (all 0s)
            if sector.index.values[0] > closest_date:
                # print("No {} data at/before date: {}... settings to 0s".format(k, closest_date))
                X[i,row_counter] = sector_row
            else:
                
                # Should expect to have data here. If not, this is a problem (has SP5 data and previous sector data, but this would imply a hole in the data)
                if len(sector.iloc[sector.index == closest_date]) != 1:
                    print("ERROR: No {} data for date {} where expected. Using previous day of {}.".format(k, closest_date, sector.iloc[sector.index < closest_date].index[-1]))
                    sector_row = torch.tensor(sector.iloc[sector.index < closest_date].values[-1,:])
                else:
                    sector_row = torch.tensor(sector.iloc[sector.index == closest_date].values[0])
                
                X[i,row_counter] = sector_row
            
    # Prepare the labels
    Y = torch.zeros((labels.shape[0], labels.shape[1]), dtype=torch.float64)
    for i, col in enumerate(labels.columns):
        index = ordering.index(descs[col])
        Y[:,index] = torch.tensor(labels[col].values)
    
    return X, Y
    
X, Y = data_to_XY(sectors_data, sp5, labels)

ERROR: No XLP data for date 2015-06-01T00:00:00.000000000 where expected. Using previous day of 2015-05-29 00:00:00.
2015-05-29 00:00:00
ERROR: No XLP data for date 2015-10-01T00:00:00.000000000 where expected. Using previous day of 2015-09-30 00:00:00.
2015-09-30 00:00:00
ERROR: No XLRE data for date 2017-09-01T00:00:00.000000000 where expected. Using previous day of 2017-08-31 00:00:00.
2017-08-31 00:00:00
ERROR: No XLB data for date 2017-09-29T00:00:00.000000000 where expected. Using previous day of 2017-09-28 00:00:00.
2017-09-28 00:00:00
ERROR: No XLP data for date 2018-06-01T00:00:00.000000000 where expected. Using previous day of 2018-05-31 00:00:00.
2018-05-31 00:00:00
ERROR: No XLF data for date 2018-06-29T00:00:00.000000000 where expected. Using previous day of 2018-06-28 00:00:00.
2018-06-28 00:00:00
ERROR: No XLF data for date 2018-08-01T00:00:00.000000000 where expected. Using previous day of 2018-07-31 00:00:00.
2018-07-31 00:00:00
ERROR: No XLRE data for date 2019-05-01T

### Examine Data
Our X data consists of 3 dimensions: (monthly data points, each sector of the S&P500 and the S&P500 itself, TA features for each stock)

In [10]:
X.shape

(82, 12, 94)

Our Y data consists of 2 dimensions (monthly data points, 1 value for each of the sectors). Note that the sector dimension is 1 less than the X data (no labels for S&P500 data)

In [17]:
Y.shape

torch.Size([82, 11])

### Train/Test Split

In [66]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=226)

### Scale our data

In [67]:
sclr = StandardScaler()
num_instances, num_stocks, num_features = X_train.shape
X_train = X_train.reshape((-1, num_features))
X_train = sclr.fit_transform(X_train)

In [68]:
# Return X_train data to how it was
X_train = torch.from_numpy(X_train.reshape((num_instances, num_stocks, num_features)))

In [90]:
# This will be for testing
num_instances, num_stocks, num_features = X_test.shape
X_test = X_test.reshape((-1, num_features))
X_test = sclr.transform(X_test)
X_test = torch.from_numpy(X_test.reshape((num_instances, num_stocks, num_features)))

### Dataset and Dataloader

In [88]:
class StockDataset(torch.utils.data.Dataset):
  def __init__(self, X, Y):
    self.X = X
    self.Y = Y

  def __len__(self):
    return len(self.X)

  def __getitem__(self, index):
    return self.X[index], self.Y[index]

In [104]:
train_dataset = StockDataset(X_train, y_train)
test_dataset = StockDataset(X_test, y_test)
params = {'batch_size': 1,
          'shuffle': True,
          'num_workers': 0}
train_dataloader = torch.utils.data.DataLoader(train_dataset, **params)
test_dataloader = torch.utils.data.DataLoader(test_dataset, **params)

In [120]:
# Input Dimension is the number of features * number of stocks (because it will be flattened),
# Output Dimension is the number of stocks - 1 (because only predicting if each stock will over/underperform the S&P500, -1 because S&P500 included in data)
# Output Dimension is 1, indicating we want 1 value to determine if each stock will over/underperform the S&P500
model = MVP(X.shape[1] * X.shape[2], Y.shape[1])
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

In [122]:
def train(model, optimizer, epochs=50, ret_loss=False):

  criterion = nn.CrossEntropyLoss()
  loss_data = torch.zeros((epochs, len(train_dataloader)))
  for e in range(epochs):

    run_loss, ct = 0, 0
    batch_loss = 0.0
    for i, data in enumerate(train_dataloader, 0):
      X, Y = data
      X = X.float()
      optimizer.zero_grad()

      pred = model(X)
      loss = criterion(pred.reshape(-1), Y.reshape(-1))
      loss.backward()
      optimizer.step()

      # print statistics
      l = loss.item()
      loss_data[e,i] = l
      batch_loss += l
      run_loss += l
      ct += 1
      if i % 10 == 9:    # print every 10 mini-batches
          print(f'[{e + 1}, {i + 1:5d}] loss: {batch_loss / 2000:.3f}')
          batch_loss = 0.0
    print(f"Epoch loss: {run_loss / len(train_dataloader):.3f}")
  print('Finished Training')
  if ret_loss:
    return loss_data

In [123]:
train(model, optimizer)

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [86]:
out = model(X_train[0].float())