In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import os
import datetime
from sklearn.preprocessing import StandardScaler

class LinearRegression(nn.Module):
    def __init__(self, input_dim):
        super(LinearRegression, self).__init__()
        self.linear = nn.Linear(input_dim, 1, bias=False)
    
    def forward(self, x):
        return self.linear(x)

class Lasso(nn.Module):
    def __init__(self, input_dim, alpha=1.0):
        super(Lasso, self).__init__()
        self.linear = nn.Linear(input_dim, 1, bias=False)
        self.alpha = alpha
    
    def forward(self, x):
        return self.linear(x)
    
    def l1_loss(self):
        return self.alpha * torch.sum(torch.abs(self.linear.weight))

class Ridge(nn.Module):
    def __init__(self, input_dim, alpha=1.0):
        super(Ridge, self).__init__()
        self.linear = nn.Linear(input_dim, 1, bias=False)
        self.alpha = alpha
    
    def forward(self, x):
        return self.linear(x)
    
    def l2_loss(self):
        return self.alpha * torch.sum(self.linear.weight ** 2)

class ElasticNet(nn.Module):
    def __init__(self, input_dim, alpha=1.0, l1_ratio=0.5):
        super(ElasticNet, self).__init__()
        self.linear = nn.Linear(input_dim, 1, bias=False)
        self.alpha = alpha
        self.l1_ratio = l1_ratio
    
    def forward(self, x):
        return self.linear(x)
    
    def elastic_net_loss(self):
        l1 = self.l1_ratio * torch.sum(torch.abs(self.linear.weight))
        l2 = (1 - self.l1_ratio) * torch.sum(self.linear.weight ** 2)
        return self.alpha * (l1 + l2)

def train_model(model, X, y, epochs=1000, lr=0.01, device='cpu'):
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    
    model = model.to(device)
    X = X.to(device)
    y = y.to(device)
    
    for epoch in range(epochs):
        y_pred = model(X)
        loss = criterion(y_pred, y)
        
        if isinstance(model, Lasso):
            loss += model.l1_loss()
        elif isinstance(model, Ridge):
            loss += model.l2_loss()
        elif isinstance(model, ElasticNet):
            loss += model.elastic_net_loss()
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

def predict(model, X, device='cpu'):
    model.eval()
    with torch.no_grad():
        X = X.to(device)
        return model(X).cpu().numpy()

if __name__ == "__main__":
    # Check if CUDA is available
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    print(datetime.datetime.now())

    # Set working directory and read data
    work_dir = "."
    raw = pd.read_csv(os.path.join(work_dir, "hackathon_sample_v2.csv"), parse_dates=["date"], low_memory=False)
    stock_vars = list(pd.read_csv(os.path.join(work_dir, "factor_char_list.csv"))["variable"].values)
    ret_var = "stock_exret"
    new_set = raw[raw[ret_var].notna()].copy()

    # Transform variables
    data = pd.DataFrame()
    for _, monthly_raw in new_set.groupby("date"):
        group = monthly_raw.copy()
        for var in stock_vars:
            group[var] = group[var].fillna(group[var].median())
            group[var] = (group[var].rank(method="dense") - 1) / (group[var].rank(method="dense").max() - 1) * 2 - 1
        data = data._append(group, ignore_index=True)

    # Initialize variables
    starting = pd.to_datetime("20000101", format="%Y%m%d")
    counter = 0
    pred_out = pd.DataFrame()

    while (starting + pd.DateOffset(years=11 + counter)) <= pd.to_datetime("20240101", format="%Y%m%d"):
        cutoff = [
            starting,
            starting + pd.DateOffset(years=8 + counter),
            starting + pd.DateOffset(years=10 + counter),
            starting + pd.DateOffset(years=11 + counter),
        ]

        train = data[(data["date"] >= cutoff[0]) & (data["date"] < cutoff[1])]
        validate = data[(data["date"] >= cutoff[1]) & (data["date"] < cutoff[2])]
        test = data[(data["date"] >= cutoff[2]) & (data["date"] < cutoff[3])]

        scaler = StandardScaler().fit(train[stock_vars])
        train[stock_vars] = scaler.transform(train[stock_vars])
        validate[stock_vars] = scaler.transform(validate[stock_vars])
        test[stock_vars] = scaler.transform(test[stock_vars])

        X_train = torch.FloatTensor(train[stock_vars].values)
        Y_train = torch.FloatTensor(train[ret_var].values).unsqueeze(1)
        X_val = torch.FloatTensor(validate[stock_vars].values)
        Y_val = torch.FloatTensor(validate[ret_var].values).unsqueeze(1)
        X_test = torch.FloatTensor(test[stock_vars].values)

        Y_mean = Y_train.mean().item()
        Y_train_dm = Y_train - Y_mean

        reg_pred = test[["year", "month", "date", "permno", ret_var]]

        # Linear Regression
        model = LinearRegression(len(stock_vars))
        train_model(model, X_train, Y_train_dm, device=device)
        reg_pred["ols"] = predict(model, X_test, device) + Y_mean

        # Lasso
        best_mse = float('inf')
        best_lambda = 0
        for lambda_ in np.logspace(-4, 4, 81):
            model = Lasso(len(stock_vars), alpha=lambda_)
            train_model(model, X_train, Y_train_dm, device=device)
            val_pred = predict(model, X_val, device) + Y_mean
            mse = ((Y_val.numpy() - val_pred) ** 2).mean()
            if mse < best_mse:
                best_mse = mse
                best_lambda = lambda_
        
        model = Lasso(len(stock_vars), alpha=best_lambda)
        train_model(model, X_train, Y_train_dm, device=device)
        reg_pred["lasso"] = predict(model, X_test, device) + Y_mean

        # Ridge
        best_mse = float('inf')
        best_lambda = 0
        for lambda_ in np.logspace(-1, 8, 91):
            model = Ridge(len(stock_vars), alpha=lambda_ * 0.5)
            train_model(model, X_train, Y_train_dm, device=device)
            val_pred = predict(model, X_val, device) + Y_mean
            mse = ((Y_val.numpy() - val_pred) ** 2).mean()
            if mse < best_mse:
                best_mse = mse
                best_lambda = lambda_
        
        model = Ridge(len(stock_vars), alpha=best_lambda * 0.5)
        train_model(model, X_train, Y_train_dm, device=device)
        reg_pred["ridge"] = predict(model, X_test, device) + Y_mean

        # Elastic Net
        best_mse = float('inf')
        best_lambda = 0
        for lambda_ in np.logspace(-4, 4, 81):
            model = ElasticNet(len(stock_vars), alpha=lambda_)
            train_model(model, X_train, Y_train_dm, device=device)
            val_pred = predict(model, X_val, device) + Y_mean
            mse = ((Y_val.numpy() - val_pred) ** 2).mean()
            if mse < best_mse:
                best_mse = mse
                best_lambda = lambda_
        
        model = ElasticNet(len(stock_vars), alpha=best_lambda)
        train_model(model, X_train, Y_train_dm, device=device)
        reg_pred["en"] = predict(model, X_test, device) + Y_mean

        pred_out = pred_out._append(reg_pred, ignore_index=True)
        counter += 1

    out_path = os.path.join(work_dir, "output.csv")
    print(out_path)
    pred_out.to_csv(out_path, index=False)

    yreal = pred_out[ret_var].values
    for model_name in ["ols", "lasso", "ridge", "en"]:
        ypred = pred_out[model_name].values
        r2 = 1 - np.sum(np.square((yreal - ypred))) / np.sum(np.square(yreal))
        print(model_name, r2)

    print(datetime.datetime.now())

Using device: cuda:0
2024-09-19 22:09:20.197879


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reg_pred["ridge"] = predict(model, X_test, device) + Y_mean
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reg_pred["en"] = predict(model, X_test, device) + Y_mean
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[stock_vars] = scaler.transform(train[stock_vars])
A value is trying to be set on a

.\output.csv
ols -0.04622827546802166
lasso -0.004722764993619677
ridge 0.007212463159129268
en 0.002862624254661439
2024-09-19 23:01:29.812045


In [2]:
df = pd.read_csv('./output.csv')

In [3]:
df

Unnamed: 0,year,month,date,permno,stock_exret,ols,lasso,ridge,en
0,2010,1,2010-01-29,10104,-0.057888,0.000424,0.004420,0.002404,0.002426
1,2010,1,2010-01-29,10107,-0.075459,0.015691,0.000337,0.007611,0.010942
2,2010,1,2010-01-29,10137,-0.107751,-0.002534,0.000902,0.001287,-0.005883
3,2010,1,2010-01-29,10138,-0.068169,-0.011472,-0.010292,0.000826,0.006086
4,2010,1,2010-01-29,10143,0.101739,-0.000621,-0.016388,-0.000750,0.005567
...,...,...,...,...,...,...,...,...,...
158975,2023,12,2023-12-29,93330,-0.022199,0.000503,0.019592,0.006229,-0.004884
158976,2023,12,2023-12-29,93356,0.120845,0.003928,-0.007402,0.008144,0.019635
158977,2023,12,2023-12-29,93369,-0.005245,0.004917,0.001717,0.007141,-0.001952
158978,2023,12,2023-12-29,93374,0.085801,0.012367,0.020208,0.006956,-0.005012
