# A Hybrid Learning Approach to Synthetic Position Construction for Tax Loss Harvesting

# Generation of Synthetic Positions Using Hybrid Learning

In [179]:
import pickle
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import SparsePCA
import statsmodels.api as sm
import operator
from deprecated import deprecated

# Pickle locations for the (deprecated) full correlation matrix and its labels
corr_matrix_file_path, x_labels_file_path = (
    "pickles/corr_matrix.obj",
    "pickles/X_labels.obj",
)
# Pickle locations for the composition and price-history data loaded below
sptm_comp_file_path, sptm_price_file_path = (
    "pickles/sptm_composition.obj",
    "pickles/sptm_price.obj",
)

In [180]:
# Composition data from pickle.
# NOTE: pickle.load executes arbitrary code; only load files you created yourself.
# The context manager guarantees the handle is closed even if unpickling fails.
with open(sptm_comp_file_path, 'rb') as sptm_comp_file:
    sptm_composition = pickle.load(sptm_comp_file)

# Price history downloaded from Yahoo Finance and stored in pickle
with open(sptm_price_file_path, 'rb') as sptm_price_file:
    sptm_price_history = pickle.load(sptm_price_file)

all_tickers = sptm_composition.keys()

# Gets a list of all valid trading dates (taken from the index of the first ticker's price history)
all_dates = sptm_price_history[list(all_tickers)[0]].axes[0].values
dates = pd.DatetimeIndex(data=all_dates)
current_date = all_dates[0]

# 80/20 Train/Test Split (values below are positional indices into all_dates)
# Training Range: 2015-01-02 - 2020-08-10
# Testing Range: 2020-08-11 - 2021-12-31
train_start_date = 0
train_end_date = 1410
test_start_date = 1411
test_end_date = 1762

# Choose how many days should be used to validate the hyperparameters
num_validation_days = 30

### Finding Highly Correlated Stocks
#### Use the *do_corr_matrix_regen* variable to tell the code to generate the correlation matrix, else it will load it from a pickle

In [181]:
# Number of tickers in the composition, captured once when this cell runs
num_stocks = len(all_tickers)


@deprecated(reason="Switched to generating singular correlation lists on the fly")
def regenerate_corr_matrix(end_date, recent_days):
    """Calculates the correlation between every stock's percent-change time series and pickles the result

    Writes two pickles: the correlation matrix (a DataFrame whose columns are the tickers)
    to ``corr_matrix_file_path`` and the list of ticker labels to ``x_labels_file_path``.

    :param end_date: Positional index of the last date (exclusive) for which data should be used
    :param recent_days: How many days of price history should be used to calculate the correlation matrix
    """
    x_labels = list(all_tickers)
    x_vars = [
        sptm_price_history[ticker].Close.pct_change().iloc[end_date - recent_days:end_date]
        for ticker in x_labels
    ]

    # Pairwise correlation of every ticker against every other ticker
    correlation_matrix = pd.DataFrame(
        [[x_vars[row].corr(x_vars[col]) for col in range(num_stocks)] for row in range(num_stocks)],
        columns=x_labels[0:num_stocks])

    # Context managers make sure the files are closed even if pickling fails
    with open(corr_matrix_file_path, 'wb') as correlation_matrix_file:
        pickle.dump(correlation_matrix, correlation_matrix_file)

    with open(x_labels_file_path, 'wb') as x_labels_file:
        pickle.dump(x_labels, x_labels_file)


@deprecated(reason="Switched to generating singular correlation lists on the fly")
def load_corr_matrix():
    """Loads the correlation matrix and its labels from pickles

    Reads ``corr_matrix_file_path`` and ``x_labels_file_path``; both are expected to have
    been written by ``regenerate_corr_matrix``.

    :returns: tuple (correlation_matrix, x_labels)
    """
    # Correlation matrix from pickle (closed automatically, even on error)
    with open(corr_matrix_file_path, 'rb') as corr_matrix_file:
        correlation_matrix = pickle.load(corr_matrix_file)

    # X_Labels from pickle
    with open(x_labels_file_path, 'rb') as x_labels_file:
        x_labels = pickle.load(x_labels_file)

    return correlation_matrix, x_labels


def corr_list_for_single_stock(ticker, end_date, recent_days):
    """Correlates one stock's daily percent changes against every stock in the price history
    :param ticker: The stock to correlate everything else against
    :param end_date: Positional index of the last date (exclusive) of the window
    :param recent_days: How many days before end_date the window covers
    :returns: dict {ticker: correlation}, ordered from highest to lowest correlation
    """
    window = slice(end_date - recent_days, end_date)
    target_returns = sptm_price_history[ticker].Close.pct_change().iloc[window]

    unsorted_correlations = {
        other: target_returns.corr(sptm_price_history[other].Close.pct_change().iloc[window])
        for other in sptm_price_history.keys()
    }

    ranked = sorted(unsorted_correlations.items(), key=operator.itemgetter(1), reverse=True)
    return dict(ranked)


In [182]:
def filter_substantially_similar_securities(ticker, series, additional_stocks=None):
    """Filters out the stocks you cannot buy because of a wash sale
    :param ticker: Ticker to filter out securities for (it is always removed itself)
    :param series: The list of correlated stocks that should be filtered; it is filtered in place
    :param additional_stocks: Any additional stocks you want to filter out (default = None); not modified
    :returns: The same list object passed as ``series``, with the excluded tickers removed
    """
    # Work on a private set so the caller's additional_stocks list is never mutated
    excluded = set(additional_stocks) if additional_stocks is not None else set()
    excluded.add(ticker)

    # Handles Google having both GOOG and GOOGL in the dataset: the two share
    # classes are substantially identical, so selling one rules out buying the other
    if ticker in ("GOOG", "GOOGL"):
        excluded.update(("GOOG", "GOOGL"))

    # Slice assignment keeps the in-place semantics callers may rely on
    series[:] = [candidate for candidate in series if candidate not in excluded]

    return series


def find_top_correlated_stocks(correlation_list, ticker, n):
    """Picks the n best-correlated stocks that may legally replace a given stock
    :param correlation_list: Mapping of ticker -> correlation, already ordered best-first
    :param ticker: The ticker for which correlated stocks should be found
    :param n: How many correlated stocks to return
    :returns: A list of up to n tickers, most correlated first
    """
    candidates = list(correlation_list.keys())
    allowed = filter_substantially_similar_securities(ticker, candidates)
    return allowed[:n]

### Principal Component Analysis

In [183]:
def get_price_histories_dataframe(stocks, target_stock, end_date, recent_days):
    """Builds the feature and target DataFrames of closing prices for a date window
    :param stocks: The stocks to include as feature columns
    :param target_stock: The stock whose closing prices form the target column
    :param end_date: Positional index of the last date (exclusive) of the window
    :param recent_days: How many days of price history should be fetched
    :returns: (Features DataFrame, Target DataFrame)
    """
    window = slice(end_date - recent_days, end_date)

    feature_columns = {
        name: sptm_price_history[name].Close.values[window] for name in stocks
    }
    target = pd.DataFrame({target_stock: sptm_price_history[target_stock].Close.values[window]})
    features = pd.DataFrame(data=feature_columns)
    return features, target


def get_datasets(correlation_list, target_stock, n, end_date, recent_days):
    """Assembles the two datasets the model needs: feature prices and target prices
    :param correlation_list: Mapping of ticker -> correlation, ordered best-first
    :param target_stock: The stock for which the dataset should be created
    :param n: How many of the top correlated stocks to use as features
    :param end_date: Positional index of the last date (exclusive) of the window
    :param recent_days: How many days of price history should be fetched
    :returns: (x, y) where x holds the feature prices and y the target prices, both DataFrames
    """
    feature_tickers = find_top_correlated_stocks(correlation_list, target_stock, n)
    features, target = get_price_histories_dataframe(feature_tickers, target_stock, end_date, recent_days)
    return features, target

In [544]:
def predict_portfolio(pred_stock, n_components, n_days, end_date, n_stocks, is_sparse, regression_only = False):
    """Constructs a portfolio using the given hyperparameters

    Three modes, all returning ``{ticker: weight}``:

    * ``regression_only=True``: ordinary least squares of the target price on the raw
      feature prices; every coefficient is returned. No PCA is performed.
    * ``n_components == 1``: the loadings of the single principal component are returned
      unchanged (they may be negative).
    * otherwise: the target is regressed on the principal components and each stock's
      weight is the regression-coefficient-weighted sum of its loadings; only strictly
      positive weights are kept.

    The regressions are fitted WITHOUT an intercept. Earlier code called
    ``sm.add_constant`` but discarded its return value, so no constant was ever part of
    the model; that behaviour is preserved here.

    :param pred_stock: The stock to predict
    :param n_components: The number of principal components the model should use
    :param n_days: The number of days of data the model should consider
    :param end_date: The last date of data the model should use (positional index, exclusive)
    :param n_stocks: The number of stocks to be used by principal component analysis
    :param is_sparse: Whether PCA should be sparse or not
    :param regression_only: Do not perform any PCA
    :returns: dict mapping ticker to its weight in the synthetic position
    """
    correlation_list = corr_list_for_single_stock(pred_stock, end_date, n_days)

    # Get price history datasets
    x, y = get_datasets(correlation_list, pred_stock, n_stocks, end_date, n_days)

    # Plain regression needs neither scaling nor PCA, so answer before doing that work
    if regression_only:
        model = sm.OLS(y, x).fit()
        return {param: model.params[param] for param in model.params.index}

    # Standardize the features
    X = StandardScaler().fit_transform(x)

    # Do the PCA
    if is_sparse:
        pca = SparsePCA(n_components=n_components)
    else:
        pca = PCA(n_components=n_components)

    principal_components = pca.fit_transform(X)
    principal_df = pd.DataFrame(data=principal_components,
                                columns=["PC " + str(g) for g in range(0, n_components)])

    # Rows are stocks, integer-labelled columns are principal components
    loadings = pd.DataFrame(pca.components_.T, index=x.columns)

    if n_components == 1:
        return {c_stock: loadings.loc[c_stock, 0] for c_stock in loadings.index.values}

    # Linear regression of the target on the principal components
    model = sm.OLS(y, principal_df).fit()

    synthetic_position = {}
    for c_stock in loadings.index.values:
        # params is labelled "PC 0", "PC 1", ...; use .iloc for explicit positional access
        quantity = sum(model.params.iloc[pc] * loadings.loc[c_stock, pc] for pc in range(0, n_components))
        if quantity > 0:
            synthetic_position[c_stock] = quantity
    return synthetic_position

In [540]:
# sportfolios = simulate_stock('AAPL', test_start_date, test_start_date+30, train_end_date, 360, 2, 10, True, regression_only = True, graph=True)
# print(get_stats(sportfolios))


#when training days are higher, the p values are higher, but the model gets less MSE
#for regression only take into account how you can only use positive values, which is mostly solved by PCA (why?)
#if you wanted to be more accurate, then instead of cancelling the entire replacement if a stock is on the wash sale list, you can just recalculate without that stock (or just pass the wash sale list to the filterer to begin with)
#TODO above

In [539]:
def hybrid_learning_replace(ticker, cur_date) -> dict:
    """Determines which stocks to buy to emulate the performance of another

    Delegates to ``predict_portfolio`` with fixed hyperparameters: 2 principal components,
    a 90-day window ending at ``cur_date``, 15 candidate stocks, sparse PCA.

    :param ticker: The stock for which a synthetic position should be calculated
    :param cur_date: The date on which to find replacements
    :returns: Dictionary of replacement tickers and their quantities
    """
    # TODO: the hyperparameters are hard-coded here
    return predict_portfolio(ticker, 2, 90, cur_date, 15, True)

# Stock Market Backtester

If you want to run this simulator with different stocks and/or generate a new starting portfolio, do the following:
1. Create a CSV file with the format: Ticker, Quantity
2. Run **csvToComp.py** to generate text you can paste in as the *snp_composition* variable in **compToPickle.py**
3. Run **compToPickle.py**
4. Adjust the date range in **yfToPickle.py**
5. Run **yfToPickle.py**, which will take a while, since it is downloading all of the price histories from Yahoo Finance

In [187]:
import pandas as pd
import pyfolio as pf  #install using  pip3 install git+https://github.com/quantopian/pyfolio
from enum import Enum
import warnings
import pickle
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import random
%matplotlib inline

# Pickle locations for the backtester's starting composition and price history
snp_comp_file_path, snp_price_file_path = (
    "pickles/snp_composition.obj",
    "pickles/snp_price.obj",
)

In [188]:
# Composition data from pickle (handle is closed automatically, even if unpickling fails)
with open(snp_comp_file_path, 'rb') as snp_comp_file:
    snp_composition = pickle.load(snp_comp_file)

# Price history downloaded from Yahoo Finance and stored in pickle
# NOTE(review): this opens sptm_price_file_path, not snp_price_file_path. That looks
# deliberate (replacement candidates come from the sptm ticker universe), but confirm.
with open(sptm_price_file_path, 'rb') as snp_price_file:
    price_history = pickle.load(snp_price_file)

# Gets a list of all valid trading dates
# (all_tickers comes from the composition loaded in the hybrid-learning section)
all_dates = price_history[list(all_tickers)[0]].axes[0].values
dates = pd.DatetimeIndex(data=all_dates)

In [516]:
def get_stock_price(ticker, day, time) -> float:
    """Looks up a stock's opening or closing price, rounded to two decimals
    :param ticker: stock for which to retrieve the value
    :param day: date on which to retrieve the value
    :param time: which price to read, either Time.OPEN or Time.CLOSE
    :returns: the rounded price, or None when time is neither Time.OPEN nor Time.CLOSE
    """
    if time == Time.OPEN:
        column = "Open"
    elif time == Time.CLOSE:
        column = "Close"
    else:
        return None

    prices = getattr(price_history[ticker], column)
    return round(prices[day], 2)


class Time(Enum):
    """Selects which daily price a lookup should use."""

    # Price at the start of the trading day
    OPEN = 0
    # Price at the end of the trading day
    CLOSE = 1

class ReplacementGroup:
    """Records which replacement stocks currently stand in for one sold stock."""

    def __init__(self, stock_list, stock):
        """
        :param stock_list: the replacement holdings (the simulator passes {'Ticker': quantity})
        :param stock: ticker of the original stock that was replaced
        """
        # Original ticker and its stand-ins
        self.stock = stock
        self.stock_list = stock_list
        # Age of the group; starts at zero and is advanced by the owning portfolio
        self.counter = 0

class Stock:
    """An owned position in a single ticker, tracked by quantity and per-share cost."""

    def __init__(self, ticker, cur_date, q=0):
        """Creates an object representing an owned stock
        :param ticker: The stock's ticker symbol
        :param cur_date: The date whose opening price becomes the initial average cost
        :param q: How much of this stock is owned (this should only be used when constructing initial stock portfolios, as it essentially creates stock for free)
        """
        self.ticker = ticker
        self.quantity = q
        self.avg_cost = get_stock_price(self.ticker, cur_date, Time.OPEN)

    def buy(self, quantity, day_price):
        """Simulates a portfolio acquiring new stock
        :param quantity: The amount of new stock to acquire
        :param day_price: The price of the stock on the day of acquisition
        """
        previously_owned = self.quantity
        self.quantity += quantity
        if previously_owned == 0:
            # Nothing was held before, so the purchase price is the whole cost basis
            self.avg_cost = day_price
            return
        # Quantity-weighted average of the old basis and the new purchase.
        # (Assigned, not accumulated: adding this onto the old average would double-count it.)
        self.avg_cost = (previously_owned * self.avg_cost + quantity * day_price) / self.quantity

    def sell(self, q, day_price):
        """Simulates a portfolio selling a stock
        :param q: The amount of stock to sell
        :param day_price: The price of the stock on the day of the sale
        """
        previously_owned = self.quantity
        self.quantity -= q
        if self.quantity == 0:
            self.avg_cost = 0
            return
        # Net cost per remaining share: the prior total basis minus the sale proceeds,
        # spread over what is left. Selling exactly at cost leaves avg_cost unchanged.
        self.avg_cost = (previously_owned * self.avg_cost - q * day_price) / self.quantity

    def sell_all(self):
        """Simulates selling all of a stock
        """
        self.quantity = 0
        self.avg_cost = 0

    def get_pct_change(self, change_date):
        """Gets the percent change of the stock based on the average cost
        :param change_date: The date whose opening price is compared against the average cost
        """
        change = self.get_change(change_date)
        return 100 * (change / self.avg_cost)

    def get_change(self, change_date) -> float:
        """Gets the difference between the stock's opening price on a date and its average cost
        :param change_date: The date whose opening price is used
        :returns: float
        """
        current_price = get_stock_price(self.ticker, change_date, Time.OPEN)
        return current_price - self.avg_cost


class Portfolio:
    """A simulated account holding cash and stocks, with bookkeeping for tax loss harvesting.

    Every trade and valuation in this class is priced through
    ``get_stock_price(..., Time.OPEN)`` unless a different ``time`` is passed explicitly.
    Selling a ticker places it on ``wash_sale_list``; ``buy_stock`` refuses to buy
    tickers that are on that list.
    """

    def __init__(self, name, sim_start_date, starting_cash_balance: float = 1000000, starting_stocks=None,
                 baseline=False, synthetic=False, replacement=False):
        """Creates a new portfolio
        :param name: Label used in plots and statistics
        :param sim_start_date: The date used to compute the portfolio's initial value
        :param starting_cash_balance: The amount of cash the portfolio should start with (default 1,000,000)
        :param starting_stocks: A dictionary containing the stocks the portfolio should begin with {'Ticker': Stock Object} (default is no starting stocks). The dictionary is stored by reference, not copied.
        :param baseline: Marks this portfolio as the baseline other portfolios are compared against
        :param synthetic: Marks this portfolio as the synthetic one
        :param replacement: When True, run_simulation replaces this portfolio's losing stocks
        """
        self.name = name
        if starting_stocks is None:
            starting_stocks = {}
        self.stocks = starting_stocks
        self.cash_balance = starting_cash_balance
        self.value = 0
        self.calculate_value(Time.OPEN, sim_start_date)
        self.price_history = {}        # {date: portfolio value}, filled by end_day
        self.closing_prices = []       # the same values as a plain list, in recording order
        self.returns = None            # set by end_simulation
        self.wash_sale_list = {}       # {ticker: number of day updates since it was sold}
        self.baseline = baseline
        self.synthetic = synthetic
        self.replacement = replacement
        self.diff = []                 # set by calculate_diff
        self.replacement_groups = []   # active ReplacementGroup objects
        self.do_not_replace_list = []  # tickers bought as replacements; never harvested themselves

    def calculate_value(self, time, calc_date):
        """Calculates the value of the portfolio on a given date and stores it in ``self.value``
        :param time: time of day to calculate the value, either Time.OPEN or Time.CLOSE
        :param calc_date: The date on which to calculate the value
        """
        self.calculate_value_date(calc_date, time)

    def calculate_value_date(self, c_day, time=Time.OPEN):
        """Calculates the value of the portfolio based on a specified date
        :param c_day: the date on which the portfolio's value should be calculated
        :param time: time of day to calculate the value, either (default) Time.OPEN or Time.CLOSE
        :returns: cash plus the priced value of every holding (also stored in ``self.value``)
        """
        self.value = self.cash_balance
        for ticker, holding in self.stocks.items():
            self.value += get_stock_price(ticker, c_day, time) * holding.quantity

        return self.value

    def update_wash_sale_list(self):
        """Increments the counter for every item in the wash sale list and removes an item if the counter is >30
        """
        remove_from_list = []
        for wash_stock in self.wash_sale_list:
            self.wash_sale_list[wash_stock] += 1
            if self.wash_sale_list[wash_stock] > 30:
                remove_from_list.append(wash_stock)

        # Removal happens after the loop so the dict is not resized while being iterated
        for to_remove in remove_from_list:
            self.wash_sale_list.pop(to_remove)

    def update_replacement_groups(self, update_date):
        """Ages every replacement group and unwinds the ones whose counter exceeds 30

        Unwinding sells each replacement stock that is not on the wash sale list and then
        buys back the original stock with the cash those sales raised.
        :param update_date: The date on which the unwinding trades are priced
        """
        rgs_to_delete = []
        for rg in self.replacement_groups:
            rg.counter += 1
            if rg.counter > 30:
                starting_balance = self.cash_balance
                for s in rg.stock_list:
                    if s not in self.wash_sale_list:
                        if s in self.do_not_replace_list:
                            self.do_not_replace_list.remove(s)
                        self.sell_stock(s, update_date, rg.stock_list[s], sell_all=True)
                rgs_to_delete.append(rg)
                # Reinvest exactly the proceeds of the sales above into the original stock
                self.buy_stock(rg.stock, update_date,
                               (self.cash_balance - starting_balance) / get_stock_price(rg.stock, update_date, Time.OPEN),
                               amap=True)

        for rg in rgs_to_delete:
            self.replacement_groups.remove(rg)

    def begin_day(self, date_to_begin):
        """Calculates the starting value of the day and updates the wash sale list and replacement groups
        :param date_to_begin: The date that is starting
        """
        self.calculate_value(Time.OPEN, date_to_begin)
        self.update_wash_sale_list()
        self.update_replacement_groups(date_to_begin)

    def end_day(self, date_to_record):
        """Recalculates the portfolio's value and records it for the day

        Note that the value is computed from Time.OPEN prices, like every other valuation here.
        :param date_to_record: The date associated with the ending day
        """
        self.calculate_value(Time.OPEN, date_to_record)
        self.price_history[date_to_record] = self.value
        self.closing_prices.append(self.value)

    def end_simulation(self, start_date_index, end_date_index):
        """Creates a Pandas Series of percent change of the recorded values and returns it
        :param start_date_index: The index of the day that the simulation started
        :param end_date_index: The index of the day that the simulation ended
        :returns: the percent-change Series (also stored in ``self.returns``)
        """
        self.returns = pd.Series(data=self.closing_prices, index=dates[start_date_index:end_date_index]).pct_change()
        return self.returns

    def does_own_stock(self, ticker) -> bool:
        """Reports whether this portfolio contains a given stock
        :param ticker: The stock to search for
        """
        return ticker in self.stocks

    def buy_stock(self, ticker, buy_date, quantity_to_buy, amap= False):
        """Buys stock using cash balance if possible
        :param ticker: The stock to purchase
        :param buy_date: The date on which to buy the stocks
        :param quantity_to_buy: The quantity of the stock to purchase (can be a float)
        :param amap: If not enough cash to complete trade, buy as much as possible with current cash
        """
        stock_price = get_stock_price(ticker, buy_date, Time.OPEN)
        trade_basis = stock_price * quantity_to_buy  #TODO can add fees here

        # Exit if we cannot complete the trade
        if trade_basis > self.cash_balance:
            if amap:
                quantity_to_buy = self.cash_balance / stock_price
                trade_basis = stock_price * quantity_to_buy
            else:
                warnings.warn("Not enough cash to execute trade: buy " + str(quantity_to_buy) + " " + ticker)
                return

        if ticker in self.wash_sale_list:
            warnings.warn("Cannot buy " + ticker + ": wash sale violation")
            return

        # If we do not currently own the stock, create an entry
        if ticker not in self.stocks:
            self.stocks[ticker] = Stock(ticker, buy_date)

        # Execute the trade
        self.stocks[ticker].buy(quantity_to_buy, stock_price)
        self.cash_balance -= trade_basis

    def sell_stock(self, ticker, sell_date, sell_quantity=0, sell_all=False):
        """Sells stock and adds to cash balance
        :param ticker: The stock to sell
        :param sell_date: The date on which to sell the stocks
        :param sell_quantity: The quantity of the stock to sell (can be a float)
        :param sell_all: Optional parameter to sell all stock
        """
        stock_price = get_stock_price(ticker, sell_date, Time.OPEN)
        # Ensure we have enough stock to sell
        quantity_owned = self.stocks[ticker].quantity
        if (sell_quantity > quantity_owned) and not sell_all:
            return

        if sell_all or sell_quantity == quantity_owned:
            # Full liquidation: the holding is dropped from the portfolio entirely
            self.stocks[ticker].sell_all()
            self.cash_balance += quantity_owned * stock_price
            self.stocks.pop(ticker)

            # Update the wash sale list
            self.wash_sale_list[ticker] = 0
            return

        # Do the trade
        self.stocks[ticker].sell(sell_quantity, stock_price)
        self.cash_balance += sell_quantity * stock_price
        self.wash_sale_list[ticker] = 0

    def identify_losers(self, loser_date):
        """Identifies stocks whose unrealized loss exceeds 0.3% of the portfolio's value

        The loss of a holding is (average cost - opening price on loser_date) * quantity.
        Stocks on the do-not-replace list are never reported.
        :param loser_date: The date whose opening prices are used
        :returns: List of tickers whose loss exceeds the threshold
        """
        #TODO changed identify losers metric
        identified_losers = []
        for owned_stock, holding in self.stocks.items():
            quantity = holding.quantity

            loss = holding.avg_cost * quantity - get_stock_price(owned_stock, loser_date, Time.OPEN) * quantity
            if loss > 0.003 * self.value:
                if owned_stock not in self.do_not_replace_list:
                    identified_losers.append(owned_stock)
        return identified_losers

    def calculate_diff(self, baseline):
        """Stores the per-day difference between this portfolio's recorded values and a baseline

        ``self.diff`` is rebuilt from scratch on every call, so calling this more than once
        (both generate_end_report and get_stats do) does not make the list grow.
        :param baseline: Sequence of baseline values, at least as long as ``self.closing_prices``
        """
        self.diff = [price - baseline[m] for m, price in enumerate(self.closing_prices)]

def scale_portfolio_value(p, ratio, s_s_date):
    """Builds a copy of a portfolio in which every stock quantity is multiplied by ratio

    The name, cash balance and the baseline/synthetic flags are carried over; cash is not scaled.
    :param p: The portfolio to scale
    :param ratio: Factor applied to each holding's quantity
    :param s_s_date: Date used to create the new Stock and Portfolio objects
    :returns: The new, scaled Portfolio
    """
    scaled_holdings = {}
    for symbol in p.stocks:
        scaled_quantity = p.stocks[symbol].quantity*ratio
        scaled_holdings[symbol] = Stock(symbol, s_s_date, scaled_quantity)

    return Portfolio(p.name, s_s_date, p.cash_balance, scaled_holdings,
                     baseline=p.baseline, synthetic=p.synthetic)


def get_normalized_portfolios(p, s_s_date):
    """Rescales each portfolio's holdings by 100 / (its value on the start date)

    Limitations: All portfolios but the synthetic one can only contain one stock
    :param p: The portfolios to normalize (the baseline must be in position 1 and the synthetic portfolio must be marked)
    :param s_s_date: The start date on which to normalize the portfolios
    :returns: A list of new, rescaled portfolios in the same order as p
    """
    return [
        scale_portfolio_value(portfolio, 100 / portfolio.calculate_value_date(s_s_date), s_s_date)
        for portfolio in p
    ]


def plot_portfolio(p, sim_end_date, sim_start_date):
    """Plots the recorded value of every portfolio on one chart
    :param p: The portfolios to plot
    :param sim_end_date: Index of the simulation's end date (used for tick spacing)
    :param sim_start_date: Index of the simulation's start date (used for tick spacing)
    """
    tick_spacing = round((sim_end_date - sim_start_date) / 10)
    plt.gca().xaxis.set_major_locator(mdates.DayLocator(interval=tick_spacing))

    for port in p:
        values = list(port.price_history.values())
        # The x axis is a running day counter that starts at 90
        day_numbers = list(range(90, 90 + len(values)))
        plt.plot(day_numbers, values, label=port.name)

    plt.rcParams["figure.figsize"] = [16, 9]
    plt.legend(loc='best')
    plt.show()


def plot_differences(p, sim_end_date, sim_start_date):
    """Plots each portfolio's stored diff series against the dates it recorded
    :param p: The portfolios to plot (calculate_diff must already have been run on each)
    :param sim_end_date: Index of the simulation's end date (used for tick spacing)
    :param sim_start_date: Index of the simulation's start date (used for tick spacing)
    """
    x_axis = plt.gca().xaxis
    x_axis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    tick_spacing = round((sim_end_date - sim_start_date) / 10)
    x_axis.set_major_locator(mdates.DayLocator(interval=tick_spacing))

    for port in p:
        recorded_dates = list(port.price_history)
        plt.plot(recorded_dates, port.diff, label=port.name)

    plt.gcf().autofmt_xdate()
    plt.rcParams["figure.figsize"] = [16, 9]
    plt.legend(loc='best')
    plt.show()

def get_correlations(portfolios):
    """Correlates the returns of every non-baseline portfolio with every baseline portfolio
    :param portfolios: Portfolios whose ``returns`` have been set; baselines may appear at any position
    :returns: dict keyed "<baseline name>x<other name>" holding the correlation of the two return series; empty when there is no baseline or no other portfolio
    """
    correlations = {}
    for p in portfolios:
        if not p.baseline:
            continue
        for po in portfolios:
            if not po.baseline:
                correlations[p.name+"x"+po.name] = po.returns.corr(p.returns)
    # Return only after every portfolio has been examined, so the baseline
    # does not have to be first and an empty input yields {} instead of None
    return correlations


def print_correlations(portfolios):
    """Prints one "<pair>: <correlation>" line for each entry returned by get_correlations
    :param portfolios: The portfolios to correlate
    """
    for pair_label, value in get_correlations(portfolios).items():
        print(pair_label+": "+str(value))


def generate_end_report(portfolios, sim_start_date, sim_end_date):
    """Draws the value chart, then the chart of differences from the first portfolio
    :param portfolios: The simulated portfolios; the first one is the reference for differences
    :param sim_start_date: Index of the simulation's start date
    :param sim_end_date: Index of the simulation's end date
    """
    plot_portfolio(portfolios, sim_end_date, sim_start_date)

    # Differences are always measured against the first portfolio in the list
    for portfolio in portfolios:
        reference_values = portfolios[0].closing_prices
        portfolio.calculate_diff(reference_values)

    plot_differences(portfolios, sim_end_date, sim_start_date)

def get_stats(portfolios):
    """Summarizes how closely each portfolio tracked the first portfolio in the list
    :param portfolios: Simulated portfolios; the first one is the reference
    :returns: dict with "Correlation" (from get_correlations), "MSE" (mean squared difference of recorded values, per portfolio name, plus a "CORR" entry for a portfolio that is neither baseline nor synthetic) and "MSE_percent" (mean squared difference of percentage returns)
    """
    correlations = get_correlations(portfolios)

    for portfolio in portfolios:
        portfolio.calculate_diff(portfolios[0].closing_prices)

    mse = {}
    for portfolio in portfolios:
        squares = [delta**2 for delta in portfolio.diff]
        mean_square = sum(squares)/len(squares)
        mse[portfolio.name] = mean_square
        if not (portfolio.baseline or portfolio.synthetic):
            mse["CORR"] = mean_square

    # The first return is skipped because pct_change leaves it undefined
    reference_returns = portfolios[0].returns[1:].values
    mse_percent = {}
    for portfolio in portfolios:
        own_returns = portfolio.returns[1:].values
        squares = [
            (change*100 - reference_returns[h]*100)**2
            for h, change in enumerate(own_returns)
        ]
        mse_percent[portfolio.name] = sum(squares)/len(squares)

    return {"Correlation": correlations, "MSE": mse, "MSE_percent":mse_percent}

In [546]:
def model_output_to_portfolio(model_output, output_date):
    """Converts a model's {ticker: quantity} output into {ticker: Stock}

    Tickers for which a Stock cannot be created (for example, no price is available
    on output_date) are reported and left out rather than aborting the conversion.
    :param model_output: dict mapping ticker to the quantity to hold
    :param output_date: The date used to price each new Stock
    :returns: dict mapping ticker to Stock
    """
    p = {}
    for a_stock, quantity in model_output.items():
        try:
            p[a_stock] = Stock(a_stock, output_date, quantity)
        except Exception:
            # Best effort by design, but do not swallow KeyboardInterrupt/SystemExit
            print("Excluding: "+a_stock)
    return p


def _replace_losing_stock(sim_portfolio, losing_stock, date):
    """Swaps one losing stock for a synthetic replacement of equal value, if no wash sale rule is hit."""
    # Get the replacement stocks as a temporary, cash-less portfolio
    replacement_stocks = Portfolio("Temp", date, 0,
                                   model_output_to_portfolio(hybrid_learning_replace(losing_stock, date), date))

    # Nothing usable to buy: keep the losing stock rather than dividing by zero
    # or selling it with no replacement
    if not replacement_stocks.stocks or replacement_stocks.value == 0:
        return

    # Scale the replacement so that it is worth exactly as much as the position it replaces
    losing_value = get_stock_price(losing_stock, date, Time.OPEN) * sim_portfolio.stocks[losing_stock].quantity
    ratio = losing_value / replacement_stocks.value
    replacement_stocks = scale_portfolio_value(replacement_stocks, ratio, date)

    # The whole replacement is cancelled if any one of its stocks is on the wash sale list
    if any(r_stock in sim_portfolio.wash_sale_list for r_stock in replacement_stocks.stocks):
        return

    # Sell the losing stock
    sim_portfolio.sell_stock(losing_stock, date, sell_all=True)

    # Record the replacement group and protect its members from being harvested themselves
    s_list = {}
    for buy_stock, holding in replacement_stocks.stocks.items():
        s_list[buy_stock] = holding.quantity
        sim_portfolio.do_not_replace_list.append(buy_stock)
    rg = ReplacementGroup(s_list, losing_stock)

    # Buy the replacement stocks
    for buy_stock, holding in replacement_stocks.stocks.items():
        sim_portfolio.buy_stock(buy_stock, date, holding.quantity, amap=True)

    sim_portfolio.replacement_groups.append(rg)


def run_simulation(sim_start_date, sim_end_date, portfolios, graph = True):
    """Steps every portfolio through each day of the simulation window

    Each day, each portfolio runs its start-of-day updates; portfolios flagged
    ``replacement`` then have their losing stocks swapped for synthetic replacements;
    finally the day's value is recorded.
    :param sim_start_date: Index of the first simulated day
    :param sim_end_date: Index one past the last simulated day
    :param portfolios: The portfolios to simulate (mutated in place)
    :param graph: Whether to draw the end report after the simulation
    :returns: list with each portfolio's return Series, in the order of ``portfolios``
    """
    for date in range(sim_start_date, sim_end_date):
        for sim_portfolio in portfolios:
            # Allow the portfolio to perform start of day updates
            sim_portfolio.begin_day(date)

            if sim_portfolio.replacement:
                # Determine which stocks should be tax loss harvested
                for losing_stock in sim_portfolio.identify_losers(date):
                    _replace_losing_stock(sim_portfolio, losing_stock, date)

            # Allow the portfolio to perform end-of-day updates
            sim_portfolio.end_day(all_dates[date])

    # Inform the portfolios that the simulation has ended
    returns = [portfolio.end_simulation(sim_start_date, sim_end_date) for portfolio in portfolios]

    if graph:
        generate_end_report(portfolios, sim_start_date, sim_end_date)

    return returns


def simulate_stock(stock, sim_start_date, sim_end_date, tra_end_date, train_num_days, num_principal_components,
                   num_pred_stocks, is_sparse, regression_only = False, graph = True):
    """Simulates a stock against its synthetic position and its most correlated stock
    :param stock: Ticker of the stock to be simulated; used as the baseline portfolio
    :param sim_start_date: Date index at which the portfolios are created and the simulation starts
    :param sim_end_date: Date index passed to run_simulation as the end of the simulation
    :param tra_end_date: Last date index of the training period handed to the correlation and prediction steps
    :param train_num_days: How many days of history are handed to the correlation and prediction steps
    :param num_principal_components: Forwarded to predict_portfolio
    :param num_pred_stocks: Forwarded to predict_portfolio
    :param is_sparse: Forwarded to predict_portfolio
    :param regression_only: Forwarded to predict_portfolio as a keyword argument
    :param graph: Forwarded to run_simulation
    :return: The result of get_normalized_portfolios for [baseline, synthetic, correlated], after the simulation has run
    """
    # The comparison ticker is the SECOND key of the correlation listing
    # (presumably the first key is the stock itself -- TODO confirm)
    correlation_listing = corr_list_for_single_stock(stock, tra_end_date, train_num_days)
    peer_ticker = list(correlation_listing.keys())[1]

    # Fit the model on the training period and turn its output into starting holdings
    model_output = predict_portfolio(stock, num_principal_components, train_num_days, tra_end_date,
                                     num_pred_stocks, is_sparse, regression_only=regression_only)
    synthetic_holdings = model_output_to_portfolio(model_output, sim_start_date)

    # One portfolio per strategy; the peer and baseline portfolios each hold a single stock with quantity 1
    synthetic = Portfolio("Synthetic", sim_start_date, 0, starting_stocks=synthetic_holdings, synthetic=True)
    peer_holdings = {peer_ticker: Stock(peer_ticker, sim_start_date, 1)}
    peer = Portfolio(peer_ticker, sim_start_date, 0, starting_stocks=peer_holdings)
    baseline_holdings = {stock: Stock(stock, sim_start_date, 1)}
    baseline = Portfolio(stock, sim_start_date, 0, starting_stocks=baseline_holdings, baseline=True)

    # The real stock has to be the first entry of the list
    portfolios = get_normalized_portfolios([baseline, synthetic, peer], sim_start_date)
    run_simulation(sim_start_date, sim_end_date, portfolios, graph)
    return portfolios

In [549]:
# startdate = 90
#
# # Make sure you have 2 separate stock lists; otherwise both portfolios share the same Stock objects (pass by reference) and will perform exactly the same
# snp_composition_stocks = model_output_to_portfolio(snp_composition, train_start_date+90)
# snp_composition_stocks2 = model_output_to_portfolio(snp_composition, train_start_date+90)
# sample = {"AAPL": Stock("AAPL", 90, 1), "COST": Stock("COST", 90, 1), "MXL": Stock("MXL", 90, 1), "WMT": Stock("WMT", 90, 1),
#           "KMI": Stock("KMI", 90, 1), "MTD": Stock("MTD", 90, 1), "MDLZ": Stock("MDLZ", 90, 1), "KR": Stock("KR", 90, 1),
#           "HES": Stock("HES", 90, 1), "ORLY": Stock("ORLY", 90, 1), "HLT": Stock("HLT", 90, 1), "HOLX": Stock("HOLX", 90, 1),
#           "HON": Stock("HON", 90, 1), "OXY": Stock("OXY", 90, 1), "OMC": Stock("OMC", 90, 1), "LHX": Stock("LHX", 90, 1)}
#
# sample2 = {"AAPL": Stock("AAPL", 90, 1), "COST": Stock("COST", 90, 1), "MXL": Stock("MXL", 90, 1), "WMT": Stock("WMT", 90, 1),
#           "KMI": Stock("KMI", 90, 1), "MTD": Stock("MTD", 90, 1), "MDLZ": Stock("MDLZ", 90, 1), "KR": Stock("KR", 90, 1),
#           "HES": Stock("HES", 90, 1), "ORLY": Stock("ORLY", 90, 1), "HLT": Stock("HLT", 90, 1), "HOLX": Stock("HOLX", 90, 1),
#           "HON": Stock("HON", 90, 1), "OXY": Stock("OXY", 90, 1), "OMC": Stock("OMC", 90, 1), "LHX": Stock("LHX", 90, 1)}
#
# snp = Portfolio("SNP", startdate, 0,snp_composition_stocks , baseline=True)
# rep = Portfolio("Replacer", startdate, 0, snp_composition_stocks2, replacement=True)
#
# run_simulation(startdate, 360, [snp, rep], True)


# When a stock is replaced, group its replacement stocks together. Disallow that group from being replaced itself, and add a way to sell the group off and buy back the original stock after 30 days.
# Open question: what should happen if another stock that is already owned is in that group? Presumably only part of that position would have to be sold.
# TODO: address the case where only some of such a stock needs to be sold

In [551]:
# def simulate_stock(stock: {__eq__},
#                    sim_start_date: Any,
#                    sim_end_date: {__sub__},
#                    tr_end_date: {__sub__},
#                    train_num_days: Any,
#                    num_principal_components: Any,
#                    num_pred_stocks: Any,
#                    sparse: Any) -> None
#                    graph: bool = True) -> None
#simulate_stock('AAPL', test_start_date, test_start_date + 20, train_end_date, 120, 2, 10, False, graph = True)

# Validation-based grid search over the synthetic-position hyperparameters.
# Every combination is scored by the mean of the 'Synthetic' MSE reported by get_stats over
# `num_repeats` simulations, each run on a randomly drawn stock and training end date.
# The combination with the lowest mean is kept (ties go to the combination evaluated last).
num_repeats = 100  # simulations averaged per hyperparameter combination
train_day_options = [0, 30, 60, 90, 150, 180, 360]  # 0 means: train on all history before tr_end_date
# Shortest history accepted in "all history" mode, so that a draw close to train_start_date
# can never produce a zero or negative training window
min_history_days = min(days for days in train_day_options if days > 0)
# Latest training end date that still leaves room for the validation window
latest_tr_end_date = train_end_date - num_validation_days
# Materialised once instead of once per repetition
ticker_pool = list(all_tickers)

i = 0
hyperparameters = {}
low_mse = 10000000
corr_mse = 0
for n_principal_components in range(1, 4):
    for num_train_days in train_day_options:
        # Earliest admissible training end date for this window setting
        if num_train_days == 0:
            earliest_tr_end_date = train_start_date + min_history_days + 1
        else:
            earliest_tr_end_date = train_start_date + num_train_days

        for n_pred_stocks in [2, 3, 5, 10, 15, 25, 50, 100]:
            # Skip combinations with at least as many principal components as predictor stocks
            if n_principal_components >= n_pred_stocks:
                continue
            for sparse in [True, False]:
                for reg_only in [True, False]:
                    mses = []
                    corr_mses = []
                    # Repeat the simulation to get an average measure of the MSE for these
                    # hyperparameters over different stocks and time periods
                    for _ in range(num_repeats):
                        # tr_end_date should be within [train_start_date+num_train_days, train_end_date-num_validation_days]
                        # to ensure it does not leak into the test set, even after it is validated
                        tr_end_date = random.randint(earliest_tr_end_date, latest_tr_end_date)

                        # validation starts on the soonest day after training is done
                        validation_start_date = tr_end_date + 1

                        stock_to_sim = random.choice(ticker_pool)

                        # Resolve the "all history" sentinel for THIS draw only. The loop variable
                        # must not be overwritten: doing so would freeze the window chosen by the
                        # first draw for every later repetition and combination.
                        window_days = tr_end_date - 1 if num_train_days == 0 else num_train_days

                        stats = get_stats(simulate_stock(stock_to_sim, validation_start_date,
                                                         validation_start_date + num_validation_days,
                                                         tr_end_date, window_days, n_principal_components,
                                                         n_pred_stocks, sparse, regression_only=reg_only,
                                                         graph=False))
                        mses.append(stats['MSE']['Synthetic'])
                        corr_mses.append(stats['MSE']["CORR"])

                    # Progress indicator: one line per evaluated combination
                    print(i)
                    i += 1

                    avg_mse = sum(mses) / len(mses)
                    avg_corr_mse = sum(corr_mses) / len(corr_mses)

                    if avg_mse <= low_mse:
                        low_mse = avg_mse
                        corr_mse = avg_corr_mse
                        # 'ntd' stores the grid setting itself; 0 stands for "all available history"
                        hyperparameters = {'npca': n_principal_components, 'ntd': num_train_days, 'nps': n_pred_stocks, 'sparse': sparse, 'regression_only': reg_only}
print(low_mse)
print(corr_mse)
print(hyperparameters)

#0.0001232489645044427
#{'npca': 2, 'ntd': 30, 'nps': 3, 'sparse': True}
# simulate_stock('COST', test_start_date, test_start_date + 20, train_end_date, 30, 2, 3, True, graph = True)

# 0.010779939104229553
# {'npca': 4, 'ntd': 30, 'nps': 5, 'sparse': False}
# simulate_stock("COST", test_start_date, test_start_date+30, train_end_date, 30, 4, 5, True, graph=True)

# 46.56410051362909
# {'npca': 4, 'ntd': 30, 'nps': 25, 'sparse': False}

# 47.28842525584654
# {'npca': 7, 'ntd': 90, 'nps': 25, 'sparse': True}

# 36.120000133501776
# {'npca': 1, 'ntd': 150, 'nps': 15, 'sparse': False}


# 22.05078125720328
# {'npca': 3, 'ntd': 360, 'nps': 5, 'sparse': False}

# 29.38596396938674
# 53.247185633481195
# {'npca': 1, 'ntd': 90, 'nps': 15, 'sparse': False}

# num = random.randint(0, len(all_tickers)-1)
# stock_to_sim = list(all_tickers)[num]
# sportfolios = simulate_stock(stock_to_sim, test_start_date, test_start_date+30, train_end_date, 90, 2, 15, False, graph=True)
# print(get_stats(sportfolios))


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
48.443808196496505
63.3444940771782
{'npca': 1, 'ntd': 30, 'nps': 10, 'sparse': True}


In [None]:
#pf.create_returns_tear_sheet(sportfolios[1].returns)