In [4]:
import os

import pandas as pd
from tqdm.autonotebook import tqdm
def load_data(ticker):
    """Read the CSV stored for ``ticker`` under ``./indicator_data``.

    The ``date`` column becomes the index and is parsed as dates.

    Parameters
    ----------
    ticker : str
        File stem of the CSV to read (``./indicator_data/<ticker>.csv``).

    Returns
    -------
    pandas.DataFrame
        The file contents indexed by ``date``.
    """
    return pd.read_csv(
        f'./indicator_data/{ticker}.csv',
        index_col='date',
        parse_dates=True,
    )

def evaluate_condition(df, indicator, direction, value, days):
    """Flag rows where a threshold comparison has held for ``days`` rows in a row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``indicator`` column.
    indicator : str
        Name of the column to compare.
    direction : str
        ``'>'`` selects ``column > value``; any other string selects
        ``column < value``.
    value : float
        Threshold the column is compared against.
    days : int
        Length of the rolling window.

    Returns
    -------
    pandas.Series
        Boolean series, True where the comparison is true on every row of the
        trailing ``days``-row window. Rows without a full window are False.
    """
    series = df[indicator]
    hit = series.gt(value) if direction == '>' else series.lt(value)

    # The rolling minimum of a boolean series is 1 only when every row in the
    # window is True; incomplete windows give NaN, which compares unequal to 1.
    window_min = hit.rolling(window=days).min()
    return window_min.eq(1)

def evaluate_branch(branch):
    """Compute the daily return factors produced by one branch specification.

    ``branch`` is a hyphen-separated string with exactly seven fields:
    ``indicator-period-indicator_ticker-direction-threshold-days-trading_ticker``.
    The indicator column ``f"{indicator}_{period}"`` of the indicator ticker is
    tested with ``evaluate_condition``; the signal from one row is applied to
    the next row's close-to-close percentage change of the trading ticker.

    Returns
    -------
    pandas.DataFrame
        Single column ``trade_returns_day`` indexed by the dates shared by both
        tickers. Each value is ``1 + signal_of_previous_row * pct_change``.
        The first row is NaN.
    """
    (indicator, period, indicator_ticker,
     direction, threshold, days, trading_ticker) = branch.split('-')

    indicator_df = load_data(indicator_ticker)
    trading_df = load_data(trading_ticker)

    # Restrict both frames to the dates they have in common.
    shared_dates = indicator_df.index.intersection(trading_df.index)
    indicator_df = indicator_df.loc[shared_dates]
    trading_df = trading_df.loc[shared_dates]

    signal = evaluate_condition(
        indicator_df,
        f"{indicator}_{period}",
        direction,
        float(threshold),
        int(days),
    )

    result = pd.DataFrame(index=trading_df.index)
    result['condition_met'] = signal.astype(int)

    # Lag the signal by one row so a signal on one row trades the next row's move.
    result['shifted_condition'] = result['condition_met'].shift(1)
    close_change = trading_df['close'].pct_change()
    result['trade_returns_day'] = (result['shifted_condition'] * close_change) + 1

    return result[['trade_returns_day']]


def compute_monthly_returns(branch, df):
    """Compound daily return factors into one factor per calendar month.

    The input frame is left untouched: grouping uses a ``YYYY-MM`` label
    derived from the index rather than helper columns written onto ``df``.

    Parameters
    ----------
    branch : str
        Branch identifier; used as the name of the single output column.
    df : pandas.DataFrame
        Frame with a ``DatetimeIndex`` and a ``trade_returns_day`` column of
        daily return factors.

    Returns
    -------
    pandas.DataFrame
        One row per month, indexed by ``'YYYY-MM'`` strings (index name
        ``month-year``), with a single column named ``branch`` (columns name
        ``branch``). Each value is the product of that month's daily factors;
        NaN factors are skipped by ``prod``.
    """
    daily = df['trade_returns_day']

    # 'YYYY-MM' labels sort lexicographically in chronological order.
    month_labels = daily.index.strftime('%Y-%m')
    monthly = daily.groupby(month_labels).prod()

    return (
        monthly
        .rename_axis('month-year')
        .to_frame(name=branch)
        .rename_axis(columns='branch')
    )

def batch_processor(branches):
    """Evaluate every branch in ``branches`` and join their monthly returns.

    Parameters
    ----------
    branches : iterable of str
        Branch specification strings accepted by ``evaluate_branch``.

    Returns
    -------
    pandas.DataFrame
        One column per branch, one row per ``month-year`` label; rows are the
        union of the months seen across branches. Empty when ``branches`` is
        empty.
    """
    per_branch = [
        compute_monthly_returns(branch, evaluate_branch(branch))
        for branch in branches
    ]

    # pd.concat raises on an empty list, so keep the empty-frame result
    # explicitly for an empty batch.
    if not per_branch:
        return pd.DataFrame()

    # Concatenate once: concatenating inside the loop copies the accumulated
    # frame on every iteration.
    return pd.concat(per_branch, axis=1)

# One branch specification per line. Blank lines are skipped because
# evaluate_branch cannot unpack an empty string into its seven fields.
with open('branches.txt', 'r') as file:
    branches = [line.strip() for line in file if line.strip()]

# to_parquet does not create missing parent directories.
os.makedirs('./monthly_returns', exist_ok=True)

# Process the branches in fixed-size batches and write one parquet file per batch.
batch_size = 100
for i in tqdm(range(0, len(branches), batch_size), desc="Processing branches"):
    batch = branches[i:i+batch_size]
    monthly_returns_df = batch_processor(batch)
    monthly_returns_df.to_parquet(f'./monthly_returns/monthly_returns_{i//batch_size}.parquet')

  from tqdm.autonotebook import tqdm


Processing branches:   0%|          | 0/1006 [00:00<?, ?it/s]

In [32]:
# extract results
import os
import pandas as pd
def gather_monthly_returns(directory):
    """Load every ``.parquet`` file in ``directory`` into one wide frame.

    Files are read in sorted file-name order so the column order of the result
    is reproducible (``os.listdir`` returns entries in arbitrary order).

    Parameters
    ----------
    directory : str
        Folder to scan; only names ending in ``.parquet`` are read.

    Returns
    -------
    pandas.DataFrame
        The files' columns placed side by side (``axis=1``). Empty when the
        directory contains no parquet files.
    """
    parquet_names = sorted(
        name for name in os.listdir(directory) if name.endswith('.parquet')
    )
    if not parquet_names:
        return pd.DataFrame()

    # Read everything first and concatenate once; concatenating inside the
    # loop copies the accumulated frame for every file.
    frames = [pd.read_parquet(os.path.join(directory, name)) for name in parquet_names]
    return pd.concat(frames, axis=1)

def _window_in_years(start, end):
    """Length of the inclusive month window ``start``..``end`` as a fraction of a year."""
    months = (end.year - start.year) * 12 + (end.month - start.month) + 1
    return months / 12


def one_back_one_forward(monthly_returns_df, back_start, back_end, forward_start, forward_end, top_n):
    """Rank branches on a back window and report how the winners did afterwards.

    Parameters
    ----------
    monthly_returns_df : pandas.DataFrame
        Monthly return factors, one column per branch, indexed by datetime.
    back_start, back_end, forward_start, forward_end : str or datetime-like
        Window bounds. Strings (e.g. ``'2010-02'``) are parsed with
        ``pd.to_datetime``. Both ends are included, as in ``.loc`` slicing.
    top_n : int
        Number of best back-window branches to keep.

    Returns
    -------
    pandas.DataFrame
        Columns ``back_period_returns`` and ``forward_period_returns``. One row
        per selected branch holds its cumulative growth factor in each window,
        followed by three summary rows:

        * ``avg`` - mean growth of the selected branches, as a percentage gain.
        * ``portion_of_year`` - window length in years, counting whole months
          with both end months included.
        * ``cagr`` - ``avg`` divided by ``portion_of_year``.
          NOTE(review): this is a linear annualisation, not a compounded rate.

    Notes
    -----
    Prints the selected branches and the final table as a side effect.
    """
    back_start, back_end, forward_start, forward_end = (
        pd.to_datetime(bound) if isinstance(bound, str) else bound
        for bound in (back_start, back_end, forward_start, forward_end)
    )

    back_returns = monthly_returns_df.loc[back_start:back_end]
    forward_returns = monthly_returns_df.loc[forward_start:forward_end]

    # Cumulative growth over the back window; keep the top_n best branches.
    top_n_branches = back_returns.cumprod().iloc[-1].nlargest(top_n)
    print(top_n_branches)

    branches_to_check = top_n_branches.index.tolist()
    print(branches_to_check)

    # Cumulative growth of those same branches over the forward window.
    forward_period_returns = forward_returns.cumprod().iloc[-1].loc[branches_to_check]

    dataset = pd.concat([top_n_branches, forward_period_returns], axis=1)
    dataset.columns = ['back_period_returns', 'forward_period_returns']

    # Take the mean before any summary row is appended, so the summary rows
    # never feed back into the statistics.
    avg_pct = (dataset.mean() - 1) * 100

    dataset.loc['avg'] = avg_pct
    # Count months rather than days: the slices above include the end month,
    # and a single-month window must not give a zero-length period.
    dataset.loc['portion_of_year'] = [
        _window_in_years(back_start, back_end),
        _window_in_years(forward_start, forward_end),
    ]
    dataset.loc['cagr'] = avg_pct / dataset.loc['portion_of_year']
    print(dataset)

    return dataset

# Combine all per-batch parquet files and turn the 'YYYY-MM' index labels
# into timestamps so the frame can be sliced by date.
monthly_returns_df = gather_monthly_returns('./monthly_returns')
monthly_returns_df.index = pd.to_datetime(monthly_returns_df.index)

# Back window used to rank branches, forward window used to score the picks.
back_start, back_end = "2010-02", "2010-12"
forward_start, forward_end = "2011-01", "2011-03"
top_n = 100

dataset = one_back_one_forward(
    monthly_returns_df,
    back_start,
    back_end,
    forward_start,
    forward_end,
    top_n,
)

# Persist the comparison table.
dataset.to_csv('./dataset.csv')

branch
rsi-3-SPXL-<-78-1-SPXL    7.233796
rsi-3-SPXL-<-79-1-SPXL    7.233796
rsi-3-SPXL-<-80-1-SPXL    7.233796
rsi-3-SPXL-<-81-1-SPXL    7.233796
rsi-3-SPXL-<-82-1-SPXL    7.233796
                            ...   
rsi-3-SPXL-<-24-1-SPXL    4.340278
rsi-3-SPXL-<-25-1-SPXL    4.340278
rsi-3-SPXL-<-12-1-SPXL    4.340278
rsi-3-SPXL-<-13-1-SPXL    4.340278
rsi-3-SPXL-<-14-1-SPXL    4.340278
Name: 2010-12-01 00:00:00, Length: 100, dtype: float64
['rsi-3-SPXL-<-78-1-SPXL', 'rsi-3-SPXL-<-79-1-SPXL', 'rsi-3-SPXL-<-80-1-SPXL', 'rsi-3-SPXL-<-81-1-SPXL', 'rsi-3-SPXL-<-82-1-SPXL', 'rsi-3-SPXL-<-83-1-SPXL', 'rsi-3-SPXL-<-84-1-SPXL', 'rsi-2-SPXL-<-16-1-SPXL', 'rsi-2-SPXL-<-17-1-SPXL', 'rsi-2-SPXL-<-18-1-SPXL', 'rsi-2-SPXL-<-19-1-SPXL', 'rsi-2-SPXL-<-20-1-SPXL', 'rsi-2-SPXL-<-21-1-SPXL', 'rsi-2-SPXL-<-22-1-SPXL', 'rsi-2-SPXL-<-23-1-SPXL', 'rsi-2-SPXL-<-24-1-SPXL', 'rsi-2-SPXL-<-25-1-SPXL', 'rsi-2-SPXL-<-26-1-SPXL', 'rsi-2-SPXL-<-27-1-SPXL', 'rsi-2-SPXL-<-28-1-SPXL', 'rsi-2-SPXL-<-29-1-SPXL', 'rsi-3