# Import Libraries, Load Data

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.statespace.sarimax import SARIMAX
import seaborn as sns
import yfinance as yf
from numpy import linalg as LA
from sklearn.cluster import KMeans
from statsmodels.tsa.stattools import coint
from itertools import combinations
import matplotlib.dates as mdates
from datetime import datetime

In [3]:
# Paths:
# price_path = '/Users/tuckeringlefield/Desktop/FinanceData/price_data_from_shardar.csv'
# cap_path = '/Users/tuckeringlefield/Desktop/FinanceData/cap_data_from_shardar.csv'
price_path = "C:/Users/lukee/Downloads/price_data_from_shardar.csv"
cap_path = "C:/Users/lukee/Downloads/cap_data_from_shardar.csv"

# Reading Data:
prices_df = pd.read_csv(price_path, index_col='date')
caps_df = pd.read_csv(cap_path, index_col='date')

# Drop troubled stock...
prices_df.drop(["MGI", "MDLZ", "DWA", "ICE"], axis=1, inplace=True)
caps_df.drop(["MGI", "MDLZ", "DWA", "ICE"], axis=1, inplace=True)

In [59]:
# Convert dates to datetime
prices_df.index = pd.to_datetime(prices_df.index)

# Get the initial start and end date
start_date = prices_df.index[0]
end_date = prices_df.index[-1]

# Download additional data:
spy_data = yf.download('SPY', start=start_date, end=end_date, interval='1d')
spy_data = pd.DataFrame(spy_data["Adj Close"])
spy_data.rename({"Adj Close": 'SPY'}, inplace=True, axis=1)

# Set up dataframes:
prices_with_market = prices_df.merge(spy_data, how='left', left_index=True, right_index=True)
# df_diff = prices_with_market.diff().dropna()

[*********************100%%**********************]  1 of 1 completed


# Function Definitions

In [5]:
# Function to find the top ten largest market cap stocks
def find_top_liquid_stocks(dataframe, date_start, date_end, num_stocks):
    target_date = date_end + 1
    selected_row = caps_df.iloc[target_date]
    selected_row_no_null = selected_row.dropna()
    stocks_list = selected_row_no_null.nlargest(num_stocks).index.tolist()
    return stocks_list

In [6]:
# Function to filter the DF
def filter_diff_df(dataframe, date_start, date_end, stocks_list):
    desired_columns = stocks_list.copy()
    desired_columns.append("SPY")
    filtered_df = dataframe[desired_columns]
    filtered_df = filtered_df[date_start:date_end].diff()#.dropna()
    filtered_df = filtered_df.dropna()
    return filtered_df

In [7]:
def filter_df_by_dates(dataframe, date_start, date_end, stocks_list):
    desired_columns = stocks_list.copy()
    desired_columns.append("SPY")
    filtered_df = dataframe[desired_columns]
    filtered_df = filtered_df[date_start:date_end]
    return filtered_df

In [8]:
# Function to calculate the residuals
def calculate_residuals(df, stocks_list):
    res_df = pd.DataFrame()
    for stk in stocks_list:
        res_df[stk] = df[stk]-df[stk+"_beta"]*df["SPY"]
    return res_df

In [9]:
# Function to cluster the matrix
def cluster_the_matrix(df, num_clusters):
    A = abs(df.corr().values)
    D = np.diag(A.sum(axis=1))
    L = D - A
    eigenvalues, eigenvectors = LA.eig(L)
    X = eigenvectors[:,:num_clusters]
    kmeans = KMeans(n_clusters=num_clusters, random_state=2, n_init=20).fit(X)

    cluster_dict = {}

    # Iterate over the indices of cluster_list
    for i in range(len(kmeans.labels_)):
        cluster_number = kmeans.labels_[i]
        stock_name = df.columns[i]

        # Check if cluster_number is already a key in the dictionary
        if cluster_number in cluster_dict:
            cluster_dict[cluster_number].append(stock_name)
        else:
            cluster_dict[cluster_number] = [stock_name]

    # Cluster diagram:
    # fig, ax = plt.subplots(1, 1, figsize=(8, 4))
    # scatter = ax.scatter(X[:, 0], X[:, 1], c=kmeans.labels_)
    # unique_labels = {label: idx for idx, label in enumerate(set(kmeans.labels_))}
    # handles = [plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=scatter.cmap(scatter.norm(value)), markersize=10)
    #        for value in unique_labels.values()]
    # labels = unique_labels.keys()
    # ax.legend(handles, labels, title="Clusters", loc="best", bbox_to_anchor=(1, 1))
    # ax.set_title(f'K-Means Clustering Results with K={num_clusters}')
    # plt.show()

    return cluster_dict


In [10]:
# Function to find the cointegrated pairs
def find_cointegrated_pairs(dataframe, cluster_dict, sig_level):
    cointegrated_pairs = []
    for cluster_num, stocks in cluster_dict.items():
      for stock1, stock2 in combinations(stocks, 2):
          pvalue1 = coint(dataframe[stock1], dataframe[stock2])[1]
          pvalue2 = coint(dataframe[stock2], dataframe[stock1])[1]
          if pvalue1 < sig_level and pvalue2 < sig_level:
              cointegrated_pairs.append((stock1, stock2))
    return cointegrated_pairs

In [11]:
# Function to check on existing pairs
def is_still_conintegrated(dataframe, pair, sig_level):
    stock1 = pair[0]
    stock2 = pair[1]
    pvalue1 = coint(dataframe[stock1], dataframe[stock2])[1]
    pvalue2 = coint(dataframe[stock2], dataframe[stock1])[1]
    if pvalue1 < sig_level and pvalue2 < sig_level:
        return True
    return False

In [12]:
# weekly function to calculate the beta of the pair
def calculate_beta_for_pair(dataframe, pair):
    asst1 = pair[0]
    asst2 = pair[1]

    train = dataframe[[asst1, asst2]]

    beta = train.cov().iloc[0, 1]/train[asst2].var()
    return beta

In [75]:
# Function to calculate the betas
def calculate_betas(dataframe, date_start, date_end, stocks_list):
    # df = dataframe[date_start:date_end]
    print("Before", len(dataframe[stocks_list]))
    beta_values = []
    columns = []

    df_var = dataframe['SPY'].var()

    for stk in stocks_list:
        df_cov = dataframe[[stk, 'SPY']].cov().loc[stk, 'SPY']
        beta = df_cov / df_var
        beta_values.append(beta)
        columns.append(stk + '_beta')
        
    print('inside calculate betas')
    print("After", len(dataframe[stocks_list]))
    # print(dataframe[stocks_list])

    beta_df = pd.DataFrame([beta_values], columns=columns)
    beta_df.index = dataframe.index[:1]

    # plt.figure(figsize=(12, 4))
    # sns.boxplot(data=beta_df)
    # plt.show()

    return beta_df

In [14]:
# Function to get the spread data
def get_spread_limits_for_past_months(dataframe, pair, beta):
    asst1 = pair[0]
    asst2 = pair[1]
    asst1_mean = dataframe[asst1].mean() 
    asst2_mean = dataframe[asst2].mean() 
    spread_data = None
    order = []
    if asst1_mean > asst2_mean:
        spread_data = dataframe[asst1]-beta*dataframe[asst2]
        order = [asst1, asst2]
    else:
        spread_data = dataframe[asst2]-beta*dataframe[asst1]
        order = [asst2, asst1]
    mean = spread_data.mean()
    std_dev = spread_data.std()
    lower_limit = mean - (2*std_dev)
    upper_limit = mean + (2*std_dev)
    
    return upper_limit, lower_limit, order, mean

In [15]:
def print_spread_charts(data_series, upper_bound, lower_bound, mean_value, pair_name):
    # Extract dates and values
    dates = data_series.index
    values = data_series.values
    
    # Plot the data
    plt.figure(figsize=(10, 6))
    
    # Plot the average line
    plt.axhline(y=mean_value, color='blue', linestyle='-', linewidth=1, label='Mean for Past 3 months')
    
    # Plot the dashed lines for ±2 standard deviations
    plt.axhline(y=upper_bound, color='red', linestyle='--', linewidth=1, label='+2 Std Dev')
    plt.axhline(y=lower_bound, color='red', linestyle='--', linewidth=1, label='-2 Std Dev')
    
    # Plot the individual data points
    plt.scatter(dates, values, color='black', zorder=5)
    
    # Annotate the plot
    plt.title(f'Spread data for: {pair_name} for monitoring week')
    plt.xlabel('Date')
    plt.ylabel('Value')
    plt.legend()
    
    # Display the plot
    plt.grid(True)
    plt.show()

In [64]:
def monitor_pair_for_week(pair, pair_dict, start_date_index, end_date_index, curr_week_start_index, currently_trading_stocks):
    global prices_with_market
    # Get data
    current_week_price_data = prices_with_market[curr_week_start_index : end_date_index] # 7 days of data
    past_three_month_data = prices_with_market[start_date_index : curr_week_start_index]
    
    beta_past_three_months = calculate_beta_for_pair(past_three_month_data, pair)
    upper, lower , pair_order, mean_val = get_spread_limits_for_past_months(past_three_month_data, pair, beta_past_three_months)
    curr_week_spread_data = current_week_price_data[pair_order[0]]-beta_past_three_months*current_week_price_data[pair_order[1]]
    have_days_to_trade = False
    for index, dayValue in zip(curr_week_spread_data.index, curr_week_spread_data.values):
        if len(pair_dict["open_trade_dates"]) > len(pair_dict["close_trade_dates"]):
            if (pair_dict["trade_day_spread_position"] == "upper" and curr_week_spread_data[index] <= mean_val) or (pair_dict["trade_day_spread_position"] == "lower" and curr_week_spread_data[index] >= mean_val):
                # Found a closing day
                date = index.strftime('%Y-%m-%d')
                pair_dict["close_trade_dates"].append(date)
                pair_dict["trade_status_message"] = "Looking for trading days"
                currently_trading_stocks.remove(pair[0])
                currently_trading_stocks.remove(pair[1])
                pair_dict["trade_day_spread_position"] = "unknown"
                have_days_to_trade = True
        else:
            if curr_week_spread_data[index] >= upper:
                date = index.strftime('%Y-%m-%d')
                pair_dict["open_trade_dates"].append(date)
                pair_dict["trade_status_message"] = "Currently in trade"
                pair_dict["trade_day_spread_position"] = "upper"
                currently_trading_stocks.append(pair[0])
                currently_trading_stocks.append(pair[1])
                have_days_to_trade = True
            elif curr_week_spread_data[index] <= lower:
                date = index.strftime('%Y-%m-%d')
                pair_dict["open_trade_dates"].append(date)
                pair_dict["trade_status_message"] = "Currently in trade"
                pair_dict["trade_day_spread_position"] = "lower"
                currently_trading_stocks.append(pair[0])
                currently_trading_stocks.append(pair[1])
                have_days_to_trade = True        
                
    return pair_dict, currently_trading_stocks
    # if have_days_to_trade:
    #     print_spread_charts(curr_week_spread_data, upper, lower, mean_val, pair)


In [84]:
def monitor_group_of_pairs(stock_group_size, stop_after_weeks):
    global prices_with_market
    # List of currently trading stocks
    currently_trading_stocks = []
    existing_stocks = {}
    # variable to track the weeks running
    weeks_running = 0
    while (weeks_running < stop_after_weeks):
        print(f"---------------------------- WEEKS RUNNING {weeks_running + 1} -------------------------------------")
        # Calculate index
        start_date_index = ((weeks_running)*7)
        end_date_index = ((weeks_running)*7) + 96
        curr_week_start_index = ((weeks_running)*7) + 89
        # Get data 
        three_month_plus_one_week_df = prices_with_market[start_date_index : end_date_index]
        forward_three_months_data = prices_with_market[start_date_index+6 : end_date_index]
        print(len(forward_three_months_data))
        # Monitor every stock pair that should be monitored in existing_stocks
        for stock_pair in existing_stocks:
            message = existing_stocks[stock_pair]["trade_status_message"]
            print(f"{stock_pair}: {message}")
            if existing_stocks[stock_pair]['should_monitor']:
                updated_vals, new_currently_trading_stock = monitor_pair_for_week(stock_pair, existing_stocks[stock_pair], start_date_index, end_date_index, curr_week_start_index, currently_trading_stocks)
                existing_stocks[stock_pair] = updated_vals
                currently_trading_stocks = new_currently_trading_stock
                  
        # Setup for next week
        highly_liquid_stocks = find_top_liquid_stocks(prices_with_market, (start_date_index+7) , (end_date_index), stock_group_size)
        liquid_stocks_not_null = []
        for stock in highly_liquid_stocks:
            num_nulls_for_stock = forward_three_months_data[[stock]].isnull().sum().item()
            if num_nulls_for_stock == 0:
                liquid_stocks_not_null.append(stock)

        # Grab day string to start or stop trades
        curr_week_date_string = prices_with_market.index[curr_week_start_index].strftime('%Y-%m-%d')
        end_date_string = prices_with_market.index[end_date_index].strftime('%Y-%m-%d')
        
        # Set up for inserting start and end dates for existing stock pairs
        for stock_pair in existing_stocks:
            # Both stocks must be highly liquid to trade
            if stock_pair[0] in highly_liquid_stocks and stock_pair[1] in highly_liquid_stocks:
                # Check for nulls in data for stock pair
                num_nulls_first_stock = forward_three_months_data[[stock_pair[0]]].isnull().sum().item()
                num_nulls_second_stock = forward_three_months_data[[stock_pair[1]]].isnull().sum().item()
                if num_nulls_first_stock > 0 or num_nulls_second_stock > 0:
                    if stock_pair[0] in currently_trading_stocks and stock_pair[1] in currently_trading_stocks:
                        # Close trades
                        existing_stocks[stock_pair]["cointegration_end_dates"].append(end_date_string)
                        existing_stocks[stock_pair]["close_trade_dates"].append(end_date_string)
                        existing_stocks[stock_pair]["trade_day_spread_position"] = "unknown"
                        # Remove stocks from being traded
                        currently_trading_stocks.remove(stock_pair[0])
                        currently_trading_stocks.remove(stock_pair[1])
                    else:
                        # Check if stocks are in current monitoring period
                        if len(existing_stocks[stock_pair]["cointegration_start_dates"]) > len(existing_stocks[stock_pair]["cointegration_end_dates"]):
                            # Add end date to monitoring perid
                            existing_stocks[stock_pair]["cointegration_end_dates"].append(end_date_string)
                    # Stocks not liquid enough to trade        
                    existing_stocks[stock_pair]["should_monitor"] = False
                    existing_stocks[stock_pair]["trade_status_message"] = "Nulls in one or more stock price data"
                else:
                    stock_pair_is_coint = is_still_conintegrated(forward_three_months_data, stock_pair, 0.05)
                    if stock_pair[0] in currently_trading_stocks and stock_pair[1] in currently_trading_stocks:
                        if stock_pair_is_coint:
                            existing_stocks[stock_pair]["should_monitor"] = True
                            existing_stocks[stock_pair]["is_cointegrated"] = True
                            existing_stocks[stock_pair]["trade_status_message"] = "Currently in trade"
                        else:
                            # Close trades
                            existing_stocks[stock_pair]["cointegration_end_dates"].append(end_date_string)
                            existing_stocks[stock_pair]["close_trade_dates"].append(end_date_string)
                            existing_stocks[stock_pair]["trade_day_spread_position"] = "unknown"
                            # Remove stocks from being traded
                            currently_trading_stocks.remove(stock_pair[0])
                            currently_trading_stocks.remove(stock_pair[1])
                            existing_stocks[stock_pair]["should_monitor"] = False
                            existing_stocks[stock_pair]["is_cointegrated"] = False
                            existing_stocks[stock_pair]["trade_status_message"] = "Stocks not currently cointegrated"
                    else:
                        if stock_pair_is_coint:
                            if len(existing_stocks[stock_pair]["cointegration_start_dates"]) > len(existing_stocks[stock_pair]["cointegration_end_dates"]):
                                existing_stocks[stock_pair]["should_monitor"] = True
                            else:
                                existing_stocks[stock_pair]["cointegration_start_dates"].append(end_date_string)
                                existing_stocks[stock_pair]["should_monitor"] = True
                                
                            existing_stocks[stock_pair]["trade_status_message"] = "Looking for trading days"
                        else:
                            if len(existing_stocks[stock_pair]["cointegration_start_dates"]) > len(existing_stocks[stock_pair]["cointegration_end_dates"]):
                                existing_stocks[stock_pair]["cointegration_end_dates"].append(end_date_string)
                            
                            existing_stocks[stock_pair]["should_monitor"] = False
                            existing_stocks[stock_pair]["trade_status_message"] = "Stocks not currently cointegrated"
            else:
                # Check if both stocks in pair are currently being traded
                if stock_pair[0] in currently_trading_stocks and stock_pair[1] in currently_trading_stocks:
                    # Close trades
                    existing_stocks[stock_pair]["cointegration_end_dates"].append(end_date_string)
                    existing_stocks[stock_pair]["close_trade_dates"].append(end_date_string)
                    existing_stocks[stock_pair]["trade_day_spread_position"] = "unknown"
                    # Remove stocks from being traded
                    currently_trading_stocks.remove(stock_pair[0])
                    currently_trading_stocks.remove(stock_pair[1])
                else:
                    # Check if stocks are in current monitoring period
                    if len(existing_stocks[stock_pair]["cointegration_start_dates"]) > len(existing_stocks[stock_pair]["cointegration_end_dates"]):
                        # Add end date to monitoring perid
                        existing_stocks[stock_pair]["cointegration_end_dates"].append(end_date_string)
                        
                # Stocks not liquid enough to trade        
                existing_stocks[stock_pair]["should_monitor"] = False
                existing_stocks[stock_pair]["trade_status_message"] = "Stocks not liquid enough to trade"
                
        # Find new pairs for week
        
        diff_df = filter_diff_df(prices_with_market, (start_date_index+7) , (end_date_index), liquid_stocks_not_null)
        # print("------------Diff df----------")
        # print(diff_df.isnull().sum())
        three_month_highly_liquid_stocks_price_df = filter_df_by_dates(prices_with_market, (start_date_index+6) , end_date_index, liquid_stocks_not_null)
        # print("-------diff df ------")
        # print(diff_df)
        beta_df = calculate_betas(diff_df, (start_date_index+6) , end_date_index, liquid_stocks_not_null)
        # print("-----------Beta df ------------")
        # print(beta_df.isnull().sum())

        merged_df = diff_df.merge(beta_df, how = 'cross')
        res_df = calculate_residuals(merged_df, liquid_stocks_not_null)
        cluster_dict = cluster_the_matrix(res_df, 5)
        new_stock_pairs = find_cointegrated_pairs(three_month_highly_liquid_stocks_price_df, cluster_dict, 0.05)
        
        for stock_pair in new_stock_pairs:
            if stock_pair not in existing_stocks:
                new_stock_pair_dict = {
                                        "is_cointegrated" : True,
                                        "should_monitor" : True,
                                        "trade_status_message" : "Open to trade",
                                        "cointegration_start_dates" : [],
                                        "cointegration_end_dates" : [],
                                        "open_trade_dates" : [],
                                        "close_trade_dates" : [],
                                        "trade_day_spread_position": "unknown"
                                    }
                if stock_pair[0] in currently_trading_stocks or stock_pair[1] in currently_trading_stocks:
                    new_stock_pair_dict["should_monitor"] = False
                    new_stock_pair_dict["trade_status_message"] = "One or both stocks are in another current trade"
                
                new_stock_pair_dict["cointegration_start_dates"].append(end_date_string)
                existing_stocks[stock_pair] = new_stock_pair_dict
        # Increment weeks_running
        weeks_running +=1
        
    return existing_stocks
        

In [None]:
# FIXME!!
def print_overall_display(pairs_dictionary):
    data = pairs_dictionary

    # Function to convert date strings to datetime objects
    def str_to_date(date_str):
        return datetime.strptime(date_str, '%Y-%m-%d')

    # Determine the number of plots needed
    num_keys = len(data)
    num_plots = (num_keys + 4) // 5  # Calculate the number of plots needed, rounding up

    # Iterate over the required number of plots
    for plot_index in range(num_plots):
        fig, ax = plt.subplots(figsize=(10, 5))

        start_index = plot_index * 5
        end_index = min(start_index + 5, num_keys)

        # Iterate over the data to plot
        for idx, (key, value) in enumerate(list(data.items())[start_index:end_index]):
            actual_idx = start_index + idx
            label = f"{key[0]}-{key[1]}"
        
            # Extract periods and events
            periods_start = [str_to_date(date) for date in value[2]]
            periods_end = [str_to_date(date) for date in value[3]]
            event_lists = value[3]
        
            # Plot each period as a line
            for start, end in zip(periods_start, periods_end):
                ax.plot([start, end], [idx, idx], color='black')
        
            # Plot each event as a dot
            for events in event_lists:
                event_dates = [str_to_date(date) for date in events]
                ax.plot(event_dates, [idx] * len(event_dates), 'o', color='red')

            # Add the label for the pair on the Y-axis
            #ax.text(periods_start[0], idx, label, verticalalignment='center', fontsize=12, horizontalalignment='right')

        # Formatting the plot
        ax.set_yticks(range(5))
        ax.set_yticklabels([f"{key[1]}-{key[2]}" for key, _ in list(data.items())[start_index:end_index]], fontsize=12, horizontalalignment='right')
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
        plt.xticks(rotation=45) 
        plt.xlabel('Date')
        plt.title(f'Event Timelines (Plot {plot_index + 1})')
        plt.tight_layout()
        plt.show()


# Running Functions

In [None]:
coint_dict = {
    ("INTC", "ORCL") : {
        "still_consecutive" : True,
        "should_monitor" : True,
        "Can_Trade" : True,
        "trade_status_message" : "Open to trade",
        "cointegration_start_dates" : [],
        "cointegration_end_dates" : [],
        "open_trade_dates" : [],
        "close_trade_dates" : []
    }
}

In [85]:
# results = monitor_group_of_pairs(coint_dict, 5)
import pprint
results = monitor_group_of_pairs(20, 5)
pprint.pprint(results)

---------------------------- WEEKS RUNNING 1 -------------------------------------
90
Before 88
inside calculate betas
After 88
---------------------------- WEEKS RUNNING 2 -------------------------------------
90
('SCMR', 'VIAV'): Open to trade
('ORCL', 'INTC'): Open to trade
('ORCL', 'AMAT'): Open to trade
('CIEN', 'SDLI'): Open to trade
('CIEN', 'SEBL'): Open to trade
('SDLI', 'SEBL'): Open to trade
('PMCS', 'BRCD'): Open to trade
Before 88
inside calculate betas
After 88
---------------------------- WEEKS RUNNING 3 -------------------------------------
90
('SCMR', 'VIAV'): Looking for trading days
('ORCL', 'INTC'): Looking for trading days
('ORCL', 'AMAT'): Stocks not currently cointegrated
('CIEN', 'SDLI'): Stocks not currently cointegrated
('CIEN', 'SEBL'): Stocks not liquid enough to trade
('SDLI', 'SEBL'): Stocks not liquid enough to trade
('PMCS', 'BRCD'): Stocks not liquid enough to trade
('INTC', 'ORCL'): Open to trade
Before 88
inside calculate betas
After 88
--------------