# Import Libraries, Load Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.statespace.sarimax import SARIMAX
import seaborn as sns
import yfinance as yf
from numpy import linalg as LA
from sklearn.cluster import KMeans
from statsmodels.tsa.stattools import coint
from itertools import combinations
import matplotlib.dates as mdates
from datetime import datetime

In [2]:
# Paths:
price_path = '/Users/tuckeringlefield/Desktop/FinanceData/price_data_from_shardar.csv'
cap_path = '/Users/tuckeringlefield/Desktop/FinanceData/cap_data_from_shardar.csv'
#price_path = "C:/Users/lukee/Downloads/price_data_from_shardar.csv"
#cap_path = "C:/Users/lukee/Downloads/cap_data_from_shardar.csv"

# Read both tables with the 'date' column as the index, then remove the
# troubled tickers from each so the two frames keep matching columns.
prices_df = (
    pd.read_csv(price_path, index_col='date')
    .drop(columns=["MGI", "MDLZ", "DWA", "ICE"])
)
caps_df = (
    pd.read_csv(cap_path, index_col='date')
    .drop(columns=["MGI", "MDLZ", "DWA", "ICE"])
)

In [5]:
# Convert dates to datetime
prices_df.index = pd.to_datetime(prices_df.index)

# Get the initial start and end date
start_date = prices_df.index[0]
end_date = prices_df.index[-1]

# Download additional data.
# yfinance treats `end` as exclusive, so ask for one extra calendar day;
# otherwise the last date in prices_df gets no SPY price, becomes NaN in the
# left merge below, and is then removed from df_diff by dropna().
spy_data = yf.download(
    'SPY',
    start=start_date,
    end=end_date + pd.Timedelta(days=1),
    interval='1d',
)
# Keep only the adjusted close and name the column after the ticker.
spy_data = spy_data[["Adj Close"]].rename(columns={"Adj Close": 'SPY'})

# Set up dataframes:
# Left merge keeps exactly the dates present in prices_df.
prices_with_market = prices_df.merge(spy_data, how='left', left_index=True, right_index=True)
# First differences of every column; rows containing any NaN are dropped.
df_diff = prices_with_market.diff().dropna()

[*********************100%%**********************]  1 of 1 completed


# Function Definitions

In [6]:
def find_top_stocks(dataframe, date_start, date_end, num_stocks):
    """Return the `num_stocks` column labels with the largest values on one row.

    The row used is the one at integer position `date_end + 1` of `dataframe`
    (looked up with `.iloc`). Missing values on that row are ignored.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table to rank, one column per stock (e.g. market caps).
    date_start : int
        Unused; kept so the signature matches the other window helpers.
    date_end : int
        Integer row position; the row immediately after it is ranked.
    num_stocks : int
        How many labels to return.

    Returns
    -------
    list
        Column labels ordered from largest to smallest value.

    Raises
    ------
    IndexError
        If `date_end + 1` is outside the rows of `dataframe`.
    """
    # Rank the frame that was passed in rather than a module-level global,
    # so the function honours its own `dataframe` argument.
    target_position = date_end + 1
    candidates = dataframe.iloc[target_position].dropna()
    return candidates.nlargest(num_stocks).index.tolist()

In [7]:
def filter_diff_df(dataframe, date_start, date_end, stocks_list):
    """Return first differences of the chosen stocks plus "SPY" over a row window.

    The columns in `stocks_list` followed by "SPY" are selected, the rows are
    sliced with `[date_start:date_end]`, consecutive rows are differenced, and
    every row containing a NaN (including the first differenced row) is dropped.
    `stocks_list` itself is not modified.
    """
    wanted_columns = [*stocks_list, "SPY"]
    window = dataframe[wanted_columns][date_start:date_end]
    return window.diff().dropna()

In [8]:
def filter_df_by_dates(dataframe, date_start, date_end, stocks_list):
    """Return the chosen stock columns plus "SPY", restricted to a row window.

    Columns are `stocks_list` followed by "SPY"; rows are sliced with
    `[date_start:date_end]`. `stocks_list` itself is not modified.
    """
    wanted_columns = [*stocks_list, "SPY"]
    return dataframe[wanted_columns][date_start:date_end]

In [9]:
def calculate_residuals(df, stocks_list):
    """Build a frame of per-stock residuals against the "SPY" column.

    For every name in `stocks_list` the residual column is
    `df[name] - df[name + "_beta"] * df["SPY"]`, so `df` must already hold a
    `<name>_beta` column for each requested stock.
    """
    return pd.DataFrame(
        {
            name: df[name] - df[name + "_beta"] * df["SPY"]
            for name in stocks_list
        }
    )

In [10]:
def cluster_the_matrix(df, num_clusters):
    """Group the columns of `df` into clusters with spectral clustering.

    The affinity matrix is the absolute value of the column correlation
    matrix. Its unnormalised graph Laplacian `L = D - A` is decomposed, the
    eigenvectors belonging to the `num_clusters` smallest eigenvalues are used
    as an embedding, and K-Means is run on that embedding.

    Parameters
    ----------
    df : pandas.DataFrame
        One column per stock; correlations are taken between columns.
    num_clusters : int
        Number of eigenvectors kept and number of K-Means clusters.

    Returns
    -------
    dict
        Maps each K-Means label to the list of column names assigned to it.
    """
    A = abs(df.corr().values)
    D = np.diag(A.sum(axis=1))
    L = D - A

    # L is real and symmetric, so use eigh: it returns real eigenvalues in
    # ascending order, which makes the first `num_clusters` columns the
    # eigenvectors of the smallest eigenvalues. The general `eig` routine
    # gives no ordering guarantee and may return complex-typed output.
    _, eigenvectors = LA.eigh(L)
    X = eigenvectors[:, :num_clusters]

    kmeans = KMeans(n_clusters=num_clusters, random_state=2, n_init=20).fit(X)

    # Collect column names under the label K-Means gave them.
    cluster_dict = {}
    for cluster_number, stock_name in zip(kmeans.labels_, df.columns):
        cluster_dict.setdefault(cluster_number, []).append(stock_name)

    # Cluster diagram:
    # fig, ax = plt.subplots(1, 1, figsize=(8, 4))
    # scatter = ax.scatter(X[:, 0], X[:, 1], c=kmeans.labels_)
    # unique_labels = {label: idx for idx, label in enumerate(set(kmeans.labels_))}
    # handles = [plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=scatter.cmap(scatter.norm(value)), markersize=10)
    #        for value in unique_labels.values()]
    # labels = unique_labels.keys()
    # ax.legend(handles, labels, title="Clusters", loc="best", bbox_to_anchor=(1, 1))
    # ax.set_title(f'K-Means Clustering Results with K={num_clusters}')
    # plt.show()

    return cluster_dict


In [11]:
def find_cointegrated_pairs(dataframe, cluster_dict, sig_level):
    """List the within-cluster pairs that pass the cointegration test both ways.

    Every 2-combination of the stocks inside each cluster is tested with
    `coint` in both argument orders; the pair is kept only when both p-values
    are below `sig_level`. Pairs are returned as `(stock1, stock2)` tuples in
    the order they are generated.
    """
    def _passes_both_directions(first, second):
        # Both p-values are computed before either one is compared.
        forward_p = coint(dataframe[first], dataframe[second])[1]
        reverse_p = coint(dataframe[second], dataframe[first])[1]
        return forward_p < sig_level and reverse_p < sig_level

    return [
        (first, second)
        for members in cluster_dict.values()
        for first, second in combinations(members, 2)
        if _passes_both_directions(first, second)
    ]

In [12]:
def is_still_conintegrated(dataframe, pair, sig_level):
    """Report whether an existing pair still passes the cointegration test.

    `coint` is run on the two columns named by `pair[0]` and `pair[1]` in both
    argument orders. Returns True only when both p-values are below
    `sig_level`, otherwise False.
    """
    first, second = pair[0], pair[1]
    p_values = (
        coint(dataframe[first], dataframe[second])[1],
        coint(dataframe[second], dataframe[first])[1],
    )
    return all(p_value < sig_level for p_value in p_values)

In [13]:
def calculate_beta_for_pair(dataframe, pair):
    """Return the beta of `pair[0]` with respect to `pair[1]`.

    Beta is the covariance of the two columns divided by the variance of the
    `pair[1]` column, both computed over all rows of `dataframe`.
    """
    numerator_asset, denominator_asset = pair[0], pair[1]
    pair_prices = dataframe[[numerator_asset, denominator_asset]]

    covariance = pair_prices.cov().iloc[0, 1]
    variance = pair_prices[denominator_asset].var()
    return covariance / variance

In [14]:
def get_spread_limits_for_past_months(dataframe, pair, beta):
    """Compute the mean and +/- 2 standard deviation band of a pair's spread.

    The asset whose column has the larger mean leads the spread; when the
    first asset's mean is not strictly larger, the second asset leads. The
    spread is `leader - beta * other`.

    Returns
    -------
    tuple
        `(upper_limit, lower_limit, order, mean)` where `order` is the list
        `[leader, other]` and the limits are `mean +/- 2 * std` of the spread.
    """
    first, second = pair[0], pair[1]
    first_mean = dataframe[first].mean()
    second_mean = dataframe[second].mean()

    order = [first, second] if first_mean > second_mean else [second, first]
    leader, other = order

    spread = dataframe[leader] - beta * dataframe[other]
    centre = spread.mean()
    band = 2 * spread.std()

    return centre + band, centre - band, order, centre

In [15]:
def print_spread_charts(data_series, upper_bound, lower_bound, mean_value, pair_name):
    """Show a scatter of spread values with the mean and the two limit lines.

    `data_series` supplies the x positions (its index) and y values; the mean
    is drawn as a solid blue line and the upper/lower bounds as dashed red
    lines. The figure is displayed with `plt.show()`; nothing is returned.
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    # Horizontal reference lines: (level, colour, line style, legend label).
    reference_lines = [
        (mean_value, 'blue', '-', 'Mean for Past 3 months'),
        (upper_bound, 'red', '--', '+2 Std Dev'),
        (lower_bound, 'red', '--', '-2 Std Dev'),
    ]
    for level, colour, line_style, legend_text in reference_lines:
        ax.axhline(y=level, color=colour, linestyle=line_style, linewidth=1, label=legend_text)

    # Individual observations drawn above the reference lines.
    ax.scatter(data_series.index, data_series.values, color='black', zorder=5)

    ax.set_title(f'Spread data for: {pair_name} for monitoring week')
    ax.set_xlabel('Date')
    ax.set_ylabel('Value')
    ax.legend()
    ax.grid(True)

    plt.show()

In [50]:
def monitor_pair_for_week(pair, pair_dict, start_date_index, end_date_index, curr_week_start_index, currently_trading_stocks):
    """Update one pair's monitoring record for the current week.

    Reads the module-level `prices_with_market` frame. All three index
    arguments are integer row positions into that frame.

    Parameters
    ----------
    pair : tuple
        `(stock1, stock2)` column names.
    pair_dict : dict
        The pair's state record; it is mutated in place and returned.
    start_date_index, end_date_index, curr_week_start_index : int
        Row positions delimiting the look-back window and the current week.
    currently_trading_stocks : list
        Not used yet; accepted so the caller's signature stays stable.

    Returns
    -------
    dict
        Always `pair_dict`, so the caller can safely store the result back.
        If either stock has a null in the rows from `start_date_index + 7` up
        to `end_date_index`, the record is marked as not tradable first.
    """
    end_date_string = prices_with_market.index[end_date_index].strftime('%Y-%m-%d')

    # Row windows (positional slices) used for the null check and the test.
    forward_three_months_data = prices_with_market[start_date_index + 7 : end_date_index]
    past_three_month_data = prices_with_market[start_date_index : curr_week_start_index]

    # Any missing price for either leg stops monitoring of this pair.
    has_nulls = forward_three_months_data[[pair[0], pair[1]]].isnull().any().any()
    if has_nulls:
        pair_dict['End_dates'].append(end_date_string)
        pair_dict['Still_consecutive'] = False
        pair_dict['Should_monitor'] = False
        pair_dict['Status_message'] = "Null Values: Can't Trade"
        return pair_dict

    # Check if they are still cointegrated
    coint_at_start_week = is_still_conintegrated(past_three_month_data, pair, 0.05)
    # TODO: the handling for the cointegrated and no-longer-cointegrated cases
    # has not been written yet; both branches leave the record unchanged.
    if coint_at_start_week:
        pass
    else:
        pass

    # Return the record on this path too: the caller assigns the result back
    # into its dictionary, so returning None would wipe the pair's state.
    return pair_dict

In [45]:
def monitor_group_of_pairs(coint_dict, stop_after_weeks):
    """Run the weekly monitoring step for every active pair.

    For each of `stop_after_weeks` iterations a banner is printed, the row
    positions of the window are derived from the week number, and
    `monitor_pair_for_week` is called for every pair whose
    `'Still_consecutive'` flag is true.

    Parameters
    ----------
    coint_dict : dict
        Maps a pair tuple to its state record; updated in place.
    stop_after_weeks : int
        Number of weekly iterations to run.

    Returns
    -------
    dict
        The same `coint_dict` object, after all updates.
    """
    # Row offsets used to position each weekly window.
    # NOTE(review): presumably 7 rows per step and a 97-row window with the
    # current week starting at row 89 of it -- confirm against the data.
    step = 7
    window_length = 97
    current_week_offset = 89

    # List of currently trading stocks, shared across all pairs and weeks.
    currently_trading_stocks = []

    for weeks_running in range(stop_after_weeks):
        print(f"---------------------------- WEEKS RUNNING {weeks_running + 1} -------------------------------------")
        start_date_index = (weeks_running + 1) * step
        end_date_index = start_date_index + window_length
        curr_week_start_index = start_date_index + current_week_offset

        for key in coint_dict:
            if not coint_dict[key]['Still_consecutive']:
                continue
            updated_vals = monitor_pair_for_week(
                key,
                coint_dict[key],
                start_date_index,
                end_date_index,
                curr_week_start_index,
                currently_trading_stocks,
            )
            # Never replace a pair's record with None: that would break the
            # 'Still_consecutive' lookup on the next iteration.
            if updated_vals is not None:
                coint_dict[key] = updated_vals

    # Callers assign the result, so hand the updated dictionary back.
    return coint_dict
        

In [18]:
def print_overall_display(pairs_dictionary):
    """Draw timeline plots of monitoring periods and trade days, five pairs per figure.

    Each entry of `pairs_dictionary` maps a `(stock1, stock2)` tuple to a
    record with the keys `"Start_dates"`, `"End_dates"`, `"Open_trade_days"`
    and `"Close_trade_days"`. Each start/end pair is drawn as a black
    horizontal line and each trade day as a red dot on the pair's row.

    NOTE(review): the trade-day lists are assumed to be flat lists of
    'YYYY-MM-DD' strings, like the date lists -- confirm once the code that
    fills them exists.

    Parameters
    ----------
    pairs_dictionary : dict
        Pair tuple -> state record as described above.

    Returns
    -------
    None
        Figures are shown with `plt.show()`.
    """
    pairs_per_plot = 5

    # Function to convert date strings to datetime objects
    def str_to_date(date_str):
        return datetime.strptime(date_str, '%Y-%m-%d')

    items = list(pairs_dictionary.items())

    # One figure per group of up to `pairs_per_plot` pairs.
    for plot_index, start_index in enumerate(range(0, len(items), pairs_per_plot)):
        page = items[start_index:start_index + pairs_per_plot]
        fig, ax = plt.subplots(figsize=(10, 5))

        labels = []
        for row, (key, value) in enumerate(page):
            labels.append(f"{key[0]}-{key[1]}")

            # Records are dicts keyed by field name, not positional sequences.
            periods_start = [str_to_date(date) for date in value.get("Start_dates", [])]
            periods_end = [str_to_date(date) for date in value.get("End_dates", [])]

            # Plot each period as a line
            for start, end in zip(periods_start, periods_end):
                ax.plot([start, end], [row, row], color='black')

            # Plot each trade day as a dot
            for field in ("Open_trade_days", "Close_trade_days"):
                event_dates = [str_to_date(date) for date in value.get(field, [])]
                ax.plot(event_dates, [row] * len(event_dates), 'o', color='red')

        # Formatting the plot: one tick per pair actually drawn, so the tick
        # and label counts always match, including on a short last figure.
        ax.set_yticks(range(len(page)))
        ax.set_yticklabels(labels, fontsize=12, horizontalalignment='right')
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
        plt.xticks(rotation=45)
        plt.xlabel('Date')
        plt.title(f'Event Timelines (Plot {plot_index + 1})')
        plt.tight_layout()
        plt.show()


# Running Functions

In [55]:
# Per-pair monitoring state, keyed by the (stock1, stock2) ticker tuple.
# monitor_group_of_pairs reads 'Still_consecutive' to decide whether a pair is
# still monitored; monitor_pair_for_week sets 'Still_consecutive',
# 'Should_monitor' and 'Status_message' and appends to 'End_dates' when it
# finds null prices. The remaining fields are not read or written by the
# monitoring code visible in this notebook yet.
coint_dict = {
    ("INTC", "ORCL") : {
        "Still_consecutive" : True,
        "Should_monitor" : True,
        "Can_Trade" : True,
        "Status_message" : "Open to trade",
        "Start_dates" : [],
        "End_dates" : [],
        "Open_trade_days" : [],
        "Close_trade_days" : []
    }
}

In [56]:
# Run the weekly monitoring loop for 5 weeks over every pair in coint_dict.
results = monitor_group_of_pairs(coint_dict, 5)

---------------------------- WEEKS RUNNING 1 -------------------------------------
{'Still_consecutive': True, 'Should_monitor': True, 'Can_Trade': True, 'Status_message': 'Open to trade', 'Start_dates': [], 'End_dates': [], 'Open_trade_days': [], 'Close_trade_days': []}
---------------------------- WEEKS RUNNING 2 -------------------------------------
{'Still_consecutive': True, 'Should_monitor': True, 'Can_Trade': True, 'Status_message': 'Open to trade', 'Start_dates': [], 'End_dates': [], 'Open_trade_days': [], 'Close_trade_days': []}
---------------------------- WEEKS RUNNING 3 -------------------------------------
{'Still_consecutive': True, 'Should_monitor': True, 'Can_Trade': True, 'Status_message': 'Open to trade', 'Start_dates': [], 'End_dates': [], 'Open_trade_days': [], 'Close_trade_days': []}
---------------------------- WEEKS RUNNING 4 -------------------------------------
{'Still_consecutive': True, 'Should_monitor': True, 'Can_Trade': True, 'Status_message': 'Open to tra