<a href="https://colab.research.google.com/github/Vorlde/Pairs_trading-Reinforcement-Learning/blob/main/Pair_Selection_data_process.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
! pip install dill

import pandas as pd
import numpy as np
from datetime import datetime, timedelta


from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn import preprocessing

from statsmodels.tsa.stattools import coint

from scipy import stats

Collecting dill
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dill
Successfully installed dill-0.3.8


In [4]:
import lzma
import dill as pickle

def save_pickle(path,obj):
    with lzma.open(path,"wb") as fp:
        pickle.dump(obj,fp)

def load_pickle(path):
    with lzma.open(path,"rb") as fp:
        file = pickle.load(fp)
    return file


def clean_data(ticker_dfs,tickers):
    intraday_range = ticker_dfs[tickers[0]].index
    for inst in tickers:
        ticker_dfs[inst] = ticker_dfs[inst].reindex(intraday_range)
    closes = []

    for tk in tickers:
        close = ticker_dfs[tk].close
        closes.append(close)

    pricing = pd.concat(closes,axis = 1)
    pricing.columns = tickers

    return pricing




def get_pca_features(ret_df,N_PRIN_COMPONENTS =10):

    pca = PCA(n_components=N_PRIN_COMPONENTS)
    pca.fit(ret_df)

    # Extract factor loadings
    factor_loadings = pca.components_.T  # Transpose the components matrix

    # Create a DataFrame with the correct orientation
    factor_loadings_df = pd.DataFrame(factor_loadings, index=ret_df.columns, columns=[f'Factor {i+1}' for i in range(N_PRIN_COMPONENTS)])

    X = preprocessing.StandardScaler().fit_transform(pca.components_.T)

    return X

def create_clusters(X,index):
    clf = DBSCAN(eps=1, min_samples=3)
    print(clf)

    clf.fit(X)
    labels = clf.labels_
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    print("\nClusters discovered: %d" % n_clusters_)

    clustered = clf.labels_

    clustered_series = pd.Series(index=index, data=clustered.flatten())
    clustered_series = clustered_series[clustered_series != -1]

    return clustered_series


def find_cointegrated_pairs(data, significance=0.05):
    # This function is from https://www.quantopian.com/lectures/introduction-to-pairs-trading
    n = data.shape[1]
    score_matrix = np.zeros((n, n))
    pvalue_matrix = np.ones((n, n))
    keys = data.keys()
    pairs = []
    for i in range(n):
        for j in range(i+1, n):
            S1 = data[keys[i]]
            S2 = data[keys[j]]
            result = coint(S1, S2)
            score = result[0]
            pvalue = result[1]
            score_matrix[i, j] = score
            pvalue_matrix[i, j] = pvalue
            if pvalue < significance:
                pairs.append((keys[i], keys[j]))
    return score_matrix, pvalue_matrix, pairs



def get_coint_pairs(prices,clustered_series):

    CLUSTER_SIZE_LIMIT = 9999
    counts = clustered_series.value_counts()
    ticker_count_reduced = counts[(counts>1) & (counts<=CLUSTER_SIZE_LIMIT)]

    cluster_dict = {}
    for i, which_clust in enumerate(ticker_count_reduced.index):
        tickers = clustered_series[clustered_series == which_clust].index
        score_matrix, pvalue_matrix, pairs = find_cointegrated_pairs(
            prices[tickers]
        )
        cluster_dict[which_clust] = {}
        cluster_dict[which_clust]['score_matrix'] = score_matrix
        cluster_dict[which_clust]['pvalue_matrix'] = pvalue_matrix
        cluster_dict[which_clust]['pairs'] = pairs

    pairs = []
    for clust in cluster_dict.keys():
        pairs.extend(cluster_dict[clust]['pairs'])

    return pairs


def get_data():
    data_path = "/content/drive/My Drive/constituents.csv"
    dfs_path = "/content/drive/My Drive/new_dfs.obj"

    ticker_dfs = load_pickle(dfs_path)
    snp_data = pd.read_csv(data_path)
    tickers = []

    for i in range(499):
      tickers.append(snp_data.Symbol[i])

    tickers.remove("BF.B")
    tickers.remove("BRK.B")
    tickers.remove("CPAY")
    tickers.remove("DAY")
    tickers.remove("GEV")
    tickers.remove("SOLV")

    return tickers,ticker_dfs

In [5]:
def process_pair_and_store(pair, prices, h5_file_path):

    # print(f"processing {pair}")
    s1, s2 = pair
    # datax = prices[s1]
    # datay = prices[s2]

    # price1_train = datax[:600]
    # price2_train = datay[:600]

    # # Conduct OLS regression to find the hedge ratio
    # results = sm.OLS(price1_train, sm.add_constant(price2_train)).fit()
    # hedge_ratio = results.params[1]  # The coefficient (beta) for price2

    # # Create a DataFrame with the features you want to use
    # # For example, hedge_ratio could be one of the features
    # features_df = pd.DataFrame({
    #     'hedge_ratio': hedge_ratio,
    #     # Add more features here
    # })

    # # Store this DataFrame in the h5py file
    # with h5py.File(h5_file_path, 'a') as h5f:
    #     group = h5f.require_group('cointegrated_pairs')
    #     dataset_name = f"{s1}_{s2}"
    #     # If dataset already exists, delete it before creating a new one
    #     if dataset_name in group:
    #         del group[dataset_name]
    #     group.create_dataset(dataset_name, data=features_df.values)


    # import statsmodels.api as sm
    # from statsmodels.tsa.stattools import adfuller as ADF_test
    # import matplotlib.pyplot as plt

    # datax = prices[s1]
    # datay = prices[s2]

    # price1_train = datax[:550]
    # price2_train = datay[:550]
    # price1_test = datax[550:]
    # price2_test = datay[550:]

    # # Conduct OLS regression to find the hedge ratio
    # # Adding an intercept to model: price1 = beta * price2 + alpha
    # results = sm.OLS(price1_train, sm.add_constant(price2_train)).fit()
    # hedge_ratio = results.params[1]  # The coefficient (beta) for price2

    # # Calculate the spread using the hedge ratio for both training and test data
    # spread_train = price1_train - hedge_ratio * price2_train
    # spread_test = price1_test - hedge_ratio * price2_test
    # full_spread = pd.concat([spread_train, spread_test]).reset_index(drop=True)

    # # Perform the Augmented Dickey-Fuller (ADF) test on the training spread
    # adf_result = ADF_test(spread_train)

    # # Check if the training spread is stationary
    # if adf_result[1] < 0.5:  # ADF test returns a tuple where the second item is p-value
    #     print(ADF_test(spread_train)[1])
    #     print(ADF_test(spread_test)[1])

    #     # Calculate the mean and standard deviation of the full spread
    #     mu = full_spread.mean()
    #     sigma = full_spread.std()

    #     # Calculate the Z-scores of the full spread
    #     z_scores = (full_spread - mu) / sigma

    #     # Plotting the spread and Z-scores for full dataset
    #     plt.figure(figsize=(15, 7))
    #     plt.title(f'Z-Scores for {s1} and {s2} Across Training and Testing Periods')
    #     z_scores.plot()
    #     plt.axvline(x=600, color='purple', linestyle='--', label='Training/Test Split')
    #     plt.axhline(z_scores.mean(), color='black', label='Mean')  # Mean line
    #     plt.axhline(2, color='red', linestyle='--', label='+2 Std Dev')  # Upper threshold
    #     plt.axhline(-2, color='green', linestyle='--', label='-2 Std Dev')  # Lower threshold
    #     plt.legend()
    #     plt.show()



In [6]:
from google.colab import drive
drive.mount('/content/drive')

tickers,ticker_dfs = get_data()

Mounted at /content/drive


In [12]:
def generate_trading_data_blocks(df, intervals_per_day=78):
    """
    Generate blocks of data consisting of 9 trading days for each month.
    """
    j = 0
    for year in df.index.year.unique():
        for month in df[df.index.year == year].index.month.unique():
            monthly_data = df[(df.index.year == year) & (df.index.month == month)]
            trading_days = monthly_data.index.day.unique()

            # Determine the block indices
            block_start_indices = [i * intervals_per_day * 9 for i in range((len(trading_days) + 8) // 9)]
            for start_day in block_start_indices:
                # Calculate the block's end day, being careful not to exceed the available days
                end_day = min(start_day + 9, len(trading_days))
                block_days = trading_days[start_day:end_day]
                # Yield the data for these specific trading days
                block_data = monthly_data[monthly_data.index.day.isin(block_days)]
                yield j, year, month, block_data
                j+=1


def generate_trading_data_blocks(df):
    # Define the number of intervals per trading day
    intervals_per_day = 78
    j = 0

    for year in df.index.year.unique():
        yearly_data = df[df.index.year == year]
        for month in yearly_data.index.month.unique():
            monthly_data = yearly_data[yearly_data.index.month == month]

            # Print statement for where clustering code will go
            print(f"Clustering code for {year}-{month} goes here.")

            # Extract the unique days in the monthly data
            trading_days = monthly_data.index.day.unique()

            # Initialize the start index for the first block
            start_idx = 0
            end_idx = intervals_per_day * 9  # 9 days worth of intervals

            # Generate two blocks if possible
            for _ in range(2):
                # Ensure we have enough data for a full block
                if end_idx <= len(monthly_data):
                    block_data = monthly_data.iloc[start_idx:end_idx]
                    yield j, year, month, block_data
                    start_idx = end_idx
                    end_idx += intervals_per_day * 9
                    j+=1

def generate_trading_data_blocks(df, days_per_block=9, step_days=5):
    """
    Generate rolling blocks of data consisting of 9 trading days,
    advancing the window by 5 days after each block from the original dataframe.
    """
    # Extract unique trading days from the index
    unique_days = df.index.normalize().unique()

    for start_day_idx in range(0, len(unique_days) - days_per_block + 1, step_days):
        # Identify start and end day for the block
        start_day = unique_days[start_day_idx]
        end_day_idx = start_day_idx + days_per_block - 1  # inclusive end day index
        if end_day_idx >= len(unique_days):  # Ensure we don't go out of bounds
            break
        end_day = unique_days[end_day_idx]

        # Slice the original dataframe to get the block data
        block_data = df[start_day:end_day]
        yield block_data


def calculate_monthly_clusters(ret_df):
    """
    Calculate clusters for the given month.
    """
    X = get_pca_features(ret_df)
    clustered_series = create_clusters(X, ret_df.columns)

    return clustered_series


In [18]:
# Main driver function to integrate the generation of trading blocks, cluster calculation, and pair processing
def main_driver_function(prices, h5_file_path):

    for block_data in generate_trading_data_blocks(prices):
      if block_data.shape[0] ==624:
        print(block_data.shape[0])

        block_data = block_data.dropna(axis=1)

        input(block_data.shape)

        # ret_df = block_data.pct_change().round(4).fillna(0)
        # cluster = calculate_monthly_clusters(ret_df)

        # if j!=0:
        #   print(f"going for {year}, {month}")
        #   pairs = get_coint_pairs(block_data, cluster)
        #   process_pair_and_store(pairs, block_data, h5_file_path)


h5_file_path = 'path_to_h5_file.h5'  # Define the correct path
intraday_dfs = ticker_dfs.copy()
prices = clean_data(intraday_dfs,tickers)
main_driver_function(prices[-3000:], 'path_to_h5_file.h5')

624


KeyboardInterrupt: Interrupted by user