In [1]:
import pandas as pd 
import polars as pl
import numpy as np
import networkx as nx
import community.community_louvain as community_louvain
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from itertools import combinations


# Load the combined stock data CSV into a pandas DataFrame.
# NOTE(review): this is an absolute, machine-specific path, so the notebook
# cannot be re-run elsewhere as-is; consider a configurable data directory.
combined_df = pd.read_csv('/Users/othmaneio/Documents/financial_big_data/combined_stock_data.csv')

  from pandas.core import (


In [2]:
def calculate_market_data(df: pl.DataFrame) -> pl.DataFrame:
    """
    Collapse per-stock rows into one row of market-wide features per minute.

    Parameters:
    - df: Polars DataFrame with the columns 'minute', 'weighted_trade_price',
      'return', 'total_trade_volume', 'weighted_ask_price' and
      'weighted_bid_price'.

    Returns:
    - Polars DataFrame sorted by 'minute' with the columns price_volatility,
      avg_return, return_volatility, avg_trade_volume, volume_volatility,
      avg_spread and advancing_stocks. Null aggregates are replaced with 0.
    """
    # Base expressions shared by several aggregates.
    ret = pl.col('return')
    volume = pl.col('total_trade_volume')
    spread = pl.col('weighted_ask_price') - pl.col('weighted_bid_price')

    per_minute_features = [
        # Cross-sectional standard deviation of weighted trade prices.
        pl.col('weighted_trade_price').std().alias('price_volatility'),
        # Volume-weighted mean return.
        # NOTE(review): a minute whose total volume is zero would divide by
        # zero here, and fill_null does not replace NaN — confirm whether
        # such minutes can occur.
        ((ret * volume).sum() / volume.sum()).alias('avg_return'),
        # Unweighted standard deviation of returns.
        ret.std().alias('return_volatility'),
        # Trade volume statistics.
        volume.mean().alias('avg_trade_volume'),
        volume.std().alias('volume_volatility'),
        # Mean bid-ask spread.
        spread.mean().alias('avg_spread'),
        # Fraction of rows with a strictly positive return.
        (ret > 0).mean().alias('advancing_stocks'),
    ]

    aggregated = df.group_by('minute').agg(per_minute_features)

    # Replace nulls with 0, then order the rows by minute.
    return aggregated.fill_null(0).sort('minute')

In [3]:
# Prepare market features.
# The Polars copy gets its own name instead of rebinding `combined_df`, so that
# `combined_df` stays a pandas DataFrame for the cells that call pandas methods
# on it, and this cell can be re-run safely.
combined_pl = pl.from_pandas(combined_df)
market_data = calculate_market_data(combined_pl)



In [4]:
# Display the per-minute market features (the rendering is truncated for long frames).
market_data

minute,price_volatility,avg_return,return_volatility,avg_trade_volume,volume_volatility,avg_spread,advancing_stocks
str,f64,f64,f64,f64,f64,f64,f64
"""2010-05-03 09:30:00-04:00""",44.178432,0.004845,0.005309,397540.447619,2.9211e6,0.180964,0.627907
"""2010-05-03 09:31:00-04:00""",42.087616,0.00036,0.002905,147834.982906,894941.945331,0.116104,0.6
"""2010-05-03 09:32:00-04:00""",44.210426,-0.002906,0.00259,271919.94958,2.4171e6,0.077482,0.529915
"""2010-05-03 09:33:00-04:00""",43.771961,-0.004586,0.002798,210191.362903,1.9084e6,0.088461,0.596774
"""2010-05-03 09:34:00-04:00""",44.20407,-0.000179,0.001869,315054.5,3.0612e6,0.077935,0.5
…,…,…,…,…,…,…,…
"""2010-05-28 15:56:00-04:00""",30.515974,-0.002326,0.000879,66782.308943,194534.002545,0.02457,0.00813
"""2010-05-28 15:57:00-04:00""",36.84272,-0.001509,0.000694,123828.325397,513753.591724,0.031873,0.007937
"""2010-05-28 15:58:00-04:00""",30.250318,-0.001353,0.000703,90634.046875,299053.569279,0.022169,0.03125
"""2010-05-28 15:59:00-04:00""",30.312197,-0.000508,0.000916,216019.84252,1.2109e6,0.029367,0.251969


In [5]:

def create_sparse_similarity_lazyframe(feature_df: pl.DataFrame, threshold: float = 0.5):
    """
    Build a sparse list of row pairs whose cosine similarity exceeds a threshold.

    The clustering features are standardised column-wise with StandardScaler and
    the cosine similarity is evaluated for every pair of rows. Only pairs (i, j)
    with i < j and similarity strictly greater than `threshold` are kept, in
    ascending (i, j) order.

    Parameters:
    - feature_df: Polars DataFrame containing the clustering feature columns
      listed in `clustering_features` below.
    - threshold: Exclusive lower bound on the similarity of a retained pair.

    Returns:
    - LazyFrame with columns [row, row_other, similarity], where `row` and
      `row_other` are positional row indices into `feature_df`.
    """
    clustering_features = [
        'price_volatility',
        'return_volatility',
        'avg_return',
        'avg_trade_volume',
        'volume_volatility',
        'advancing_stocks',
        'avg_spread'
    ]
    features = feature_df.select(clustering_features).to_numpy()

    # Scale features
    features_scaled = StandardScaler().fit_transform(features)
    n_rows = features_scaled.shape[0]

    # Each row norm is computed once rather than once per pair.
    norms = np.linalg.norm(features_scaled, axis=1)
    positions = np.arange(n_rows)

    # Process the similarity matrix in row slabs so that only a
    # (block_size x n_rows) slice is held in memory at any time, instead of
    # materialising every index pair as a Python tuple.
    block_size = 512
    row_parts = [np.empty(0, dtype=np.int64)]
    col_parts = [np.empty(0, dtype=np.int64)]
    sim_parts = [np.empty(0, dtype=np.float64)]
    for start in range(0, n_rows, block_size):
        stop = min(start + block_size, n_rows)
        # A row whose scaled features are all zero has norm 0; 0/0 yields NaN,
        # which never satisfies the `>` comparison, so such rows (and rows
        # containing NaN features) are left out of the result.
        with np.errstate(divide='ignore', invalid='ignore'):
            sims = (features_scaled[start:stop] @ features_scaled.T) / np.outer(
                norms[start:stop], norms
            )
            above = sims > threshold
        # Keep only the strict upper triangle (row < row_other) so that each
        # unordered pair appears once and self-pairs are excluded.
        upper = positions[None, :] > positions[start:stop, None]
        block_rows, block_cols = np.nonzero(above & upper)
        row_parts.append((block_rows + start).astype(np.int64))
        col_parts.append(block_cols.astype(np.int64))
        sim_parts.append(sims[block_rows, block_cols])

    # Building from a column dict gives explicit, typed columns even when no
    # pair passes the threshold.
    sparse_df = pl.DataFrame(
        {
            "row": np.concatenate(row_parts),
            "row_other": np.concatenate(col_parts),
            "similarity": np.concatenate(sim_parts),
        }
    ).lazy()
    return sparse_df

In [None]:
# Build the sparse pairwise-similarity table from the per-minute market features;
# only pairs with similarity above 0.5 are kept.
slf = create_sparse_similarity_lazyframe(market_data, threshold=0.5)

In [12]:
import polars as pl
import networkx as nx
import community.community_louvain as community_louvain

def identify_market_states(similarity_lazyframe: pl.LazyFrame, random_state=42):
    """
    Perform Louvain clustering using a precomputed sparse similarity matrix.

    Parameters:
    - similarity_lazyframe: Polars LazyFrame with columns [row, row_other, similarity].
      Each record becomes one weighted, undirected edge.
    - random_state: Seed forwarded to the Louvain algorithm so that repeated runs
      produce the same partition. Pass None to leave the algorithm unseeded.

    Returns:
    - (G, clusters_df): the networkx graph built from the similarity edges and a
      Polars DataFrame with columns [node, cluster]. Only nodes that appear in at
      least one edge are part of the graph and therefore of the result.
    """
    # Collect the LazyFrame into a DataFrame for processing
    similarity_df = similarity_lazyframe.collect()

    # Create a graph from the sparse similarity matrix. Selecting the columns
    # explicitly makes the (u, v, weight) tuple order independent of the
    # column order of the incoming frame.
    G = nx.Graph()
    G.add_weighted_edges_from(
        similarity_df.select(["row", "row_other", "similarity"]).iter_rows(),
        weight='weight',
    )

    print('Graph created')
    # Perform Louvain clustering; the algorithm is randomised, so a seed is
    # passed to make the cluster assignments reproducible.
    partition = community_louvain.best_partition(
        G, weight='weight', random_state=random_state
    )

    # Convert the partition dictionary to a Polars DataFrame
    clusters_df = pl.DataFrame(
        {"node": list(partition.keys()), "cluster": list(partition.values())}
    )

    return G, clusters_df

: 

In [13]:
# Cluster the similarity graph; the call returns a (graph, clusters DataFrame) tuple.
market_states = identify_market_states(slf)

Graph created


In [27]:
# Unpack the (graph, clusters) tuple returned by identify_market_states.
# The isinstance guard keeps this cell safe to re-run: after the first run
# `market_states` already holds the clusters DataFrame and must not be
# indexed a second time.
if isinstance(market_states, tuple):
    G, market_states = market_states

In [31]:
# Attach each minute's cluster label to its market features.
# Cluster assignments are keyed by "node"; the positional index of the
# feature frame is exposed under the same name so the two can be joined.
market_states_pd = market_states.to_pandas()
market_features_pd = market_data.to_pandas().reset_index(names="node")

# Inner join: rows without a cluster assignment are not carried over.
linked_data = market_features_pd.merge(market_states_pd, how="inner", on="node")

In [39]:
# Preview the first rows of the market features joined with their cluster labels.
linked_data.head()

Unnamed: 0,node,minute,price_volatility,avg_return,return_volatility,avg_trade_volume,volume_volatility,avg_spread,advancing_stocks,cluster
0,0,2010-05-03 09:30:00-04:00,44.178432,0.004845,0.005309,397540.447619,2921071.0,0.180964,0.627907,1
1,1,2010-05-03 09:31:00-04:00,42.087616,0.00036,0.002905,147834.982906,894941.9,0.116104,0.6,3
2,2,2010-05-03 09:32:00-04:00,44.210426,-0.002906,0.00259,271919.94958,2417100.0,0.077482,0.529915,1
3,3,2010-05-03 09:33:00-04:00,43.771961,-0.004586,0.002798,210191.362903,1908430.0,0.088461,0.596774,1
4,4,2010-05-03 09:34:00-04:00,44.20407,-0.000179,0.001869,315054.5,3061200.0,0.077935,0.5,1


In [None]:
# AGGREGATE FEATURES PER STATE
# `combined_df` may have been rebound to a Polars DataFrame by an earlier cell;
# the pandas-only calls below (`astype`, `pd.merge`) need a pandas frame, so
# convert it back when necessary. The check also keeps the cell re-runnable.
if isinstance(combined_df, pl.DataFrame):
    combined_df = combined_df.to_pandas()
combined_df = combined_df.astype({'minute': 'str', 'stock': 'str'})

# Attach the cluster label of each minute to every per-stock row of that minute.
cdf = pd.merge(combined_df, linked_data[['minute', 'cluster']], on='minute')
cdf
