In [None]:
from itertools import combinations
from typing import Dict, Union, Tuple, Optional

import networkx as nx
import numpy as np
import pandas as pd




# Example usage:
# if __name__ == "__main__":
#     # Example of how to use the function
#     # Assuming you have a dictionary like this:
#     # stock_data = {
#     #     'AAPL': dataframe_with_multiindex_columns,
#     #     'GOOGL': dataframe_with_multiindex_columns,
#     #     'MSFT': dataframe_with_multiindex_columns
#     # }
    
#     # Create correlation network for last 30 days
#     # G = create_stock_correlation_network(stock_data, n_days=30, correlation_threshold=0.3)
    
#     # Analyze the network
#     # analysis = analyze_correlation_network(G)
#     # print(analysis)
    
#     # Visualize (requires matplotlib and networkx)
#     # import matplotlib.pyplot as plt
#     # pos = nx.spring_layout(G)
#     # edge_weights = [G[u][v]['correlation'] for u, v in G.edges()]
#     # nx.draw(G, pos, with_labels=True, edge_color=edge_weights, 
#     #         edge_cmap=plt.cm.RdYlBu, node_color='lightblue')
#     # plt.show()
    
#     pass

In [None]:
def log_change(k):
    """
    Return the period-over-period log change of ``k``.

    Parameters:
    -----------
    k : pandas.Series or pandas.DataFrame
        Values to transform (the rest of this notebook passes closing prices).
        Must be something ``np.log`` maps to an object with a ``.diff()`` method.

    Returns:
    --------
    pandas.Series or pandas.DataFrame
        ``log(k[t]) - log(k[t-1])`` for every row; the first row is NaN
        because it has no predecessor.
    """
    # Requires numpy to be imported as `np` in the notebook's import cell.
    return np.log(k).diff()
def pct_change(k):
    """
    Return the period-over-period fractional change of ``k``.

    Thin wrapper around the object's own ``pct_change()`` method, called
    with its default arguments.

    Parameters:
    -----------
    k : object exposing ``pct_change()``
        In this notebook, a pandas Series or DataFrame.

    Returns:
    --------
    Whatever ``k.pct_change()`` returns.
    """
    simple_returns = k.pct_change()
    return simple_returns

In [None]:
def _organise_price_data(stock_data_dict, n_days):
        # Extract close prices for all stocks
    price_data = {}
    
    for stock_symbol, df in stock_data_dict.items():
        try:
            # Get the 'Close' price column for this stock
            if ('Close', stock_symbol) in df.columns:
                close_prices = df[('Close', stock_symbol)]
            elif 'Close' in df.columns:
                # Handle case where there might be a single-level index
                close_prices = df['Close']
            else:
                print(f"Warning: No 'Close' price found for {stock_symbol}")
                continue
                
            # Get last n days of data
            recent_prices = close_prices.tail(n_days)
            
            # Only include if we have enough data points
            if len(recent_prices) >= min(n_days, 2):
                price_data[stock_symbol] = recent_prices
            else:
                print(f"Warning: Insufficient data for {stock_symbol} (only {len(recent_prices)} days)")
                
        except Exception as e:
            print(f"Error processing {stock_symbol}: {e}")
            continue
    
    if len(price_data) < 2:
        print("Error: Need at least 2 stocks with sufficient data")
        return nx.Graph()
    
    # Create DataFrame with all stock prices aligned by date
    combined_df = pd.DataFrame(price_data)
    
    # Remove any rows with NaN values
    return combined_df#.dropna()

In [8]:
def create_stock_correlation_network(stock_data_dict, n_days, correlation_threshold=0.0,output='graph',transformation='log'):
    """
    Calculate pairwise Pearson correlations between stocks over the last n days
    and return them as a NetworkX graph or as a correlation matrix.

    Parameters:
    -----------
    stock_data_dict : dict
        Dictionary where keys are stock symbols and values are DataFrames
        with MultiIndex columns ('Close', stock_symbol) for price data
    n_days : int
        Number of recent days to use for correlation calculation
    correlation_threshold : float, optional
        Minimum absolute correlation required to include an edge in the
        network; the sign of the threshold is ignored (default: 0.0)
    output : str, optional
        'graph' (default) returns a networkx.Graph; any other value returns
        the correlation matrix
    transformation : str, optional
        'log' (default) correlates log changes, 'percent' correlates
        percentage changes, any other value correlates the raw prices

    Returns:
    --------
    networkx.Graph, pandas.DataFrame or None
        With output='graph': graph whose nodes are stock symbols and whose
        edges carry the correlation in both the 'weight' and 'correlation'
        attributes; nodes carry 'mean_price', 'std_price' and 'data_points'.
        Otherwise: the correlation matrix.
        None when fewer than two rows of price data are available.
    """

    combined_df = _organise_price_data(stock_data_dict, n_days)
    if len(combined_df) < 2:
        print("Error: Insufficient overlapping data points after removing NaN values")
        return None

    if transformation == 'log':
        df = log_change(combined_df)
    elif transformation == 'percent':
        df = pct_change(combined_df)
    else:
        df = combined_df

    # Calculate correlation matrix
    correlation_matrix = df.corr()
    if output != 'graph':
        return correlation_matrix

    # Create NetworkX graph with one node per stock symbol
    G = nx.Graph()
    G.add_nodes_from(correlation_matrix.index)

    # Add edges with correlation weights
    min_abs_correlation = abs(correlation_threshold)
    for stock1, stock2 in combinations(correlation_matrix.index, 2):
        correlation = correlation_matrix.loc[stock1, stock2]

        # Only add edge if correlation meets threshold and is not NaN
        if not pd.isna(correlation) and abs(correlation) >= min_abs_correlation:
            G.add_edge(stock1, stock2, weight=correlation, correlation=correlation)

    # Add node attributes with some basic stats
    for stock in G.nodes():
        # Stats use the raw prices, not the transformed series. NaNs that
        # come from aligning stocks on a shared index are dropped so that
        # 'data_points' counts this stock's own observations.
        recent_prices = combined_df[stock].dropna()
        G.nodes[stock]['mean_price'] = recent_prices.mean()
        G.nodes[stock]['std_price'] = recent_prices.std()
        G.nodes[stock]['data_points'] = len(recent_prices)

    return G

In [None]:
def analyze_correlation_network(G, top_n=5):
    """
    Summarise the edge correlations stored in a correlation graph.

    Parameters:
    -----------
    G : networkx.Graph
        Graph whose edges each carry a 'correlation' attribute
    top_n : int
        How many of the strongest (by absolute value) correlations to report

    Returns:
    --------
    dict
        ``{"message": ...}`` when the graph has no edges; otherwise the keys
        'num_nodes', 'num_edges', 'avg_correlation', 'max_correlation',
        'min_correlation' and 'top_correlations' (a list of dicts with
        'stocks' and 'correlation').
    """

    # An edgeless graph has nothing to summarise
    if G.number_of_edges() == 0:
        return {"message": "No edges in the network"}

    edge_records = list(G.edges(data=True))
    corr_values = [attrs['correlation'] for _, _, attrs in edge_records]

    def _strength(edge):
        # Rank by magnitude so strong negative correlations count too
        return abs(edge[2]['correlation'])

    ranked_edges = sorted(edge_records, key=_strength, reverse=True)

    strongest = [
        {'stocks': (first, second), 'correlation': attrs['correlation']}
        for first, second, attrs in ranked_edges[:top_n]
    ]

    return {
        'num_nodes': G.number_of_nodes(),
        'num_edges': G.number_of_edges(),
        'avg_correlation': sum(corr_values) / len(corr_values),
        'max_correlation': max(corr_values),
        'min_correlation': min(corr_values),
        'top_correlations': strongest,
    }

In [None]:
# def calculate_series_stock_correlations(stock_data_dict: Dict[str, pd.DataFrame], 
#                                       reference_series: pd.Series,
#                                       n_days: int,
#                                       reference_symbol: str = None,
#                                       min_periods: int = 10,
#                                       return_pvalues: bool = False) -> pd.Series:
#     """
#     Calculate Pearson correlations between a reference series and each stock 
#     in the dictionary over the last n days.
    
#     Parameters:
#     -----------
#     stock_data_dict : dict
#         Dictionary where keys are stock symbols and values are DataFrames
#         with MultiIndex columns ('Close', stock_symbol) for price data
#     reference_series : pd.Series
#         Reference series to correlate against (e.g., market index, benchmark stock)
#         Should have datetime index and numeric values
#     n_days : int
#         Number of recent days to use for correlation calculation
#     reference_symbol : str, optional
#         Symbol name for the reference series (for identification)
#     min_periods : int, optional
#         Minimum number of overlapping periods required (default: 10)
#     return_pvalues : bool, optional
#         Whether to return p-values along with correlations (default: False)
        
#     Returns:
#     --------
#     pd.Series or tuple
#         If return_pvalues=False: Series with stock symbols as index and correlations as values
#         If return_pvalues=True: Tuple of (correlations_series, pvalues_series)
#     """
#     combined_df=_organise_price_data(stock_data_dict, n_days)
   
            
#             if len(stock_recent) < min_periods:
#                 print(f"Warning: {stock_symbol} has only {len(stock_recent)} days of data")
#                 correlations[stock_symbol] = np.nan
#                 if return_pvalues:
#                     pvalues[stock_symbol] = np.nan
#                 continue
            
#             # Align the series by their indices (dates)
#             aligned_data = pd.DataFrame({
#                 'reference': reference_recent,
#                 'stock': stock_recent
#             }).dropna()
            
#             if len(aligned_data) < min_periods:
#                 print(f"Warning: Only {len(aligned_data)} overlapping periods for {stock_symbol}")
#                 correlations[stock_symbol] = np.nan
#                 if return_pvalues:
#                     pvalues[stock_symbol] = np.nan
#                 continue
            
#             # Calculate Pearson correlation
#             if return_pvalues:
#                 from scipy.stats import pearsonr
#                 corr, p_val = pearsonr(aligned_data['reference'], aligned_data['stock'])
#                 correlations[stock_symbol] = corr
#                 pvalues[stock_symbol] = p_val
#             else:
#                 corr = aligned_data['reference'].corr(aligned_data['stock'])
#                 correlations[stock_symbol] = corr
                
#         except Exception as e:
#             print(f"Error processing {stock_symbol}: {e}")
#             correlations[stock_symbol] = np.nan
#             if return_pvalues:
#                 pvalues[stock_symbol] = np.nan
#             continue
    
#     # Create result series
#     corr_series = pd.Series(correlations, name=f'Correlation_with_{reference_symbol or "Reference"}')
#     corr_series.index.name = 'Stock_Symbol'
    
#     if return_pvalues:
#         pval_series = pd.Series(pvalues, name=f'P_Value_with_{reference_symbol or "Reference"}')
#         pval_series.index.name = 'Stock_Symbol'
#         return corr_series, pval_series
    
#     return corr_series
