## Analytics

This file implements statistical analysis and trading indicators for OHLC (open, high, low, close) price data.

In [1]:
# Imports

import matplotlib.pyplot as plt
import scipy.stats as stats
import mplfinance as mpf
import pandas as pd
import numpy as np

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


## Trading Indicators

Trading indicators are statistical calculations applied to price and volume data of financial instruments to help traders identify trends, momentum, and potential buy or sell signals. They are widely used in technical analysis to support decision-making in trading. 

Common indicators include:

- **Moving Averages (SMA/EMA):**  
  Smooth out price data to highlight trends over a specific period. SMA gives equal weight to all prices, while EMA gives more weight to recent prices.


- **Bollinger Bands:**  
  Measure price volatility by plotting a moving average with upper and lower bands based on standard deviations.


- **Relative Strength Index (RSI):**  
  A momentum oscillator that measures the speed and change of price movements. RSI values range from 0 to 100 and help identify overbought or oversold conditions, signaling potential reversals.


- **MACD (Moving Average Convergence Divergence):**  
  Shows the relationship between two moving averages of a security’s price. It is used to detect trend changes, momentum, and potential buy or sell signals through MACD line, signal line, and histogram analysis.


**Example of output** (computing 3-period SMA on a `close` column):  

```text
Input DataFrame: 
timestamp            close     
'2025-08-19 10:00'   100.0
'2025-08-19 10:01'   102.0
'2025-08-19 10:02'   104.0
'2025-08-19 10:03'   103.0
'2025-08-19 10:04'   101.0

Simple moving average (window=3): (NaN, NaN, 102.00, 103.00, 102.67)
```

These indicators are not predictive on their own but provide insights into market behavior and can be combined with other analysis tools.

In [2]:
class Analytics:
    """
    Stateless collection of price-based trading indicators.

    Every public method reads a single price column of a DataFrame and returns
    pandas Series aligned on that DataFrame's index.
    """

    @staticmethod
    def _require_column(df: pd.DataFrame, column: str) -> None:
        """
        Raise ValueError when 'column' is not one of the columns of 'df'.

        Shared by all indicators so that a missing column is always reported
        the same way, including the list of columns that are available.
        """
        if column not in df.columns:
            raise ValueError(f"The column {column} does not appear in {list(df.columns)}")

    @staticmethod
    def moving_average(df: pd.DataFrame, column: str, window: int) -> pd.Series:
        """
        Computing simple moving average (SMA) over the 'column' price.

        Parameters:
            df: DataFrame holding the price data.
            column: name of the price column to average.
            window: number of consecutive rows in each average.

        Returns:
            Series of rolling means. The first 'window - 1' entries are NaN
            because a full window is not yet available.

        Raises:
            ValueError: if 'column' is not a column of 'df'.
        """
        Analytics._require_column(df, column)

        # Using the pandas built-in moving average function
        return df[column].rolling(window=window).mean()

    @staticmethod
    def exponential_ma(df: pd.DataFrame, column: str, window: int) -> pd.Series:
        """
        Computing exponential moving average (EMA) over the 'column' price.

        Parameters:
            df: DataFrame holding the price data.
            column: name of the price column to average.
            window: span of the EMA, passed to pandas as 'span'.

        Returns:
            Series of exponentially weighted means, one per input row.

        Raises:
            ValueError: if 'column' is not a column of 'df'.
        """
        Analytics._require_column(df, column)

        # adjust=False selects the recursive form of the EMA, so the first
        # output equals the first price instead of being NaN
        return df[column].ewm(span=window, adjust=False).mean()

    @staticmethod
    def bollinger_bands(
        df: pd.DataFrame, column: str, window: int, factor: float = 2
    ) -> tuple[pd.Series, pd.Series, pd.Series]:
        """
        Computing Bollinger Bands (middle, upper, lower) over the 'column' price.

        Parameters:
            df: DataFrame holding the price data.
            column: name of the price column to use.
            window: number of consecutive rows in each rolling window.
            factor: how many rolling standard deviations separate each band
                from the middle band.

        Returns:
            Tuple (middle, upper, lower) where 'middle' is the rolling mean and
            the outer bands are 'middle' +/- factor * rolling standard
            deviation (pandas' default rolling 'std').

        Raises:
            ValueError: if 'column' is not a column of 'df'.
        """
        Analytics._require_column(df, column)

        # Both statistics are taken over the same rolling window
        rolling = df[column].rolling(window=window)
        ma = rolling.mean()
        band_width = rolling.std() * factor

        # Moving average, upper band, lower band
        return ma, ma + band_width, ma - band_width

## Correlation Matrix

A correlation matrix shows the pairwise relationships between multiple financial instruments, typically using the **Pearson correlation coefficient**. Values range from -1 to 1, where 1 indicates perfect positive correlation, -1 indicates perfect negative correlation, and 0 indicates no linear relationship.  

Correlation matrices are useful to identify which assets move together, diversify a portfolio, or detect clusters of highly correlated symbols. Visualizing them with **heatmaps** makes it easy to spot strong correlations and patterns across many instruments at a glance.

In [3]:
def correlation_matrix(data: dict, column: str) -> pd.DataFrame:
    """
    Computing the correlations of products based on a certain column

    Parameters:
        data: mapping of symbol -> DataFrame; each DataFrame must contain
            'column'.
        column: name of the column whose period-over-period returns are
            correlated.

    Returns:
        Square DataFrame of pairwise correlations between the returns of the
        retained symbols (rows and columns are labelled by symbol). Symbols
        whose 'column' contains any NaN are left out. An empty DataFrame is
        returned when no symbol is retained.

    Raises:
        KeyError: if a DataFrame does not contain 'column'.
    """
    # Collect the requested column of every symbol that has no missing value
    values = {}

    for symbol, df in data.items():
        series = df[column]

        if series.isna().any():
            continue

        values[symbol] = series

    # pd.concat rejects an empty mapping with an unhelpful error, so report
    # "nothing to correlate" as an empty matrix instead
    if not values:
        return pd.DataFrame()

    # One column per symbol; rows that are not present for every symbol
    # (index mismatch) become NaN during alignment and are dropped
    prices = pd.concat(values, axis=1).dropna()

    # Compute returns (the first row has no predecessor and is dropped)
    returns = prices.pct_change().dropna()

    # Correlation matrix
    return returns.corr()

## Basic Statistics & Visualization

Basic summary statistics (mean, standard deviation, minimum and maximum of the close, mean volume, and the mean and standard deviation of returns) give a quick overview of each financial time series.

In [4]:
def compute_stats(data: dict[str, pd.DataFrame]) -> pd.DataFrame:
    """
    Computes basic statistics on the passed dict containing OHLC data

    Parameters:
        data: mapping of symbol -> DataFrame with at least the columns
            "CLOSE" and "VOLUME".

    Returns:
        DataFrame with one column per symbol and one row per statistic:
        CLOSE_MEAN, CLOSE_STD, CLOSE_MIN, CLOSE_MAX, VOLUME_MEAN, RETURN_MEAN
        and RETURN_STD.

    Raises:
        KeyError: if a DataFrame lacks the "CLOSE" or "VOLUME" column.
    """
    # Named 'summary' rather than 'stats' so the module-level scipy.stats
    # import is not shadowed inside this function
    summary = {}

    for symbol, df in data.items():
        close = df["CLOSE"]

        # Row-over-row returns (daily returns when each row is one day)
        returns = close.pct_change()

        summary[symbol] = {
            "CLOSE_MEAN": close.mean(),
            "CLOSE_STD": close.std(),
            "CLOSE_MIN": close.min(),
            "CLOSE_MAX": close.max(),
            "VOLUME_MEAN": df["VOLUME"].mean(),
            "RETURN_MEAN": returns.mean(),
            "RETURN_STD": returns.std(),
        }

    # Outer keys (symbols) become columns, inner keys (statistics) become rows
    return pd.DataFrame(summary)

In [5]:
def plot_return_stats(stats_df: pd.DataFrame):
    """
    Draw two stacked bar charts, one for the RETURN_MEAN row and one for the
    RETURN_STD row of 'stats_df', each with its bars sorted from the largest
    to the smallest value, then display the figure.
    """
    # (row label in stats_df, chart title, bar colour) for each panel
    panels = (
        ("RETURN_MEAN", "Average mean return", 'purple'),
        ("RETURN_STD", "Return standard deviation", 'plum'),
    )

    _, axes = plt.subplots(len(panels), 1, figsize=(10, 6))

    for axis, (row_label, title, bar_color) in zip(axes, panels):
        ordered = stats_df.loc[row_label].sort_values(ascending=False)
        ordered.plot(kind="bar", ax=axis, color=bar_color)

        axis.set_title(title)
        axis.set_ylabel("Value")
        axis.grid(True, linestyle="--", alpha=0.6)
        axis.set_xticklabels(ordered.index, rotation=60, ha="center")

    plt.tight_layout()
    plt.show()

## Distribution Analysis

Distribution analysis examines the statistical properties of daily returns for each symbol. It helps identify asymmetry, tail behavior, and deviations from a normal distribution.  

By visualizing the distributions with histograms and comparing them to a theoretical normal distribution using QQ-plots, we can quickly assess whether returns are skewed, heavy-tailed, or approximately Gaussian. Skewness and kurtosis provide numerical summaries of these characteristics.

In [6]:
def distribution_analysis(df: pd.DataFrame):
    """
    Inspect how the row-over-row returns of the "CLOSE" column are distributed.

    Draws a density histogram of the returns overlaid with a normal PDF that
    has the same mean and standard deviation, next to two QQ-plots against the
    normal distribution: one for the returns and one for a random normal
    sample of 500 points used as a visual reference. The figure is shown
    before returning.

    Returns the tuple (skewness, kurtosis, Shapiro-Wilk p-value).
    """
    def draw_qq(axis, sample, marker_color, marker_label, title):
        # Normal probability plot of 'sample' together with its fitted line
        (theoretical_q, ordered_values), (slope, intercept, _) = stats.probplot(sample, dist="norm")
        axis.plot(theoretical_q, ordered_values, 'o', color=marker_color, label=marker_label)
        axis.plot(theoretical_q, slope*theoretical_q + intercept, 'r--', linewidth=2, label='Fit')
        axis.set_title(title)
        axis.legend()

    returns = df["CLOSE"].pct_change().dropna()

    # Shape of the distribution
    skewness = returns.skew()
    kurtosis = returns.kurtosis()

    # Shapiro-Wilk normality test; only the p-value is reported
    _, p_value = stats.shapiro(returns)

    # Parameters of the reference normal distribution
    mu = returns.mean()
    sigma = returns.std()

    # 2x2 grid: the histogram fills the left column, the QQ-plots the right one
    fig = plt.figure(figsize=(14, 6))
    grid = fig.add_gridspec(2, 2, width_ratios=[1, 1], height_ratios=[1, 1])

    # Left column: histogram with the normal PDF drawn across the bin range
    ax_hist = fig.add_subplot(grid[:, 0])
    _, bin_edges, _ = ax_hist.hist(
        returns, bins=50, density=True, color="lightblue", edgecolor="black", alpha=0.7
    )
    pdf_x = np.linspace(bin_edges[0], bin_edges[-1], 100)
    ax_hist.plot(pdf_x, stats.norm.pdf(pdf_x, mu, sigma), 'r--', linewidth=2, label='Theoretical normal PDF')
    ax_hist.set_title("Daily Returns Histogram")
    ax_hist.set_xlabel("Daily Return")
    ax_hist.set_ylabel("Density")
    ax_hist.legend()

    # Top-right: QQ-plot of the observed returns
    draw_qq(fig.add_subplot(grid[0, 1]), returns, 'blue', 'Returns', "QQ-Plot: Data")

    # Bottom-right: QQ-plot of a random normal sample with the same mu/sigma
    ax_reference = fig.add_subplot(grid[1, 1])
    normal_sample = np.random.normal(mu, sigma, 500)
    draw_qq(ax_reference, normal_sample, 'green', 'Theoretical Normal Sample', "Normal distribution QQ-plot")

    plt.tight_layout()
    plt.show()

    return skewness, kurtosis, p_value

## Candlestick chart

In [7]:
def candlesticks(symbol_df: pd.DataFrame):
    """
    Plotting candlestick chart with a volume panel.

    Parameters:
        symbol_df: DataFrame with the columns 'Open', 'High', 'Low', 'Close',
            'Volume' and 'Timestamp'. It is not modified.

    Raises:
        ValueError: if one of the required columns is missing.
    """
    # Explicit raise rather than assert: asserts are removed when Python runs
    # with -O, which would let invalid input reach the plotting call
    for column_name in ('Open', 'High', 'Low', 'Close', 'Volume', 'Timestamp'):
        if column_name not in symbol_df.columns:
            raise ValueError(f"Dataframe must contain a column {column_name}")

    # The plot is indexed by time. set_index returns a new DataFrame, so the
    # caller's index is left untouched.
    time_index = pd.DatetimeIndex(pd.to_datetime(symbol_df['Timestamp']))
    plot_df = symbol_df.set_index(time_index)

    # Plot candlestick chart
    mpf.plot(plot_df,
             type='candle',
             volume=True,
             style='binance',
             title='CANDLESTICK CHART',
             ylabel='PRICE',
             ylabel_lower='VOLUME')