In [7]:
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import os

In [8]:
# Read the bank-sector OHLCV CSV into the notebook-level frame `df`.
df = pd.read_csv(filepath_or_buffer='./Data/BankSector_ohlcv.csv')

In [9]:
# Perform statistical tests (e.g., ANOVA) to check if the differences are significant

def anova_test(df, digit_column, return_column='return'):
    """Run a one-way ANOVA of returns across the groups of a digit column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``digit_column`` and ``return_column``.
    digit_column : str
        Column whose distinct values define the groups being compared.
    return_column : str, default 'return'
        Column holding the values compared between groups.

    Returns
    -------
    tuple
        ``(f_value, p_value)`` as produced by ``scipy.stats.f_oneway``.
    """
    # With its default settings f_oneway yields NaN as soon as one sample
    # contains a NaN, so rows with a missing return are excluded up front.
    observed = df.dropna(subset=[return_column])
    groups = [values for _, values in observed.groupby(digit_column)[return_column]]
    f_value, p_value = stats.f_oneway(*groups)
    return f_value, p_value

In [10]:
def get_digit(number, position):
    """Return the ones or tens digit of the integer part of ``number``.

    Parameters
    ----------
    number : int or float
        Value to inspect. The fractional part is discarded and the sign is
        ignored, so ``-12.9`` has ones digit 2 and tens digit 1.
    position : {'ones', 'tens'}
        Which decimal digit to extract.

    Returns
    -------
    int or float
        The requested digit (0-9), or NaN when ``number`` is missing.

    Raises
    ------
    ValueError
        If ``position`` is neither ``'ones'`` nor ``'tens'``.
    """
    if position not in ('ones', 'tens'):
        raise ValueError(f"position must be 'ones' or 'tens', got {position!r}")

    # int() cannot convert NaN/None; propagate the missing value instead.
    if pd.isna(number):
        return float('nan')

    # abs() keeps Python's floor-modulo from returning the complement digit
    # for negative inputs (-12 % 10 == 8).
    whole = abs(int(number))
    if position == 'ones':
        return whole % 10
    return (whole // 10) % 10

In [11]:
def analyze_digit_effect(df, save_path='DigitEffect'):
    """Plot the average next-day return by price digit and save the figure.

    The next-day return of ``adjClose`` is computed per ticker, rows are
    grouped by the ones and tens digit of ``open`` and ``close``, and a 2x2
    grid of bar charts (rows: open/close, columns: ones/tens) is written to
    ``<save_path>/average_returns.png``. Each panel title reports a one-way
    ANOVA across the digit groups.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``ticker``, ``open``, ``close`` and ``adjClose`` columns.
        Rows are assumed to be in chronological order within each ticker
        (TODO confirm: no date column is referenced here). The frame passed
        in is not modified.
    save_path : str, default 'DigitEffect'
        Output directory; created when it does not exist.

    Returns
    -------
    None
    """
    os.makedirs(save_path, exist_ok=True)

    positions = ['ones', 'tens']
    price_types = ['open', 'close']
    return_period = 'next_day_return'

    # Work on a copy so the helper columns never leak into the caller's frame.
    data = df.copy()
    for price_type in price_types:
        for position in positions:
            data[f'{price_type}_digit_{position}'] = data[price_type].apply(
                get_digit, args=(position,))

    # Shift inside each ticker: a plain Series.shift(-1) moves the whole
    # column, so with interleaved tickers a row would receive the return of a
    # different ticker.
    next_close = data.groupby('ticker')['adjClose'].shift(-1)
    data[return_period] = next_close / data['adjClose'] - 1

    # The last row of every ticker has no next day; drop it.
    data = data.dropna(subset=[return_period])

    # The style has to be active before the figure exists, because axes read
    # their defaults from rcParams when they are created.
    plt.style.use('ggplot')
    fig, axes = plt.subplots(2, 2, figsize=(20, 16))

    # One colour per digit 0-9.
    colors = plt.cm.viridis(np.linspace(0, 1, 10))

    for row, price_type in enumerate(price_types):
        for col, position in enumerate(positions):
            column = f'{price_type}_digit_{position}'
            ax = axes[row, col]
            by_digit = data.groupby(column)[return_period]

            # Digits that never occur become NaN bars after the reindex.
            avg_returns = by_digit.mean().reindex(range(10))
            bars = ax.bar(range(10), avg_returns.values, color=colors)

            # Label each bar with its value; skip absent digits and put the
            # label below the bar end when the average return is negative.
            for bar in bars:
                height = bar.get_height()
                if np.isnan(height):
                    continue
                ax.text(bar.get_x() + bar.get_width()/2., height,
                        f'{height:.2%}',
                        ha='center', va='bottom' if height >= 0 else 'top',
                        rotation=0)

            # One-way ANOVA across digit groups; f_oneway needs >= 2 samples.
            groups = [values for _, values in by_digit]
            if len(groups) >= 2:
                f_value, p_value = stats.f_oneway(*groups)
            else:
                f_value = p_value = float('nan')

            ax.set_title(f'Average {return_period.replace("_", " ").title()}\nby {price_type.capitalize()} Price {position.capitalize()} Digit\n'
                         f'ANOVA: F={f_value:.4f}, p={p_value:.4f}',
                         fontsize=16, fontweight='bold')
            ax.set_xlabel('Digit', fontsize=10)
            ax.set_ylabel('Average Return', fontsize=10)
            ax.set_xticks(range(10))
            ax.set_xticklabels(range(10))
            ax.tick_params(axis='both', which='major', labelsize=8)
            ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: '{:.2%}'.format(y)))

            ax.grid(True, linestyle='--', alpha=0.7)

            ax.spines['top'].set_visible(False)
            ax.spines['right'].set_visible(False)

    fig.tight_layout()
    fig.savefig(os.path.join(save_path, 'average_returns.png'))
    # Close this specific figure rather than whichever one is "current".
    plt.close(fig)

# Usage
# Read from ./Data, the same location the other loads in this notebook use.
df = pd.read_csv('./Data/BankSector_ohlcv.csv')
analyze_digit_effect(df)

In [12]:
def analyze_digit_distributions(df, save_path='DigitEffect'):
    """Plot the frequency of each ones/tens digit in open and close prices.

    For every combination of price column (``open``, ``close``) and digit
    position (ones, tens) the share of rows showing each digit 0-9 is drawn
    as a bar chart, together with the share expected if all ten digits were
    equally likely. A chi-square goodness-of-fit test against that uniform
    distribution is reported in each panel title. The figure is written to
    ``<save_path>/digit_distributions.png``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``open`` and ``close`` columns. The frame passed in is
        not modified.
    save_path : str, default 'DigitEffect'
        Output directory; created when it does not exist.

    Returns
    -------
    None
    """
    os.makedirs(save_path, exist_ok=True)

    price_types = ['open', 'close']  # Only check open and close
    digit_positions = ['ones', 'tens']
    digits = range(10)
    n_rows = len(df)

    fig, axes = plt.subplots(len(price_types), 2, figsize=(20, 5*len(price_types)))
    fig.suptitle("Digit Distributions", fontsize=16)

    for i, price_type in enumerate(price_types):
        for j, position in enumerate(digit_positions):
            ax = axes[i, j]

            # Derive the digits into a separate Series instead of adding
            # columns to the caller's frame.
            digit_values = df[price_type].apply(get_digit, args=(position,))

            # Observed count per digit; digits that never occur count as 0.
            counts = digit_values.value_counts().reindex(digits, fill_value=0)
            total = counts.sum()

            # Shares are relative to all rows, as plotted on the y-axis.
            shares = counts / n_rows if n_rows else counts * 0.0

            ax.bar(digits, shares.values, alpha=0.7, label='Actual', color='skyblue')
            # Uniform reference at the same scale as the plotted shares.
            ax.axhline(shares.sum() / 10, color='red', linestyle='--',
                       label='Expected (uniform)')

            # The chi-square statistic scales with the sample size, so it has
            # to be computed from counts, not proportions. With no f_exp
            # given, chisquare tests against equally likely categories.
            if total > 0:
                chi2, p_value = stats.chisquare(counts.values)
            else:
                chi2 = p_value = float('nan')

            ax.set_title(f'{price_type.capitalize()} {position.capitalize()} Digit Distribution\n'
                         f'Chi-square: χ² = {chi2:.4f}, p = {p_value:.4f}')
            ax.set_xlabel('Digit')
            ax.set_ylabel('Frequency')
            ax.legend()
            ax.grid(True, linestyle='--', alpha=0.7)

            # Format y-axis as percentage
            ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: '{:.1%}'.format(y)))

            # Both the ones and the tens digit can be 0, so every panel
            # shows ticks for 0-9.
            ax.set_xticks(digits)
            ax.set_xticklabels(digits)

    fig.tight_layout()
    fig.savefig(os.path.join(save_path, 'digit_distributions.png'))
    # Close this specific figure rather than whichever one is "current".
    plt.close(fig)

# Usage: reload the price data, then write the digit-distribution figure.
df = pd.read_csv(filepath_or_buffer='./Data/BankSector_ohlcv.csv')
analyze_digit_distributions(df=df)