In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import plotly.graph_objects as go
import ipywidgets as widgets
from ipywidgets import interact
from IPython.display import clear_output
from statsmodels.tsa.seasonal import STL

# Read the IBM price CSV, parse the 'Date' column into timestamps, use it as
# the index, and discard the 'Name' column.
file_path = 'IBM_2006-01-01_to_2018-01-01.csv'
data = (
    pd.read_csv(file_path)
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date')
    .drop(columns='Name')
)


In [None]:
# Find every calendar day between the first and last index entry that has no
# row in `data`, and tally those gaps by weekday.
all_days = pd.date_range(data.index.min(), data.index.max(), freq='D')
missing_days = all_days.difference(data.index)

missing_days_df = pd.DataFrame(missing_days, columns=['MissingDate']).assign(
    DayOfWeek=lambda frame: frame['MissingDate'].dt.day_name()
)
pattern_analysis = missing_days_df['DayOfWeek'].value_counts()

print("Missing days with day of the week:")
print(missing_days_df)

print("\nPattern of missing days by day of the week:")
print(pattern_analysis)

In [None]:
# Show every row that has a missing value in at least one column
data.loc[data.isnull().any(axis='columns')]

In [None]:
# cleanup: put the series on a calendar-daily grid (days without a row become
# NaN), then fill every NaN by interpolating along the time index.
data_resampled = data.asfreq('D').interpolate(method='time')

# Visualization

In [None]:
# 1. Visualization: closing price of the original (non-resampled) series

fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(data['Close'], color='blue', label='Closing Price')
ax.set_title('IBM Closing Prices Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Closing Price')
ax.legend()
plt.show()

In [None]:
# helper to work on selected dates only
def filter_data_by_date(data, df_callback):
    """Show a date-range slider and run ``df_callback`` on the selected rows.

    The slider operates on integer row positions of ``data``; a label shows
    the matching index dates formatted as ``dd-mm-YYYY``. The callback runs
    once for the full range as soon as the widgets are displayed, and again
    after every slider change, always receiving a copy of the selected slice.

    Everything the callback displays is captured in an ``Output`` widget, so
    each redraw replaces only the previous result produced by this helper.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to slice. Its index entries must support ``strftime``.
    df_callback : callable
        Called as ``df_callback(filtered_df)``; its return value is ignored.

    Raises
    ------
    ValueError
        If ``data`` has no rows (there would be nothing to select).
    """
    date_range = data.index
    if len(date_range) == 0:
        raise ValueError("filter_data_by_date requires a DataFrame with at least one row")

    # Slider values are row positions, not dates.
    last_position = len(date_range) - 1
    range_slider = widgets.IntRangeSlider(
        value=[0, last_position],
        min=0,
        max=last_position,
        step=1,
        description='Date Range',
        continuous_update=False
    )
    date_display = widgets.Label()
    # Dedicated output area for whatever the callback prints or plots.
    output_area = widgets.Output()

    def describe_selection():
        # Human-readable form of the currently selected positions.
        start_idx, end_idx = range_slider.value
        return (
            f"Selected Range: {date_range[start_idx].strftime('%d-%m-%Y')} "
            f"to {date_range[end_idx].strftime('%d-%m-%Y')}"
        )

    def update_date_display(*args):
        # Refresh the label, then re-run the callback on the selected slice.
        date_display.value = describe_selection()
        start_idx, end_idx = range_slider.value
        filtered_df = data.iloc[start_idx:end_idx + 1]  # end position is inclusive
        with output_area:
            clear_output(wait=True)
            df_callback(filtered_df.copy())

    range_slider.observe(update_date_display, 'value')

    display(range_slider, date_display, output_area)
    # Render once for the initial (full) range so the cell shows a result
    # without requiring the user to touch the slider first.
    update_date_display()

In [None]:
# candlesticks and volume

def candlesticks(data):
    """Draw an OHLC candlestick chart with traded volume as background bars.

    Reads the 'Open', 'High', 'Low', 'Close' and 'Volume' columns of ``data``
    and uses its index for the x-axis. Volume is plotted on the primary
    y-axis; prices are plotted on a secondary y-axis on the right that
    overlays the primary one.
    """
    volume_bars = go.Bar(
        x=data.index,
        y=data['Volume'],
        name='Volume',
        marker=dict(color='rgba(128, 128, 128, 0.5)'),
    )
    price_candles = go.Candlestick(
        x=data.index,
        open=data['Open'],
        high=data['High'],
        low=data['Low'],
        close=data['Close'],
        name='Candlestick',
        yaxis="y2",
    )

    # Bars are added first, candlesticks second (same trace order as before).
    fig = go.Figure(data=[volume_bars, price_candles])

    # Secondary axis for prices, drawn over the volume axis on the right side.
    price_axis = dict(
        title="Price",
        overlaying="y",
        side="right",
    )
    fig.update_layout(
        title="IBM Stock Candlestick Chart with Volume",
        xaxis_title="Date",
        yaxis_title="Volume",
        yaxis2=price_axis,
        xaxis_rangeslider_visible=False,  # hide plotly's built-in range slider
    )

    fig.show()

# Hook the candlestick chart up to the date-range slider, using the original
# (non-resampled) frame; the chart is redrawn for the selected slice whenever
# the slider value changes.
filter_data_by_date(data, candlesticks)

# Observations from data

- No obvious patterns are visible in the raw price series (who would have known).
- Huge drop on 20 Oct 2014: https://www.wired.com/2014/10/ibm-globalfoundries/
- Abnormal price moves are often associated with high trading volume.

In [None]:
# try finding best period minimizing mse
def find_optimal_period(data, min_period=7, max_period=365, step=7):
    """Scan STL periods and report the one with the smallest residual MSE.

    For every candidate period an STL decomposition of ``data['Close']`` is
    fitted and the mean squared residual (NaNs ignored) is recorded. The MSE
    curve is plotted and the best period is printed, so the result is visible
    even when the return value is discarded.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'Close' column.
    min_period, max_period : int
        Inclusive bounds of the periods to try.
    step : int
        Spacing between candidate periods (default 7, as before).

    Returns
    -------
    tuple
        ``(best_period, mse_values)`` where ``mse_values`` is a list of
        ``(period, mse)`` pairs. If no candidate period can be evaluated on
        ``data``, a message is printed and ``(None, [])`` is returned.
    """
    n_obs = len(data)

    # STL needs period >= 2, and a seasonal pattern cannot be estimated from
    # fewer than two complete cycles, so skip periods the slice cannot support.
    candidate_periods = [
        period
        for period in range(min_period, max_period + 1, step)
        if period >= 2 and 2 * period <= n_obs
    ]
    if not candidate_periods:
        print(
            f'Not enough data ({n_obs} rows) to evaluate any period '
            f'between {min_period} and {max_period}.'
        )
        return None, []

    close = data['Close']
    mse_values = []
    for period in candidate_periods:
        residuals = STL(close, period=period, robust=False).fit().resid
        mse_values.append((period, np.nanmean(residuals ** 2)))  # NaNs ignored

    # Pick the period with the minimum MSE.
    best_period, best_mse = min(mse_values, key=lambda item: item[1])
    print(f'Best period: {best_period} (MSE: {best_mse:.2f})')

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=[period for period, _ in mse_values],
        y=[mse for _, mse in mse_values],
        mode='lines+markers',
        name='MSE',
    ))
    fig.update_layout(
        title="Mean Squared Error of Residuals vs. Period",
        xaxis_title="Period",
        yaxis_title="Mean Squared Error (MSE)",
        xaxis=dict(showgrid=True, gridcolor="lightgray", gridwidth=0.5),  # vertical grid lines
    )
    fig.show()
    return best_period, mse_values

# Run the period search on the daily-resampled, interpolated frame for the
# date range chosen with the slider. The helper ignores the callback's return
# value, so only what the callback itself displays is shown.
filter_data_by_date(data_resampled, find_optimal_period)


In [None]:
def plot_decomposition(data, period):
    """Fit an STL decomposition of ``data['Close']`` and plot its components.

    ``period`` is passed straight to ``STL`` (non-robust fit). The mean squared
    residual, ignoring NaNs, is printed with two decimals before the figure is
    shown.
    """
    fitted = STL(data['Close'], period=period, robust=False).fit()

    # Let the result object draw its component panels, then widen the figure.
    figure = fitted.plot()
    figure.set_size_inches(16, 6)

    residual_mse = np.nanmean(fitted.resid ** 2)
    print(f'MSE: {residual_mse:.2f}')
    plt.show()

def plot_decomposition_interactive(data):
    """Show the STL decomposition of ``data`` with a slider for the period.

    The slider covers periods 2 through 365 in steps of 1 and starts at 25;
    each value is forwarded to ``plot_decomposition``.
    """
    period_slider = widgets.IntSlider(min=2, max=365, step=1, value=25)

    def render(period):
        # Re-plot the decomposition for the period picked on the slider.
        plot_decomposition(data, period)

    interact(render, period=period_slider)

# Combine both controls: the date-range slider picks the slice of the
# daily-resampled frame, and the callback adds a period slider for that slice.
filter_data_by_date(data_resampled, plot_decomposition_interactive)


In [None]:
def acf_analysis(data, lags=100):
    """Plot the ACF and PACF of ``data['Close']`` side by side.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'Close' column.
    lags : int
        Maximum number of lags to plot (default 100, as before). The value is
        reduced automatically when the series is too short to support it.

    Returns
    -------
    None
        If the series is too short for even a single lag, a message is
        printed and nothing is plotted.
    """
    close = data['Close']

    # statsmodels only computes partial autocorrelations for lags up to about
    # half the sample size and raises otherwise; stay strictly below that
    # limit so short date selections do not crash the callback.
    usable_lags = min(lags, len(close) // 2 - 1)
    if usable_lags < 1:
        print(f'Not enough data ({len(close)} rows) for autocorrelation analysis.')
        return

    # 3. Autocorrelation Analysis
    fig, (acf_ax, pacf_ax) = plt.subplots(1, 2, figsize=(14, 6))
    plot_acf(close, lags=usable_lags, ax=acf_ax)
    acf_ax.set_title('Autocorrelation Function (ACF)')
    plot_pacf(close, lags=usable_lags, ax=pacf_ax, method='ywm')
    pacf_ax.set_title('Partial Autocorrelation Function (PACF)')
    fig.tight_layout()
    plt.show()

# Show ACF/PACF plots for the slider-selected date range of the
# daily-resampled, interpolated frame.
filter_data_by_date(data_resampled, acf_analysis)
