<a href="https://colab.research.google.com/github/ThalyaGIT/UK-Music-Index-Returns/blob/main/3_data-analysis_notebooks/UK_Music_Happiness_and_Index_Returns.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import packages
import pandas as pd
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from tabulate import tabulate
from scipy import stats


In [None]:
# Locations of the data_<N>_days.csv files in the project's 0-data-gold folder
url_1_day = 'https://raw.githubusercontent.com/ThalyaGIT/UK-Music-Index-Returns/main/0-data-gold/data_1_days.csv'
url_3_day = 'https://raw.githubusercontent.com/ThalyaGIT/UK-Music-Index-Returns/main/0-data-gold/data_3_days.csv'
url_5_day = 'https://raw.githubusercontent.com/ThalyaGIT/UK-Music-Index-Returns/main/0-data-gold/data_5_days.csv'
url_10_day = 'https://raw.githubusercontent.com/ThalyaGIT/UK-Music-Index-Returns/main/0-data-gold/data_10_days.csv'
url_20_day = 'https://raw.githubusercontent.com/ThalyaGIT/UK-Music-Index-Returns/main/0-data-gold/data_20_days.csv'

# Read each CSV into its own DataFrame; the targets on the left are matched
# positionally with the URLs on the right, so keep both in the same order.
df_1_day, df_3_day, df_5_day, df_10_day, df_20_day = map(
    pd.read_csv,
    (url_1_day, url_3_day, url_5_day, url_10_day, url_20_day),
)

import pandas as pd

# df_1_day, df_3_day, df_5_day, df_10_day and df_20_day were loaded above
# with pd.read_csv, one DataFrame per data_<N>_days.csv file.

# Dictionary mapping a display label to each loaded DataFrame; the label is
# what gets printed as the heading when the DataFrames are summarised below.
dataframes = {
    "1 Day": df_1_day,
    "3 Day": df_3_day,
    "5 Day": df_5_day,
    "10 Day": df_10_day,
    "20 Day": df_20_day
}

def filter_middle_90_percent(df, name):
    """Print and return the rows in the middle 90% of 'Change in SWAV'.

    The 5th and 95th percentiles of the 'Change in SWAV' column are used as
    inclusive bounds. The bounds and the number of rows inside them are
    printed under the heading ``name``.

    Args:
        df: DataFrame containing a 'Change in SWAV' column.
        name: Label printed as the heading for this DataFrame.

    Returns:
        A new DataFrame holding only the rows of ``df`` whose
        'Change in SWAV' lies within the two bounds. ``df`` itself is left
        unchanged. Callers that only want the printed report can ignore
        the return value.
    """
    swav = df['Change in SWAV']
    lower_bound = swav.quantile(0.05)  # 5th percentile
    upper_bound = swav.quantile(0.95)  # 95th percentile

    # Keep only the middle 90% of data (both bounds inclusive)
    middle_90_df = df[(swav >= lower_bound) & (swav <= upper_bound)]

    print(f"{name}:")
    print(f"  Lower Bound (5th percentile) of Change in SWAV: {lower_bound}")
    print(f"  Upper Bound (95th percentile) of Change in SWAV: {upper_bound}")
    print(f"  Number of rows in the middle 90%: {len(middle_90_df)}")
    print("")

    return middle_90_df

# Print the 5th/95th percentile bounds of 'Change in SWAV' and the size of the
# middle 90% for every DataFrame. This is a report only: the DataFrames held
# in `dataframes` are not filtered or modified by this loop.
for name, df in dataframes.items():
    filter_middle_90_percent(df, name)

# **Main**

In [None]:
# @title
## Main Script

# For every holding period (days) and every index, regress the index's
# percentage change on 'Change in SWAV', a set of controls and month dummies,
# then record the coefficient and p-value of 'Change in SWAV'.

# One entry per holding period: [days, (coef, p_value), (coef, p_value), ...]
results = []

indices = ['FTSE100', 'MSCIUK', 'FTSEAllShare', 'FTSE250', 'FTSESmallCap', 'FTSEAIM']
days_list = [1, 3, 5, 10, 20]

for days in days_list:
    df = globals()[f'df_{days}_day']  # Dynamically access each DataFrame

    # The date handling and month dummies do not depend on the index, so they
    # are prepared once per DataFrame rather than once per regression.
    df['Date'] = pd.to_datetime(df['Date'])
    df['Month'] = df['Date'].dt.month

    # drop_first=True omits the first month level, which becomes the baseline;
    # astype(int) turns the boolean dummies into 0/1.
    month_dummies = pd.get_dummies(df['Month'], prefix='Month', drop_first=True).astype(int)

    result_row = [days]  # Start the row with the number of days
    for index in indices:
        # Dependent variable, with non-numeric entries coerced to NaN
        y = pd.to_numeric(df[f'% {index} Change'], errors='coerce')

        # Independent variables plus month dummies, coerced the same way
        X = df[['Change in SWAV',
                'ADS_Change',
                'EPU_Change',
                f'Previous % {index} Change',
                '% MSCI Change',
                'Vix Close',
                'Rolling_Avg_Change_in_DCC']]
        X = pd.concat([X, month_dummies], axis=1)
        X = X.apply(pd.to_numeric, errors='coerce')

        # Keep only rows that are complete in BOTH X and y. statsmodels' OLS
        # does not drop missing values by default, so a NaN left in y would
        # turn every estimated coefficient into NaN.
        complete_rows = X.notna().all(axis=1) & y.notna()
        X = X.loc[complete_rows]
        y = y.loc[complete_rows]

        if not X.empty:  # Proceed only if there's valid data
            X = sm.add_constant(X)
            model = sm.OLS(y, X).fit()

            # Coefficient and p-value for 'Change in SWAV'
            coef = round(model.params['Change in SWAV'], 2)
            p_value = round(model.pvalues['Change in SWAV'], 5)
            result_row.append((coef, p_value))
        else:
            result_row.append((None, None))  # No valid data for this regression

    results.append(result_row)

# "Days" first, then one coefficient column per index
columns = ['Days'] + [f'{index} Coef' for index in indices]

# The displayed table holds only the coefficients; the p-values stay in
# `results`, where the styling step can look them up.
results_df = pd.DataFrame(
    [[row[0]] + [coef_value for coef_value, _ in row[1:]] for row in results],
    columns=columns,
)

def color_rows(row, original_results):
    """Return one CSS string per cell for a row of the coefficient table.

    Args:
        row: A row of the results table; ``row.name`` is used as the position
            of the matching entry in ``original_results``.
        original_results: List of rows shaped like
            ``[days, (coef, p_value), (coef, p_value), ...]``.

    Returns:
        A list as long as ``row``. The first entry (the Days column) is always
        ''. A coefficient cell is green when its p-value is below 0.1 and the
        coefficient is positive, red when it is below 0.1 and negative, and
        '' in every other case (not significant, zero, None, or not a tuple).
    """
    styles = ['']  # The Days column is never coloured
    for position in range(1, len(row)):
        entry = original_results[row.name][position]
        css = ''
        if isinstance(entry, tuple):
            coef, p_value = entry
            # Only significant results (p-value < 0.1) get a colour
            if coef is not None and p_value < 0.1:
                if coef > 0:
                    css = 'background-color: green'
                elif coef < 0:
                    css = 'background-color: red'
        styles.append(css)
    return styles

# Style the table one row at a time (axis=1). `original_results=results` is
# forwarded to color_rows, which reads each cell's p-value from it because
# results_df itself only holds the coefficients.
styled_df = results_df.style.apply(color_rows, axis=1, original_results=results)

# Leave the Styler as the cell's last expression so the notebook renders it
# (presumably; run as a plain script this line would have no visible effect).
styled_df

# **TOP 5 HOLDINGS**

In [None]:
# @title
## Top 5 Holdings

# Same regression as the main analysis, but the dependent variable is the
# percentage change of an individual holding instead of an index. For every
# holding period (days) and every holding, record the coefficient and p-value
# of 'Change in SWAV'.

# One entry per holding period: [days, (coef, p_value), (coef, p_value), ...]
results = []

indices = ['Barc', 'Voda', 'Glen', 'LLoyds', 'BP']
days_list = [1, 3, 5, 10, 20]

for days in days_list:
    df = globals()[f'df_{days}_day']  # Dynamically access each DataFrame

    # The date handling and month dummies do not depend on the holding, so
    # they are prepared once per DataFrame rather than once per regression.
    df['Date'] = pd.to_datetime(df['Date'])
    df['Month'] = df['Date'].dt.month

    # drop_first=True omits the first month level, which becomes the baseline;
    # astype(int) turns the boolean dummies into 0/1.
    month_dummies = pd.get_dummies(df['Month'], prefix='Month', drop_first=True).astype(int)

    result_row = [days]  # Start the row with the number of days
    for index in indices:
        # Dependent variable, with non-numeric entries coerced to NaN
        y = pd.to_numeric(df[f'% {index} Change'], errors='coerce')

        # Independent variables plus month dummies, coerced the same way
        X = df[['Change in SWAV',
                'ADS_Change',
                'EPU_Change',
                f'Previous % {index} Change',
                '% MSCI Change',
                'Vix Close',
                'Rolling_Avg_Change_in_DCC']]
        X = pd.concat([X, month_dummies], axis=1)
        X = X.apply(pd.to_numeric, errors='coerce')

        # Keep only rows that are complete in BOTH X and y. statsmodels' OLS
        # does not drop missing values by default, so a NaN left in y would
        # turn every estimated coefficient into NaN.
        complete_rows = X.notna().all(axis=1) & y.notna()
        X = X.loc[complete_rows]
        y = y.loc[complete_rows]

        if not X.empty:  # Proceed only if there's valid data
            X = sm.add_constant(X)
            model = sm.OLS(y, X).fit()

            # Coefficient and p-value for 'Change in SWAV'
            coef = round(model.params['Change in SWAV'], 2)
            p_value = round(model.pvalues['Change in SWAV'], 5)
            result_row.append((coef, p_value))
        else:
            result_row.append((None, None))  # No valid data for this regression

    results.append(result_row)

# "Days" first, then one coefficient column per holding
columns = ['Days'] + [f'{index} Coef' for index in indices]

# The displayed table holds only the coefficients; the p-values stay in
# `results`, where the styling step can look them up.
results_df = pd.DataFrame(
    [[row[0]] + [coef_value for coef_value, _ in row[1:]] for row in results],
    columns=columns,
)

def color_rows(row, original_results):
    """Return one CSS string per cell for a row of the coefficient table.

    Args:
        row: A row of the results table; ``row.name`` is used as the position
            of the matching entry in ``original_results``.
        original_results: List of rows shaped like
            ``[days, (coef, p_value), (coef, p_value), ...]``.

    Returns:
        A list as long as ``row``. The first entry (the Days column) is always
        ''. A coefficient cell is green when its p-value is below 0.1 and the
        coefficient is positive, red when it is below 0.1 and negative, and
        '' in every other case (not significant, zero, None, or not a tuple).
    """
    styles = ['']  # The Days column is never coloured
    for position in range(1, len(row)):
        entry = original_results[row.name][position]
        css = ''
        if isinstance(entry, tuple):
            coef, p_value = entry
            # Only significant results (p-value < 0.1) get a colour
            if coef is not None and p_value < 0.1:
                if coef > 0:
                    css = 'background-color: green'
                elif coef < 0:
                    css = 'background-color: red'
        styles.append(css)
    return styles

# Style the table one row at a time (axis=1). `original_results=results` is
# forwarded to color_rows, which reads each cell's p-value from it because
# results_df itself only holds the coefficients.
styled_df = results_df.style.apply(color_rows, axis=1, original_results=results)

# Leave the Styler as the cell's last expression so the notebook renders it
# (presumably; run as a plain script this line would have no visible effect).
styled_df

# **No Extreme FTSE Change Values**

In [None]:
# Main regression repeated after removing extreme observations of the
# dependent variable: rows whose index change has |z-score| >= 3 are excluded
# before fitting. For every holding period (days) and every index, record the
# coefficient and p-value of 'Change in SWAV'.

# One entry per holding period: [days, (coef, p_value), (coef, p_value), ...]
results = []

indices = ['FTSE100', 'MSCIUK', 'FTSEAllShare', 'FTSE250', 'FTSESmallCap', 'FTSEAIM']
days_list = [1, 3, 5, 10, 20]

for days in days_list:
    df = globals()[f'df_{days}_day']  # Dynamically access each DataFrame

    # The date handling and month dummies do not depend on the index, so they
    # are prepared once per DataFrame rather than once per regression.
    df['Date'] = pd.to_datetime(df['Date'])
    df['Month'] = df['Date'].dt.month

    # drop_first=True omits the first month level, which becomes the baseline;
    # astype(int) turns the boolean dummies into 0/1.
    month_dummies = pd.get_dummies(df['Month'], prefix='Month', drop_first=True).astype(int)

    result_row = [days]  # Start the row with the number of days
    for index in indices:
        # Coerce the dependent variable to numeric BEFORE computing z-scores
        # so that non-numeric entries become NaN instead of breaking zscore.
        y = pd.to_numeric(df[f'% {index} Change'], errors='coerce')

        # nan_policy='omit' computes mean/std from the non-missing values.
        # With the default ('propagate') a single NaN makes every z-score NaN,
        # which would empty the mask and silently skip the whole regression.
        z_scores = stats.zscore(y.to_numpy(dtype=float), nan_policy='omit')

        # Keep rows with |Z| < 3. A NaN z-score compares False, so rows with a
        # missing dependent variable are excluded here as well.
        no_extremes_mask = pd.Series(np.abs(z_scores) < 3, index=y.index)
        y = y.loc[no_extremes_mask]
        X = df.loc[no_extremes_mask, ['Change in SWAV',
                                      'ADS_Change',
                                      'EPU_Change',
                                      f'Previous % {index} Change',
                                      '% MSCI Change',
                                      'Vix Close',
                                      'Rolling_Avg_Change_in_DCC']]

        # Add the month dummies for the same rows and coerce to numeric
        X = pd.concat([X, month_dummies.loc[no_extremes_mask]], axis=1)
        X = X.apply(pd.to_numeric, errors='coerce')

        # Drop rows with a missing regressor and realign y (y itself has no
        # NaN left after the mask above).
        X = X.dropna()
        y = y.loc[X.index]

        if not X.empty:  # Proceed only if there's valid data
            X = sm.add_constant(X)
            model = sm.OLS(y, X).fit()

            # Coefficient and p-value for 'Change in SWAV'
            coef = round(model.params['Change in SWAV'], 2)
            p_value = round(model.pvalues['Change in SWAV'], 5)
            result_row.append((coef, p_value))
        else:
            result_row.append((None, None))  # No valid data for this regression

    results.append(result_row)

# "Days" first, then one coefficient column per index
columns = ['Days'] + [f'{index} Coef' for index in indices]

# The displayed table holds only the coefficients; the p-values stay in
# `results`, where the styling step can look them up.
results_df = pd.DataFrame(
    [[row[0]] + [coef_value for coef_value, _ in row[1:]] for row in results],
    columns=columns,
)

def color_rows(row, original_results):
    """Return one CSS string per cell for a row of the coefficient table.

    Args:
        row: A row of the results table; ``row.name`` is used as the position
            of the matching entry in ``original_results``.
        original_results: List of rows shaped like
            ``[days, (coef, p_value), (coef, p_value), ...]``.

    Returns:
        A list as long as ``row``. The first entry (the Days column) is always
        ''. A coefficient cell is green when its p-value is below 0.1 and the
        coefficient is positive, red when it is below 0.1 and negative, and
        '' in every other case (not significant, zero, None, or not a tuple).
    """
    styles = ['']  # The Days column is never coloured
    for position in range(1, len(row)):
        entry = original_results[row.name][position]
        css = ''
        if isinstance(entry, tuple):
            coef, p_value = entry
            # Only significant results (p-value < 0.1) get a colour
            if coef is not None and p_value < 0.1:
                if coef > 0:
                    css = 'background-color: green'
                elif coef < 0:
                    css = 'background-color: red'
        styles.append(css)
    return styles

# Style the table one row at a time (axis=1). `original_results=results` is
# forwarded to color_rows, which reads each cell's p-value from it because
# results_df itself only holds the coefficients.
styled_df = results_df.style.apply(color_rows, axis=1, original_results=results)

# Leave the Styler as the cell's last expression so the notebook renders it
# (presumably; run as a plain script this line would have no visible effect).
styled_df

In [None]:
# Main regression restricted to observations whose 'Change in SWAV' lies
# above a quantile cutoff of that column. For every holding period (days) and
# every index, record the coefficient and p-value of 'Change in SWAV'.

# One entry per holding period: [days, (coef, p_value), (coef, p_value), ...]
results = []

indices = ['FTSE100', 'MSCIUK', 'FTSEAllShare', 'FTSE250', 'FTSESmallCap', 'FTSEAIM']
days_list = [1, 3, 5, 10, 20]

# Rows are kept when 'Change in SWAV' is strictly above this quantile, i.e.
# with 0.3 roughly the highest 70% of changes are used.
# NOTE(review): earlier comments described this as "above the 90th percentile
# (top 10%)" while the code used 0.3 -- confirm which threshold is intended.
swav_quantile = 0.3

for days in days_list:
    df = globals()[f'df_{days}_day']  # Dynamically access each DataFrame

    # The date handling and month dummies do not depend on the index, so they
    # are prepared once per DataFrame rather than once per regression.
    df['Date'] = pd.to_datetime(df['Date'])
    df['Month'] = df['Date'].dt.month

    # drop_first=True omits the first month level, which becomes the baseline;
    # astype(int) turns the boolean dummies into 0/1.
    month_dummies = pd.get_dummies(df['Month'], prefix='Month', drop_first=True).astype(int)

    result_row = [days]  # Start the row with the number of days
    for index in indices:
        # Dependent variable, with non-numeric entries coerced to NaN
        y = pd.to_numeric(df[f'% {index} Change'], errors='coerce')

        # Independent variables, coerced to numeric before the quantile is
        # taken so the cutoff is computed on numbers only.
        X = df[['Change in SWAV',
                'ADS_Change',
                'EPU_Change',
                f'Previous % {index} Change',
                '% MSCI Change',
                'Vix Close',
                'Rolling_Avg_Change_in_DCC']]
        X = X.apply(pd.to_numeric, errors='coerce')

        # Cutoff value of 'Change in SWAV' and the rows strictly above it
        # (a NaN change compares False and is therefore excluded).
        swav_cutoff = X['Change in SWAV'].quantile(swav_quantile)
        above_cutoff_mask = X['Change in SWAV'] > swav_cutoff
        X = X[above_cutoff_mask]
        y = y.loc[above_cutoff_mask]

        # Add the month dummies for the same rows
        X = pd.concat([X, month_dummies.loc[above_cutoff_mask]], axis=1)

        # Keep only rows that are complete in BOTH X and y. statsmodels' OLS
        # does not drop missing values by default, so a NaN left in y would
        # turn every estimated coefficient into NaN.
        complete_rows = X.notna().all(axis=1) & y.notna()
        X = X.loc[complete_rows]
        y = y.loc[complete_rows]

        if not X.empty:  # Proceed only if there's valid data
            X = sm.add_constant(X)
            model = sm.OLS(y, X).fit()

            # Coefficient and p-value for 'Change in SWAV'
            coef = round(model.params['Change in SWAV'], 2)
            p_value = round(model.pvalues['Change in SWAV'], 5)
            result_row.append((coef, p_value))
        else:
            result_row.append((None, None))  # No valid data for this regression

    results.append(result_row)

# "Days" first, then one coefficient column per index
columns = ['Days'] + [f'{index} Coef' for index in indices]

# The displayed table holds only the coefficients; the p-values stay in
# `results`, where the styling step can look them up.
results_df = pd.DataFrame(
    [[row[0]] + [coef_value for coef_value, _ in row[1:]] for row in results],
    columns=columns,
)

def color_rows(row, original_results):
    """Return one CSS string per cell for a row of the coefficient table.

    Args:
        row: A row of the results table; ``row.name`` is used as the position
            of the matching entry in ``original_results``.
        original_results: List of rows shaped like
            ``[days, (coef, p_value), (coef, p_value), ...]``.

    Returns:
        A list as long as ``row``. The first entry (the Days column) is always
        ''. A coefficient cell is green when its p-value is below 0.1 and the
        coefficient is positive, red when it is below 0.1 and negative, and
        '' in every other case (not significant, zero, None, or not a tuple).
    """
    styles = ['']  # The Days column is never coloured
    for position in range(1, len(row)):
        entry = original_results[row.name][position]
        css = ''
        if isinstance(entry, tuple):
            coef, p_value = entry
            # Only significant results (p-value < 0.1) get a colour
            if coef is not None and p_value < 0.1:
                if coef > 0:
                    css = 'background-color: green'
                elif coef < 0:
                    css = 'background-color: red'
        styles.append(css)
    return styles

# Style the table one row at a time (axis=1). `original_results=results` is
# forwarded to color_rows, which reads each cell's p-value from it because
# results_df itself only holds the coefficients.
styled_df = results_df.style.apply(color_rows, axis=1, original_results=results)

# Leave the Styler as the cell's last expression so the notebook renders it
# (presumably; run as a plain script this line would have no visible effect).
styled_df