In [38]:
# Install necessary libraries. 
!pip install pandas
!pip install statsmodels



In [39]:
# Import necessary libraries.
import pandas as pd
from sklearn.linear_model import LinearRegression
from datetime import datetime, timedelta
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm

In [40]:
# Read in the data sets. 
df_jan = pd.read_csv('data/filtered_stocks_23Jan.csv')
df_stock_outstanding = pd.read_csv('data/stock_descriptions_with_marketCap.csv')

In [41]:
# Separating SPY and other stocks. 
stocks_to_remove = ['DIA', 'ONEQ', 'SPY', 'VOO']
filtered_df_jan = df_jan[~df_jan['Ticker_Symbol'].isin(stocks_to_remove)]
filtered_df_jan_spy = df_jan[df_jan['Ticker_Symbol'] == 'SPY']

In [42]:
# rm is the daily market return.
rm_jan = {'Date-Time': filtered_df_jan_spy['Date-Time'], 'RM': (filtered_df_jan_spy['Close'] - filtered_df_jan_spy['Open']) / filtered_df_jan_spy['Open'] * 100}
df_rm_jan = pd.DataFrame(rm_jan)

# rt is the daily return for each stock. 
rt_jan = {'Date-Time': filtered_df_jan['Date-Time'], 'RT': (filtered_df_jan['Close'] - filtered_df_jan['Open']) / filtered_df_jan['Open'] * 100, 'Ticker_Symbol': filtered_df_jan['Ticker_Symbol']}
df_rt_jan = pd.DataFrame(rt_jan)

# Extract different stock symbols. 
unique_symbol_jan = filtered_df_jan['Ticker_Symbol'].unique()
grouped_df_jan = filtered_df_jan.groupby('Ticker_Symbol')

In [43]:
# Calculate the error term in the linear regression. 
all_res = {}

for ticker in unique_symbol_jan:
    each_ticker = df_rt_jan[df_rt_jan['Ticker_Symbol'] == ticker]
    aligned_data = pd.merge(df_rm_jan, each_ticker, on='Date-Time', suffixes=('_SPY', '_Ticker'))
    x = aligned_data['RM'].values.reshape(-1, 1)
    y = aligned_data['RT'].values.reshape(-1, 1)
    
    model = LinearRegression()
    model.fit(x, y)


    y_pred = model.predict(x) 
    #print(y_pred)

    daily_error = y - y_pred
    
    all_res[ticker] = list(zip(aligned_data['Date-Time'], daily_error.flatten()))

for ticker, records in all_res.items():
    print(f"Details for {ticker}:")
    for date, error in records:
        print(f"Date: {date}, Error: {error:.4f}")
    print()

Details for HRL:
Date: 2021-02-23, Error: -0.4663
Date: 2021-02-22, Error: 0.2900
Date: 2021-02-19, Error: -0.8692
Date: 2021-02-18, Error: 0.9867
Date: 2021-02-17, Error: -0.1549
Date: 2021-02-16, Error: -2.1618
Date: 2021-02-12, Error: -0.2499
Date: 2021-02-11, Error: -3.0444
Date: 2021-02-10, Error: 0.2782
Date: 2021-02-09, Error: -0.9762
Date: 2021-02-08, Error: 1.3238
Date: 2021-02-05, Error: -0.2315
Date: 2021-02-04, Error: 1.8084
Date: 2021-02-03, Error: -0.9552
Date: 2021-02-02, Error: 3.6109
Date: 2021-02-01, Error: 0.4387
Date: 2021-01-29, Error: -2.1403
Date: 2021-01-28, Error: -4.0668
Date: 2021-01-27, Error: 4.3662
Date: 2021-01-26, Error: 3.2948
Date: 2021-01-25, Error: 3.5505
Date: 2021-01-22, Error: 0.2961
Date: 2021-01-21, Error: 1.4143
Date: 2021-01-20, Error: -0.1280
Date: 2021-01-19, Error: -0.6214
Date: 2021-01-15, Error: 2.0598
Date: 2021-01-14, Error: -0.5941
Date: 2021-01-13, Error: 0.6535
Date: 2021-01-12, Error: -0.7687
Date: 2021-01-11, Error: -2.2459
Date: 2

In [44]:
def calculate_CAR(errors_with_dates, ticker, event_date_str, k):
    """
    Calculate the Cumulative Abnormal Return (CAR) for a given ticker around an event date,
    considering only the dates present in the errors_with_dates dataset.

    Parameters:
    - errors_with_dates: Dict, with ticker symbols as keys and lists of (date, error) tuples as values.
    - ticker: String, the ticker symbol for which to calculate CAR.
    - event_date_str: String, the event date in 'YYYY-MM-DD' format.
    - k: Integer, the number of days before and after the event date to consider, based on available data.

    Returns:
    - CAR: Float, the Cumulative Abnormal Return for the specified window around the event date.
    """
    event_date = datetime.strptime(event_date_str, '%Y-%m-%d')
    date_errors = sorted(errors_with_dates.get(ticker, []), key=lambda x: datetime.strptime(x[0], '%Y-%m-%d'))
    event_date_indexes = [i for i, date_error in enumerate(date_errors) if date_error[0] == event_date_str]
    
    if not event_date_indexes:
        return 0 
    
    event_date_index = event_date_indexes[0]
    
    start_index = max(0, event_date_index - k)
    end_index = min(len(date_errors) - 1, event_date_index + k)
    CAR = sum(error for _, error in date_errors[start_index:end_index + 1])

    return CAR

In [45]:
# filtered_df_jan is the df without 'DIA', 'ONEQ', 'SPY', 'VOO'
# volatility_jan is the filtered_df_jan with a 'Volatility' column
volatility_jan = filtered_df_jan.copy()
volatility_jan['Volatility'] = volatility_jan['High'] - volatility_jan['Low'] 

In [46]:
# merged_jan only contains columns ['Date-Time', 'Open', 'Close', 'High', 'Low', 'Volume', 'Ticker_Symbol', 'Volatility', 'Sector', 'Industry', '2023']
temp_df_jan = volatility_jan.copy()
merged_jan = pd.merge(temp_df_jan, df_stock_outstanding, left_on='Ticker_Symbol', right_on='Symbol', how='left')
merged_jan = merged_jan[['Date-Time', 'Open', 'Close', 'High', 'Low', 'Volume', 'Ticker_Symbol', 'Volatility', 'Sector', 'Industry', '2021']]

In [47]:
# Turnover of jan (2021)
df_turnover_jan = merged_jan.copy()
df_turnover_jan['Turnover'] = df_turnover_jan['Volume'] / df_turnover_jan['2021']

In [48]:
# Calculate Market Capitalization
df_market_cap_2021 = df_turnover_jan.copy()
df_market_cap_2021['Market_Cap'] = np.log(df_market_cap_2021['Close'] * df_market_cap_2021['2021'])

In [49]:
# Change Inductry to numbers
# Finalised DataFrame df_2021
# 2021
df_2021 = df_market_cap_2021.copy()
df_2021['Industry_name'] = df_2021['Industry']
df_2021['Industry'] = df_2021['Industry'].astype('category')
df_2021['Industry'] = df_2021['Industry'].cat.codes

In [50]:
# event date 2021-01-25
temp_df_2021 = df_2021.copy()

features_list = []  # To store features for each ticker

for ticker in temp_df_2021['Ticker_Symbol'].unique():
    selected_rows = temp_df_2021[temp_df_2021['Ticker_Symbol'] == ticker]
    selected_row = selected_rows[selected_rows['Date-Time'] == '2021-01-25']
    
    y_value = calculate_CAR(all_res, ticker, '2021-01-25', 7)
    if not selected_row.empty: 
        X = selected_row[['Industry', 'Turnover', 'Market_Cap', 'Close', 'Volatility']]
        X = pd.get_dummies(X, columns=['Industry'], drop_first=True)
        features = X.iloc[0].values 
        features_list.append(np.append(features, y_value)) 

df_features = pd.DataFrame(features_list)
X = df_features.iloc[:, :-1]
y = df_features.iloc[:, -1]
X = sm.add_constant(X)

model = sm.OLS(y, X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                      4   R-squared:                       0.044
Model:                            OLS   Adj. R-squared:                 -0.148
Method:                 Least Squares   F-statistic:                    0.2276
Date:                Sat, 23 Mar 2024   Prob (F-statistic):              0.920
Time:                        23:09:47   Log-Likelihood:                -72.920
No. Observations:                  25   AIC:                             155.8
Df Residuals:                      20   BIC:                             161.9
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         11.9419     18.623      0.641      0.5