In [1]:
import numpy as np
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
import matplotlib.pyplot as plt
import pandas as pd
from linearmodels.panel import PanelOLS

In [2]:
import os
# Get the current working directory
current_dir = os.getcwd()
# Navigate to the parent folder
parent_dir = os.path.dirname(current_dir)


In [3]:
file_path = os.path.join(parent_dir, 'Final_ML_Table.csv')
ml_df = pd.read_csv(file_path)

In [4]:
# Dropping rows where imputation is not possible
clean_df = ml_df.dropna(subset=['ACTUAL', 'ggroup', 'Next_quarter_earnings', 'Earnings_Surprise', 'mkvaltq']).copy()
# Imputing mean for the rest of the missing values for columns atq and ibq based on the ggroup
clean_df.loc[:,['atq', 'ibq']] = clean_df.groupby('ggroup')[['atq', 'ibq']].transform(lambda x: x.fillna(x.mean()))
# Computing market cap
clean_df['market_cap'] = clean_df['cshoc'] * clean_df['prccd']
# Scaling next quarter earnings by market cap
clean_df.loc[:,'Scaled_nextq_earnings'] = clean_df.loc[:,'Next_quarter_earnings'] / clean_df.loc[:,'market_cap']
# Creating an interaction term between positive and negative sentiment and buzzscore
clean_df.loc[:,'Positive_buzzscore'] = clean_df.loc[:,'Positive'] * clean_df.loc[:,'buzzscore']
clean_df.loc[:,'Negative_buzzscore'] = clean_df.loc[:,'Negative'] * clean_df.loc[:,'buzzscore']
# Creating an interaction term between buzzscore and Positive and Negative Counts
clean_df.loc[:,'Positive_buzzscore_count'] = clean_df.loc[:,'Positive_count'] * clean_df.loc[:,'buzzscore']
clean_df.loc[:,'Negative_buzzscore_count'] = clean_df.loc[:,'Negative_count'] * clean_df.loc[:,'buzzscore']
# Creating all the same interaction terms but with buzzcount instead of buzzscore
clean_df.loc[:,'Positive_buzzcount'] = clean_df.loc[:,'Positive'] * clean_df.loc[:,'buzzcount']
clean_df.loc[:,'Negative_buzzcount'] = clean_df.loc[:,'Negative'] * clean_df.loc[:,'buzzcount']
clean_df.loc[:,'Positive_buzzcount_count'] = clean_df.loc[:,'Positive_count'] * clean_df.loc[:,'buzzcount']
clean_df.loc[:,'Negative_buzzcount_count'] = clean_df.loc[:,'Negative_count'] * clean_df.loc[:,'buzzcount']

In [5]:
# Indexing based on quarter and year for the FE model
clean_df['datacqtr'] = pd.PeriodIndex(clean_df['datacqtr'], freq='Q')
# Remove 2009Q1 and 2022Q2 data
clean_df = clean_df[(clean_df['datacqtr'] != '2009Q1') & (clean_df['datacqtr'] != '2022Q2')]
# Sorting the dataframe based on gvkey and datacqtr
clean_df.set_index(['gvkey', 'datacqtr'], inplace=True)
# Creating dummy variables
clean_df = pd.get_dummies(clean_df, columns=['ggroup', 'Sentiment'], drop_first=True)
# more interaction terms
clean_df['dom_negative_buzzscore'] = clean_df['Sentiment_Negative'] * clean_df['buzzscore']
clean_df['dom_positive_buzzscore'] = clean_df['Sentiment_Positive'] * clean_df['buzzscore']
clean_df['dom_negative_buzzcount'] = clean_df['Sentiment_Negative'] * clean_df['buzzcount']
clean_df['dom_positive_buzzcount'] = clean_df['Sentiment_Positive'] * clean_df['buzzcount']
# Interaction with total tokens
clean_df['Total_tokens_buzzscore'] = clean_df['total_tokens'] * clean_df['buzzscore']
clean_df['Total_tokens_buzzcount'] = clean_df['total_tokens'] * clean_df['buzzcount']

In [11]:
# Creating the dependent and independent variables
dependent_vars_1 = ['Next_quarter_earnings', 'Next_Quarter_Open_unadj']
exog_vars_1 = ['buzzscore', 'atq', 'ibq', 'Earnings_Surprise', 'nonadjclose', 'Positive_buzzscore', 'Positive', 'Negative_buzzscore',
              'Negative', 'Uncertainty', 'Litigious', 'Constraining', 'Weak_Modal', 'Strong_Modal', 'total_tokens', 'cshoc']
exog_vars_2 = ['buzzscore', 'atq', 'ibq', 'Earnings_Surprise', 'nonadjclose', 'Positive_buzzscore_count', 'Positive_count',
                'Negative_buzzscore_count', 'Negative_count', 'Uncertainty_count', 'total_tokens', 'cshoc']
exog_vars_3 = ['buzzcount', 'atq', 'ibq', 'Earnings_Surprise', 'nonadjclose', 'Positive_buzzcount', 'Positive', 'Negative_buzzcount',
              'Negative', 'Uncertainty', 'Litigious', 'Constraining', 'Weak_Modal', 'Strong_Modal', 'total_tokens', 'cshoc']
exog_vars_4 = ['buzzcount', 'atq', 'ibq', 'Earnings_Surprise', 'nonadjclose', 'Positive_buzzcount_count', 'Positive_count',
                'Negative_buzzcount_count', 'Negative_count', 'Uncertainty_count', 'total_tokens', 'cshoc']
# Two more models with different exog variables namely dom_negative_buzzscore, dom_positive_buzzscore, dom_negative_buzzcount, 
# dom_positive_buzzcount and the interaction terms with buzzscore and buzzcount
exog_vars_5 = ['buzzscore', 'atq', 'ibq', 'Earnings_Surprise', 'nonadjclose', 'Sentiment_Negative', 'Sentiment_Positive', 
               'dom_negative_buzzscore','dom_positive_buzzscore', 'Sentiment_Strong_Modal', 'Sentiment_Uncertainty', 'total_tokens']
exog_vars_6 = ['buzzcount', 'atq', 'ibq', 'Earnings_Surprise', 'nonadjclose', 'Sentiment_Negative', 'Sentiment_Positive', 
                'dom_negative_buzzcount', 'dom_positive_buzzcount', 'Sentiment_Strong_Modal', 'Sentiment_Uncertainty', 'total_tokens']
# Another two exog vars set ups for control
exog_vars_7 = ['buzzscore', 'atq', 'ibq', 'Earnings_Surprise', 'nonadjclose', 'Total_tokens_buzzscore', 'total_tokens']
exog_vars_8 = ['buzzcount', 'atq', 'ibq', 'Earnings_Surprise', 'nonadjclose', 'Total_tokens_buzzcount', 'total_tokens']


# The industry groupings are perfectly explained by the other variables used for FE model
temp = ['ggroup_Banks', 'ggroup_Capital Goods', 'ggroup_Commercial & Professional Services',
             'ggroup_Consumer Durables & Apparel', 'ggroup_Consumer Services', 'ggroup_Diversified Financials',
             'ggroup_Energy', 'ggroup_Food & Staples Retailing', 'ggroup_Food, Beverage & Tobacco', 
             'ggroup_Health Care Equipment & Services', 'ggroup_Household & Personal Products', 'ggroup_Insurance',
             'ggroup_Materials', 'ggroup_Media & Entertainment', 'ggroup_Pharmaceuticals, Biotechnology & Life Sciences',
             'ggroup_Real Estate', 'ggroup_Retailing', 'ggroup_Software & Services', 'ggroup_Technology Hardware & Equipment',
             'ggroup_Semiconductors & Semiconductor Equipment', 'ggroup_Telecommunication Services', 'ggroup_Utilities',
             'ggroup_Transportation']

In [7]:
# Drop rows with 'NaT' in the index
clean_df = clean_df[clean_df.index.get_level_values('datacqtr').notna()]

In [8]:
# Convert 'datacqtr' index to a period index with a quarterly frequency
clean_df.index = clean_df.index.set_levels([clean_df.index.levels[0], clean_df.index.levels[1].start_time])

Model with buzzscore and percentage sentiment

In [12]:
for dep_var in dependent_vars_1:
    model = PanelOLS(clean_df[dep_var], clean_df[exog_vars_1], entity_effects=True, drop_absorbed=True)
    results = model.fit(cov_type='robust')
    print(f'Results for {dep_var}:\n')
    print(results.summary)
    print('\n\n')

Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)


Results for Next_quarter_earnings:

                            PanelOLS Estimation Summary                            
Dep. Variable:     Next_quarter_earnings   R-squared:                        0.2015
Estimator:                      PanelOLS   R-squared (Between):              0.4046
No. Observations:                  89126   R-squared (Within):               0.2015
Date:                   Thu, Jun 13 2024   R-squared (Overall):              0.3854
Time:                           14:21:06   Log-likelihood                -6.621e+05
Cov. Estimator:                   Robust                                           
                                           F-statistic:                      1426.8
Entities:                           4296   P-value                           0.0000
Avg Obs:                          20.746   Distribution:                F(15,84815)
Min Obs:                          1.0000                                           
Max Obs:                          101.00

Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)


Results for Next_Quarter_Open_unadj:

                             PanelOLS Estimation Summary                             
Dep. Variable:     Next_Quarter_Open_unadj   R-squared:                        0.8975
Estimator:                        PanelOLS   R-squared (Between):              0.9929
No. Observations:                    89060   R-squared (Within):               0.8975
Date:                     Thu, Jun 13 2024   R-squared (Overall):              0.9612
Time:                             14:21:07   Log-likelihood                -3.772e+05
Cov. Estimator:                     Robust                                           
                                             F-statistic:                   4.947e+04
Entities:                             4294   P-value                           0.0000
Avg Obs:                            20.741   Distribution:                F(15,84751)
Min Obs:                            1.0000                                           
Max Obs:        

Model with buzzscore and count sentiment

In [10]:
for dep_var in dependent_vars_1:
    model = PanelOLS(clean_df[dep_var], clean_df[exog_vars_2], entity_effects=True, drop_absorbed=True)
    results = model.fit(cov_type='robust')
    print(f'Results for {dep_var}:\n')
    print(results.summary)
    print('\n\n')

Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)


Results for Next_quarter_earnings:

                            PanelOLS Estimation Summary                            
Dep. Variable:     Next_quarter_earnings   R-squared:                        0.2018
Estimator:                      PanelOLS   R-squared (Between):              0.3831
No. Observations:                  89126   R-squared (Within):               0.2018
Date:                   Wed, Jun 12 2024   R-squared (Overall):              0.3894
Time:                           00:51:29   Log-likelihood                -6.621e+05
Cov. Estimator:                   Robust                                           
                                           F-statistic:                      1786.8
Entities:                           4296   P-value                           0.0000
Avg Obs:                          20.746   Distribution:                F(12,84818)
Min Obs:                          1.0000                                           
Max Obs:                          101.00

Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)


Results for Next_Quarter_Open_unadj:

                             PanelOLS Estimation Summary                             
Dep. Variable:     Next_Quarter_Open_unadj   R-squared:                        0.8975
Estimator:                        PanelOLS   R-squared (Between):              0.9944
No. Observations:                    89060   R-squared (Within):               0.8975
Date:                     Wed, Jun 12 2024   R-squared (Overall):              0.9692
Time:                             00:51:30   Log-likelihood                -3.772e+05
Cov. Estimator:                     Robust                                           
                                             F-statistic:                   6.182e+04
Entities:                             4294   P-value                           0.0000
Avg Obs:                            20.741   Distribution:                F(12,84754)
Min Obs:                            1.0000                                           
Max Obs:        

Model with buzzcount and percentage sentiment

In [13]:
for dep_var in dependent_vars_1:
    model = PanelOLS(clean_df[dep_var], clean_df[exog_vars_3], entity_effects=True, drop_absorbed=True)
    results = model.fit(cov_type='robust')
    print(f'Results for {dep_var}:\n')
    print(results.summary)
    print('\n\n')

Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)


Results for Next_quarter_earnings:

                            PanelOLS Estimation Summary                            
Dep. Variable:     Next_quarter_earnings   R-squared:                        0.2017
Estimator:                      PanelOLS   R-squared (Between):              0.4066
No. Observations:                  89126   R-squared (Within):               0.2017
Date:                   Thu, Jun 13 2024   R-squared (Overall):              0.3866
Time:                           14:21:21   Log-likelihood                -6.621e+05
Cov. Estimator:                   Robust                                           
                                           F-statistic:                      1428.3
Entities:                           4296   P-value                           0.0000
Avg Obs:                          20.746   Distribution:                F(15,84815)
Min Obs:                          1.0000                                           
Max Obs:                          101.00

Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)


Results for Next_Quarter_Open_unadj:

                             PanelOLS Estimation Summary                             
Dep. Variable:     Next_Quarter_Open_unadj   R-squared:                        0.8975
Estimator:                        PanelOLS   R-squared (Between):              0.9929
No. Observations:                    89060   R-squared (Within):               0.8975
Date:                     Thu, Jun 13 2024   R-squared (Overall):              0.9612
Time:                             14:21:22   Log-likelihood                -3.772e+05
Cov. Estimator:                     Robust                                           
                                             F-statistic:                   4.947e+04
Entities:                             4294   P-value                           0.0000
Avg Obs:                            20.741   Distribution:                F(15,84751)
Min Obs:                            1.0000                                           
Max Obs:        

Model with buzzcount and count sentiment

In [12]:
for dep_var in dependent_vars_1:
    model = PanelOLS(clean_df[dep_var], clean_df[exog_vars_4], entity_effects=True, drop_absorbed=True, )
    results = model.fit(cov_type='robust')
    print(f'Results for {dep_var}:\n')
    print(results.summary)
    print('\n\n')

Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)


Results for Next_quarter_earnings:

                            PanelOLS Estimation Summary                            
Dep. Variable:     Next_quarter_earnings   R-squared:                        0.2017
Estimator:                      PanelOLS   R-squared (Between):              0.3803
No. Observations:                  89126   R-squared (Within):               0.2017
Date:                   Wed, Jun 12 2024   R-squared (Overall):              0.3889
Time:                           00:51:33   Log-likelihood                -6.621e+05
Cov. Estimator:                   Robust                                           
                                           F-statistic:                      1786.1
Entities:                           4296   P-value                           0.0000
Avg Obs:                          20.746   Distribution:                F(12,84818)
Min Obs:                          1.0000                                           
Max Obs:                          101.00

Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)


Results for Next_Quarter_Open_unadj:

                             PanelOLS Estimation Summary                             
Dep. Variable:     Next_Quarter_Open_unadj   R-squared:                        0.8975
Estimator:                        PanelOLS   R-squared (Between):              0.9944
No. Observations:                    89060   R-squared (Within):               0.8975
Date:                     Wed, Jun 12 2024   R-squared (Overall):              0.9692
Time:                             00:51:33   Log-likelihood                -3.772e+05
Cov. Estimator:                     Robust                                           
                                             F-statistic:                   6.183e+04
Entities:                             4294   P-value                           0.0000
Avg Obs:                            20.741   Distribution:                F(12,84754)
Min Obs:                            1.0000                                           
Max Obs:        

## Extra

In [16]:
for dep_var in dependent_vars_1:
    model = PanelOLS(clean_df[dep_var], clean_df[exog_vars_5], entity_effects=True, drop_absorbed=True)
    results = model.fit()
    print(f'Results for {dep_var}:\n')
    print(results.summary)
    print('\n\n')

Results for Next_quarter_earnings:

                            PanelOLS Estimation Summary                            
Dep. Variable:     Next_quarter_earnings   R-squared:                        0.1736
Estimator:                      PanelOLS   R-squared (Between):              0.5468
No. Observations:                  89149   R-squared (Within):               0.1736
Date:                   Thu, May 30 2024   R-squared (Overall):              0.4581
Time:                           18:52:00   Log-likelihood                -6.638e+05
Cov. Estimator:               Unadjusted                                           
                                           F-statistic:                      1485.5
Entities:                           4296   P-value                           0.0000
Avg Obs:                          20.752   Distribution:                F(12,84841)
Min Obs:                          1.0000                                           
Max Obs:                          101.00

Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)


Results for Next_Quarter_Open:

                          PanelOLS Estimation Summary                           
Dep. Variable:      Next_Quarter_Open   R-squared:                        0.8974
Estimator:                   PanelOLS   R-squared (Between):              0.9946
No. Observations:               89083   R-squared (Within):               0.8974
Date:                Thu, May 30 2024   R-squared (Overall):              0.9694
Time:                        18:52:01   Log-likelihood                -3.773e+05
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                   6.176e+04
Entities:                        4294   P-value                           0.0000
Avg Obs:                       20.746   Distribution:                F(12,84777)
Min Obs:                       1.0000                                           
Max Obs:                       101.00   F-statistic (robust):          6.176e

In [17]:
for dep_var in dependent_vars_1:
    model = PanelOLS(clean_df[dep_var], clean_df[exog_vars_6], entity_effects=True, drop_absorbed=True)
    results = model.fit()
    print(f'Results for {dep_var}:\n')
    print(results.summary)
    print('\n\n')

Results for Next_quarter_earnings:

                            PanelOLS Estimation Summary                            
Dep. Variable:     Next_quarter_earnings   R-squared:                        0.1737
Estimator:                      PanelOLS   R-squared (Between):              0.5485
No. Observations:                  89149   R-squared (Within):               0.1737
Date:                   Thu, May 30 2024   R-squared (Overall):              0.4589
Time:                           18:52:05   Log-likelihood                -6.638e+05
Cov. Estimator:               Unadjusted                                           
                                           F-statistic:                      1486.3
Entities:                           4296   P-value                           0.0000
Avg Obs:                          20.752   Distribution:                F(12,84841)
Min Obs:                          1.0000                                           
Max Obs:                          101.00

Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)


Results for Next_Quarter_Open:

                          PanelOLS Estimation Summary                           
Dep. Variable:      Next_Quarter_Open   R-squared:                        0.8974
Estimator:                   PanelOLS   R-squared (Between):              0.9946
No. Observations:               89083   R-squared (Within):               0.8974
Date:                Thu, May 30 2024   R-squared (Overall):              0.9695
Time:                        18:52:05   Log-likelihood                -3.773e+05
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                   6.176e+04
Entities:                        4294   P-value                           0.0000
Avg Obs:                       20.746   Distribution:                F(12,84777)
Min Obs:                       1.0000                                           
Max Obs:                       101.00   F-statistic (robust):          6.176e

In [12]:
for dep_var in dependent_vars_1:
    model = PanelOLS(clean_df[dep_var], clean_df[exog_vars_7], entity_effects=True, drop_absorbed=True)
    results = model.fit()
    print(f'Results for {dep_var}:\n')
    print(results.summary)
    print('\n\n')

Results for Next_quarter_earnings:

                            PanelOLS Estimation Summary                            
Dep. Variable:     Next_quarter_earnings   R-squared:                        0.1736
Estimator:                      PanelOLS   R-squared (Between):              0.5474
No. Observations:                  89149   R-squared (Within):               0.1736
Date:                   Sun, Jun 02 2024   R-squared (Overall):              0.4571
Time:                           14:58:55   Log-likelihood                -6.638e+05
Cov. Estimator:               Unadjusted                                           
                                           F-statistic:                      2546.7
Entities:                           4296   P-value                           0.0000
Avg Obs:                          20.752   Distribution:                 F(7,84846)
Min Obs:                          1.0000                                           
Max Obs:                          101.00

Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)


Results for Next_Quarter_Open:

                          PanelOLS Estimation Summary                           
Dep. Variable:      Next_Quarter_Open   R-squared:                        0.8973
Estimator:                   PanelOLS   R-squared (Between):              0.9944
No. Observations:               89083   R-squared (Within):               0.8973
Date:                Sun, Jun 02 2024   R-squared (Overall):              0.9693
Time:                        14:58:56   Log-likelihood                -3.774e+05
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                   1.059e+05
Entities:                        4294   P-value                           0.0000
Avg Obs:                       20.746   Distribution:                 F(7,84782)
Min Obs:                       1.0000                                           
Max Obs:                       101.00   F-statistic (robust):          1.059e

In [13]:
for dep_var in dependent_vars_1:
    model = PanelOLS(clean_df[dep_var], clean_df[exog_vars_8], entity_effects=True, drop_absorbed=True)
    results = model.fit()
    print(f'Results for {dep_var}:\n')
    print(results.summary)
    print('\n\n')

Results for Next_quarter_earnings:

                            PanelOLS Estimation Summary                            
Dep. Variable:     Next_quarter_earnings   R-squared:                        0.1736
Estimator:                      PanelOLS   R-squared (Between):              0.5500
No. Observations:                  89149   R-squared (Within):               0.1736
Date:                   Sun, Jun 02 2024   R-squared (Overall):              0.4585
Time:                           14:58:56   Log-likelihood                -6.638e+05
Cov. Estimator:               Unadjusted                                           
                                           F-statistic:                      2545.7
Entities:                           4296   P-value                           0.0000
Avg Obs:                          20.752   Distribution:                 F(7,84846)
Min Obs:                          1.0000                                           
Max Obs:                          101.00

Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)


Results for Next_Quarter_Open:

                          PanelOLS Estimation Summary                           
Dep. Variable:      Next_Quarter_Open   R-squared:                        0.8973
Estimator:                   PanelOLS   R-squared (Between):              0.9945
No. Observations:               89083   R-squared (Within):               0.8973
Date:                Sun, Jun 02 2024   R-squared (Overall):              0.9693
Time:                        14:58:57   Log-likelihood                -3.773e+05
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                   1.059e+05
Entities:                        4294   P-value                           0.0000
Avg Obs:                       20.746   Distribution:                 F(7,84782)
Min Obs:                       1.0000                                           
Max Obs:                       101.00   F-statistic (robust):          1.059e