# STATISTICAL ANALYSIS
## Regression to explain bias

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm

In [11]:
# Labels of the forecast periods analysed in this notebook; each label selects
# one results/<label>_rf.csv file.
# NOTE(review): presumably Q* = quarterly and A* = annual horizons — confirm.
periods = "Q1 Q2 Q3 A1 A2".split()

In [12]:
# Load one results table per period, keyed by the period label.
data = {
    period: pd.read_csv(f"results/{period}_rf.csv")
    for period in periods
}

**Hypothesis Testing:**

- After each model is fitted, the coefficients and p-values for `post_regulation` ($\gamma$) and `N_analyst` ($\lambda$) are extracted (`N_analyst` enters only the second regression).
- The script checks if these coefficients are negative and if their p-values are less than 0.05 (indicating statistical significance).
- It then prints whether each coefficient is both negative and statistically significant.

## Regression for the regulation of November 2000

In [25]:
# Per-period OLS of the forecast bias on the two group-mean terms and the
# post-regulation dummy:
#     bias_AF_ML = a * alpha_i + b * beta_t + gamma * post_regulation + e

# First date counted as post-regulation. Dates are parsed before comparing
# because the raw string test `Date > '2000-10'` is lexicographic and would
# also flag every full date inside October 2000 (e.g. '2000-10-31').
REGULATION_START = pd.Timestamp('2000-11-01')
# Threshold below which a p-value is reported as significant.
SIGNIFICANCE_LEVEL = 0.05

results = {}
for period in periods:
    frame = data[period]
    frame.rename(columns={'numest': 'N_analyst'}, inplace=True)

    # assumes Date holds ISO-like date strings (YYYY-MM or YYYY-MM-DD) — TODO confirm
    dates = pd.to_datetime(frame['Date'])
    frame['post_regulation'] = np.where(dates >= REGULATION_START, 1, 0)

    # alpha_i is the average bias_AF_ML for each permno.
    frame['alpha_i'] = frame.groupby('permno')['bias_AF_ML'].transform('mean')

    # beta_t is the average bias_AF_ML for each Date.
    frame['beta_t'] = frame.groupby('Date')['bias_AF_ML'].transform('mean')

    # Independent variables. No constant is added on purpose: the fixed part
    # is taken to be captured by alpha_i and beta_t.
    X = frame[['alpha_i', 'beta_t', 'post_regulation']]

    # Dependent variable
    y = frame['bias_AF_ML']

    # Fit the regression model
    model = sm.OLS(y, X).fit()

    # Build the summary table once, then store and print the same object.
    summary = model.summary()
    results[period] = summary

    print(f"Summary for {period}:")
    print(summary)

    # Hypothesis test for gamma: is the post_regulation coefficient negative
    # and significant at SIGNIFICANCE_LEVEL?
    gamma_pvalue = model.pvalues['post_regulation']
    gamma_coefficient = model.params['post_regulation']

    print(f"\nHypothesis tests for {period}:")
    print(f"gamma (post_regulation) coefficient: {gamma_coefficient}, p-value: {gamma_pvalue}")

    gamma_significant = gamma_pvalue < SIGNIFICANCE_LEVEL
    gamma_negative = gamma_coefficient < 0

    print(f"Is gamma significant? {'Yes' if gamma_significant else 'No'}")
    print(f"Is gamma negative and significant? {'Yes' if gamma_negative and gamma_significant else 'No'}")


Summary for Q1:
                                 OLS Regression Results                                
Dep. Variable:             bias_AF_ML   R-squared (uncentered):                   0.297
Model:                            OLS   Adj. R-squared (uncentered):              0.297
Method:                 Least Squares   F-statistic:                          1.253e+05
Date:                Tue, 21 May 2024   Prob (F-statistic):                        0.00
Time:                        10:53:52   Log-Likelihood:                      1.0742e+06
No. Observations:              888694   AIC:                                 -2.148e+06
Df Residuals:                  888691   BIC:                                 -2.148e+06
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                      coef    std err          t      P>|t|      [0.025      0.975]
--------------------

## Regression for the regulation of November 2000 and `N_analyst`

In [28]:
# Per-period OLS of the forecast bias on the two group-mean terms, the
# N_analyst column and the post-regulation dummy:
#     bias_AF_ML = a * alpha_i + b * beta_t + lambda * N_analyst
#                  + gamma * post_regulation + e

# First date counted as post-regulation. Dates are parsed before comparing
# because the raw string test `Date > '2000-10'` is lexicographic and would
# also flag every full date inside October 2000 (e.g. '2000-10-31').
REGULATION_START = pd.Timestamp('2000-11-01')
# Threshold below which a p-value is reported as significant.
SIGNIFICANCE_LEVEL = 0.05

results = {}
for period in periods:
    frame = data[period]
    # No-op when the column has already been renamed by an earlier run.
    frame.rename(columns={'numest': 'N_analyst'}, inplace=True)

    # assumes Date holds ISO-like date strings (YYYY-MM or YYYY-MM-DD) — TODO confirm
    dates = pd.to_datetime(frame['Date'])
    frame['post_regulation'] = np.where(dates >= REGULATION_START, 1, 0)

    # alpha_i / beta_t: mean bias_AF_ML per permno and per Date respectively.
    frame['alpha_i'] = frame.groupby('permno')['bias_AF_ML'].transform('mean')
    frame['beta_t'] = frame.groupby('Date')['bias_AF_ML'].transform('mean')

    # Independent variables (no constant is added to the design matrix).
    X = frame[['alpha_i', 'beta_t', 'N_analyst', 'post_regulation']]

    # Dependent variable
    y = frame['bias_AF_ML']

    # Fit the regression model
    model = sm.OLS(y, X).fit()

    # Build the summary table once, then store and print the same object.
    summary = model.summary()
    results[period] = summary

    print(f"Summary for {period}:")
    print(summary)

    # Hypothesis tests for gamma and lambda: is each coefficient negative and
    # significant at SIGNIFICANCE_LEVEL?
    gamma_pvalue = model.pvalues['post_regulation']
    lambda_pvalue = model.pvalues['N_analyst']

    gamma_coefficient = model.params['post_regulation']
    lambda_coefficient = model.params['N_analyst']

    print(f"\nHypothesis tests for {period}:")
    print(f"gamma (post_regulation) coefficient: {gamma_coefficient}, p-value: {gamma_pvalue}")
    print(f"lambda (N_analyst) coefficient: {lambda_coefficient}, p-value: {lambda_pvalue}")

    gamma_significant = gamma_pvalue < SIGNIFICANCE_LEVEL
    lambda_significant = lambda_pvalue < SIGNIFICANCE_LEVEL
    gamma_negative = gamma_coefficient < 0
    lambda_negative = lambda_coefficient < 0

    print(f"Is gamma significant? {'Yes' if gamma_significant else 'No'}")
    print(f"Is gamma negative and significant? {'Yes' if gamma_negative and gamma_significant else 'No'}")
    print(f"Is lambda significant? {'Yes' if lambda_significant else 'No'}")
    print(f"Is lambda negative and significant? {'Yes' if lambda_negative and lambda_significant else 'No'}\n")



Summary for Q1:
                                 OLS Regression Results                                
Dep. Variable:             bias_AF_ML   R-squared (uncentered):                   0.298
Model:                            OLS   Adj. R-squared (uncentered):              0.298
Method:                 Least Squares   F-statistic:                          9.414e+04
Date:                Tue, 21 May 2024   Prob (F-statistic):                        0.00
Time:                        11:01:52   Log-Likelihood:                      1.0744e+06
No. Observations:              888694   AIC:                                 -2.149e+06
Df Residuals:                  888690   BIC:                                 -2.149e+06
Df Model:                           4                                                  
Covariance Type:            nonrobust                                                  
                      coef    std err          t      P>|t|      [0.025      0.975]
--------------------