In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from scipy.stats import chi2_contingency, ttest_ind, shapiro
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.graphics.gofplots import qqplot
from statsmodels.stats.stattools import durbin_watson
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.iolib.summary2 import summary_col
from statsmodels.tools.tools import add_constant
from sklearn.preprocessing import StandardScaler  # Ensure this is imported
import os  # For folder creation
import ast  # For safely evaluating string representations of lists

In [None]:
# Absolute location of the CSV file that holds the final regression dataset
filepath = "D:/daten_masterarbeit/final_dataset_reg_full.csv"

# Load the whole file into a DataFrame
df = pd.read_csv(filepath_or_buffer=filepath)

# Report how many rows were loaded
print(f"Number of observations in the final_dataset: {len(df)}")

#%% Data Preparation

# Variables to include in the analysis, grouped by role.
# The groups are concatenated in order into a single flat list.
variables = (
    # Similarity measures
    [
        'similarity_to_overall_average',
        'similarity_to_industry_average',
        'similarity_to_company_average',
    ]
    # Excess returns
    + [
        'excess_ret_immediate',
        'excess_ret_short_term',
        'excess_ret_medium_term',
        'excess_ret_long_term',
    ]
    # epsfxq columns
    + [
        'epsfxq',
        'epsfxq_next',
    ]
    # Dependent variables
    + [
        'length_participant_questions',
        'length_management_answers',
    ]
    # Control variables
    + [
        'market_cap',
        'rolling_beta',
        'ceo_participates',
        'ceo_cfo_change',
        'word_length_presentation',
    ]
    # Topic columns: the first two for the Chi-Squared test,
    # the last one for topic diversity
    + [
        'participant_question_topics',
        'management_answer_topics',
        'filtered_presentation_topics',
    ]
)


Number of observations in the final_dataset: 50073
Number of observations after dropping NaNs: 41500


In [17]:

# Fail fast when any required column is absent from the loaded data;
# the missing names are reported in the order they appear in `variables`
missing_vars = [name for name in variables if name not in df.columns]
if len(missing_vars) > 0:
    raise KeyError(f"The following required columns are missing from the DataFrame: {missing_vars}")

# Restrict to the analysis variables, then keep only rows without any NaN
analysis_df = df.loc[:, variables].dropna()

# Report the sample size that remains after the NaN filter
print(f"Number of observations after dropping NaNs: {len(analysis_df)}")

Number of observations after dropping NaNs: 41500


In [18]:
# Display the column labels of analysis_df (bare last expression -> cell output)
analysis_df.columns

Index(['similarity_to_overall_average', 'similarity_to_industry_average',
       'similarity_to_company_average', 'excess_ret_immediate',
       'excess_ret_short_term', 'excess_ret_medium_term',
       'excess_ret_long_term', 'epsfxq', 'epsfxq_next',
       'length_participant_questions', 'length_management_answers',
       'market_cap', 'rolling_beta', 'ceo_participates', 'ceo_cfo_change',
       'word_length_presentation', 'participant_question_topics',
       'management_answer_topics', 'filtered_presentation_topics'],
      dtype='object')

In [19]:
# Narrow analysis_df to the three similarity measures, the two Q&A length
# variables and the presentation topic lists (rebinds the same name)
analysis_df = analysis_df.loc[:, [
    'similarity_to_overall_average',
    'similarity_to_industry_average',
    'similarity_to_company_average',
    'length_management_answers',
    'length_participant_questions',
    'filtered_presentation_topics',
]]

In [23]:
# Compare the average number of entries in 'filtered_presentation_topics'
# between the rows in the bottom and top 20% of
# 'similarity_to_industry_average'.
# `ast` and `pd` are already imported in the notebook's first cell, so they
# are not re-imported here.


def _mean_topic_count(frame, topics_column='filtered_presentation_topics'):
    """Return the mean number of entries per row in ``frame[topics_column]``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Rows to average over.
    topics_column : str
        Column whose cells are either strings holding a Python literal
        (parsed with ``ast.literal_eval``) or objects that support ``len``.

    Returns
    -------
    The mean of the per-row lengths.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a string cell is not a
        valid Python literal.
    """
    def _count(cell):
        # Strings are parsed safely (no eval); anything else is measured as-is
        return len(ast.literal_eval(cell)) if isinstance(cell, str) else len(cell)

    return frame[topics_column].apply(_count).mean()


industry_similarity = analysis_df['similarity_to_industry_average']

# Compute the 20th and 80th percentile cut-offs once
quantile_20 = industry_similarity.quantile(0.2)
quantile_80 = industry_similarity.quantile(0.8)

# Lowest 20% of similarity_to_industry_average (cut-off value included)
lowest_20_df = analysis_df[industry_similarity <= quantile_20]
lowest_20_avg_length = _mean_topic_count(lowest_20_df)

print(f"Average length of filtered_presentation_topics (Lowest 20%): {lowest_20_avg_length}")

# Highest 20% of similarity_to_industry_average (cut-off value included)
highest_20_df = analysis_df[industry_similarity >= quantile_80]
highest_20_avg_length = _mean_topic_count(highest_20_df)

print(f"Average length of filtered_presentation_topics (Highest 20%): {highest_20_avg_length}")


Average length of filtered_presentation_topics (Lowest 20%): 149.00506024096384
Average length of filtered_presentation_topics (Highest 20%): 130.65867469879518
