In [29]:
# Enable auto-reloading of external modules - useful during development
%load_ext autoreload
%autoreload 2

# Configure Python path to find our custom modules
import sys
from pathlib import Path

# Add project root to the Python path for proper imports
project_root = Path.cwd().parent
if project_root not in sys.path:
    sys.path.insert(0, str(project_root))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [30]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tabulate import tabulate

import src.processing as processing
import src.config as lists

In [31]:
# Load and process data
df = processing.load_data("/Users/luis.m/Library/Mobile Documents/com~apple~CloudDocs/Documents ☁️/VSC Projects/Master_Thesis/data/raw/nvzfxcoxdvh1at7i.csv")
df_prepared = processing.prepare_data(df)
df_added_features = processing.create_all_model_features_orchestrated(df_prepared)
df_missing = processing.drop_missing_final_vars_streamlined(df_added_features, lists.final_set_A_predictor_names_and_dependent)
df_final = processing.annual_winsorize_variables(df_missing, lists.columns_to_winsorize)

  data = pd.read_csv(file_path)


Data loaded successfully from /Users/luis.m/Library/Mobile Documents/com~apple~CloudDocs/Documents ☁️/VSC Projects/Master_Thesis/data/raw/nvzfxcoxdvh1at7i.csv
Original number of observations: 317304
Number of columns after selection: 30
Observations after year filter (2000-2023): 302751
Observations after excluding financial and utility firms: 170598
Starting feature construction. Initial df shape: (170598, 30)
  Creating lags for: ['at', 'ni', 'rect', 'invt', 'ap', 'sale']

Performing pre-calculation validity checks & preparations...
  Missing 'xrd' values filled with 0.
  'ipo_year' created from 'ipodate'.

Constructing dependent variable...
  OCF_Scaled_t_plus_1 created.

Constructing Set A (OLS) predictors...
  Set A predictors constructed.

Constructing control dummy variables...
  Dummy variables constructed.

Constructing Set B (additional ML) predictors...
  Set B predictors constructed.

Selecting final model variables and dropping intermediate columns...
  Shape of DataFrame 

In [32]:
# =============================================================================
# ANALYTICAL SAMPLE OVERVIEW
# =============================================================================

# Print basic information about the final analytical sample
print(f"Final analytical sample: {len(df_final):,} observations")
print(f"Time period: {df_final['fyear'].min():.0f} - {df_final['fyear'].max():.0f}")
print(f"Unique firms: {df_final['gvkey'].nunique():,}")

# Print dataset shape and columns
print(f"\nDataset shape: {df_final.shape}")
print(f"Number of columns: {len(df_final.columns)}")

Final analytical sample: 123,449 observations
Time period: 2001 - 2022
Unique firms: 14,635

Dataset shape: (123449, 30)
Number of columns: 30


In [33]:
# =============================================================================
# DEFINE VARIABLE GROUPS FOR DESCRIPTIVE STATISTICS
# =============================================================================

# Define variable groups for the descriptive statistics table
DEPENDENT_VAR = [lists.DEPENDENT_VARIABLE]  # OCF_Scaled_t_plus_1

# Set A predictors (8 financial variables + 2 dummies used)
# Note: We're using the same versions as in the other exploration files 
# (excluding ASC842_dummy and COVID_dummy that were dropped for multicollinearity)
SET_A_CONTINUOUS = lists.SET_A_FEATURES
SET_A_DUMMIES = ['ASC606_dummy', 'TCJA_dummy']  

# Set B: Additional ML predictors
SET_B_FEATURES = lists.SET_B_FEATURES

# All variables for the descriptive statistics table
ALL_DESC_VARS = DEPENDENT_VAR + SET_A_CONTINUOUS + SET_A_DUMMIES + SET_B_FEATURES

print("Variables for descriptive statistics:")
print(f"• Dependent variable: {len(DEPENDENT_VAR)} variable")
print(f"• Set A continuous: {len(SET_A_CONTINUOUS)} variables")
print(f"• Set A dummies: {len(SET_A_DUMMIES)} variables")
print(f"• Set B features: {len(SET_B_FEATURES)} variables")
print(f"• Total: {len(ALL_DESC_VARS)} variables")

Variables for descriptive statistics:
• Dependent variable: 1 variable
• Set A continuous: 8 variables
• Set A dummies: 2 variables
• Set B features: 14 variables
• Total: 25 variables


In [34]:
# =============================================================================
# CALCULATE DESCRIPTIVE STATISTICS
# =============================================================================

# Create subset with only variables needed for descriptive statistics
desc_data = df_final[ALL_DESC_VARS].copy()

# Calculate comprehensive descriptive statistics
desc_stats = desc_data.describe(percentiles=[0.25, 0.75]).T

# Add count of non-missing observations
desc_stats['N'] = desc_data.count()

# Reorder columns for thesis table format
desc_stats = desc_stats[['N', 'mean', '50%', 'std', 'min', '25%', '75%', 'max']]

# Rename columns for clarity
desc_stats.columns = ['N', 'Mean', 'Median', 'Std Dev', 'Min', 'P25', 'P75', 'Max']

print("Descriptive Statistics Summary:")
print(desc_stats.round(4))

# Check for missing values (important for Set B features)
missing_counts = desc_data.isnull().sum()
vars_with_missing = missing_counts[missing_counts > 0].sort_values(ascending=False)

if len(vars_with_missing) > 0:
    print("\nVariables with missing values:")
    for var, count in vars_with_missing.items():
        pct = count / len(desc_data) * 100
        print(f"  • {var}: {count:,} missing ({pct:.1f}%)")
else:
    print("\nNo missing values in the selected variables.")

Descriptive Statistics Summary:
                           N     Mean   Median    Std Dev       Min     P25  \
OCF_Scaled_t_plus_1   123449  -0.1849   0.0512     1.0461  -14.1542 -0.0902   
OCF_Scaled_Lag_t      123449  -0.2218   0.0512     1.2437  -16.5556 -0.0963   
NI_Scaled_t           123449  -0.5316   0.0014     2.5049  -37.5031 -0.2107   
Accruals_Scaled_t     123449  -0.3306  -0.0626     1.6044  -30.4779 -0.1397   
Delta_Rec_Scaled_t    123449   0.0018   0.0024     0.0778   -0.7255 -0.0086   
Delta_Inv_Scaled_t    123449   0.0029   0.0000     0.0431   -0.3595 -0.0006   
Delta_AP_Scaled_t     123449   0.0101   0.0025     0.1430   -1.1000 -0.0078   
DP_Scaled_t           123449   0.0456   0.0341     0.0506    0.0000  0.0165   
ln_at_t               123449   5.2067   5.3639     2.9305   -3.9120  3.3546   
ASC606_dummy          123449   0.2106   0.0000     0.4078    0.0000  0.0000   
TCJA_dummy            123449   0.2106   0.0000     0.4078    0.0000  0.0000   
XSGA_Scaled_t       

In [35]:
# =============================================================================
# GENERATE LATEX TABLE FOR THESIS
# =============================================================================

def create_latex_table(desc_stats_df, title="Descriptive Statistics"):
    """
    Create a LaTeX table for descriptive statistics
    """
    # Define variable groups
    var_groups = {
        'OCF_Scaled_t_plus_1': 'Panel A: Dependent Variable',
        'OCF_Scaled_Lag_t': 'Panel B: Set A Financial Variables',
        'ASC606_dummy': 'Panel C: Set A Control Variables',
        'XSGA_Scaled_t': 'Panel D: Set B Additional ML Variables'
    }
    
    # Start LaTeX code
    latex = [
        "\\begin{table}[htbp]",
        "\\centering",
        f"\\caption{{{title}}}",
        "\\label{tab:descriptive_stats}",
        "\\begin{adjustbox}{width=\\textwidth}",
        "\\begin{tabular}{lrrrrrrrr}",
        "\\toprule",
        "Variable & N & Mean & Median & Std Dev & Min & P25 & P75 & Max \\\\",
        "\\midrule"
    ]
    
    # Track current group
    current_group = None
    
    # Add rows by variable group
    for var in desc_stats_df.index:
        # Check if this variable starts a new group
        if var in var_groups:
            group_name = var_groups[var]
            if current_group != group_name:
                if current_group is not None:
                    latex.append("\\midrule")
                latex.append(f"\\multicolumn{{9}}{{l}}{{\\textbf{{{group_name}}}}} \\\\")
                current_group = group_name
        
        # Format variable name (replace underscores with escaped underscores)
        var_name = var.replace("_", "\\_")
        
        # Get statistics
        stats = desc_stats_df.loc[var]
        
        # Format values based on magnitude
        def format_value(val):
            if pd.isna(val):
                return "--"
            elif abs(val) >= 1000:
                return f"{val:,.0f}"
            elif abs(val) >= 1:
                return f"{val:.3f}"
            else:
                return f"{val:.4f}"
        
        # Get formatted statistics
        n_val = f"{int(stats['N']):,}"
        mean_val = format_value(stats['Mean'])
        median_val = format_value(stats['Median'])
        std_val = format_value(stats['Std Dev'])
        min_val = format_value(stats['Min'])
        p25_val = format_value(stats['P25'])
        p75_val = format_value(stats['P75'])
        max_val = format_value(stats['Max'])
        
        # Add row
        latex.append(f"{var_name} & {n_val} & {mean_val} & {median_val} & {std_val} & {min_val} & {p25_val} & {p75_val} & {max_val} \\\\")
    
    # Finish the table
    latex.extend([
        "\\bottomrule",
        "\\end{tabular}",
        "\\end{adjustbox}",
        "",
        "\\begin{tablenotes}",
        "\\small",
        f"\\item \\textit{{Notes:}} This table presents descriptive statistics for the final analytical sample of {len(df_final):,} firm-year observations covering the period {df_final['fyear'].min():.0f}--{df_final['fyear'].max():.0f}. All continuous variables are winsorized at the 1\\% and 99\\% levels annually to reduce the influence of outliers. Set A variables are the core financial predictors used in the OLS analysis. Set B variables are additional predictors used in machine learning models.",
        "\\end{tablenotes}",
        "\\end{table}"
    ])
    
    return "\n".join(latex)

# Generate LaTeX table
latex_table = create_latex_table(desc_stats)

# Display the LaTeX code
print("="*80)
print("LATEX TABLE FOR THESIS SECTION 4.1")
print("="*80)
print(latex_table)
print("="*80)

LATEX TABLE FOR THESIS SECTION 4.1
\begin{table}[htbp]
\centering
\caption{Descriptive Statistics}
\label{tab:descriptive_stats}
\begin{adjustbox}{width=\textwidth}
\begin{tabular}{lrrrrrrrr}
\toprule
Variable & N & Mean & Median & Std Dev & Min & P25 & P75 & Max \\
\midrule
\multicolumn{9}{l}{\textbf{Panel A: Dependent Variable}} \\
OCF\_Scaled\_t\_plus\_1 & 123,449 & -0.1849 & 0.0512 & 1.046 & -14.154 & -0.0902 & 0.1237 & 0.6219 \\
\midrule
\multicolumn{9}{l}{\textbf{Panel B: Set A Financial Variables}} \\
OCF\_Scaled\_Lag\_t & 123,449 & -0.2218 & 0.0512 & 1.244 & -16.556 & -0.0963 & 0.1249 & 0.7453 \\
NI\_Scaled\_t & 123,449 & -0.5316 & 0.0014 & 2.505 & -37.503 & -0.2107 & 0.0602 & 0.5621 \\
Accruals\_Scaled\_t & 123,449 & -0.3306 & -0.0626 & 1.604 & -30.478 & -0.1397 & -0.0173 & 0.9692 \\
Delta\_Rec\_Scaled\_t & 123,449 & 0.0018 & 0.0024 & 0.0778 & -0.7255 & -0.0086 & 0.0224 & 0.3026 \\
Delta\_Inv\_Scaled\_t & 123,449 & 0.0029 & 0.0000 & 0.0431 & -0.3595 & -0.0006 & 0.0086 & 0.2038

In [36]:
# =============================================================================
# SAVE LATEX TABLE TO FILE
# =============================================================================

# Create directory for tables if it doesn't exist
tables_dir = project_root / "tables"
tables_dir.mkdir(exist_ok=True, parents=True)

# Save LaTeX table to file
latex_file = tables_dir / "descriptive_statistics.tex"
with open(latex_file, "w") as f:
    f.write(latex_table)

print(f"✅ LaTeX table saved to: {latex_file}")
print("\nThis file can be directly included in your thesis using:")
print("\\input{tables/descriptive_statistics.tex}")

✅ LaTeX table saved to: /Users/luis.m/Library/Mobile Documents/com~apple~CloudDocs/Documents ☁️/VSC Projects/Master_Thesis/tables/descriptive_statistics.tex

This file can be directly included in your thesis using:
\input{tables/descriptive_statistics.tex}


In [37]:
# =============================================================================
# GENERATE PDF VERSION OF TABLE
# =============================================================================

def create_pdf_table(desc_stats_df, output_path, title="Descriptive Statistics"):
    """
    Create a PDF version of the descriptive statistics table
    """
    # Set figure size based on number of variables
    fig_height = max(11, len(desc_stats_df) * 0.4)
    fig = plt.figure(figsize=(12, fig_height), dpi=300)
    
    # Create axis for table
    ax = fig.add_subplot(111)
    ax.axis('off')
    
    # Define variable groups
    groups = [
        ('Panel A: Dependent Variable', DEPENDENT_VAR),
        ('Panel B: Set A Financial Variables', SET_A_CONTINUOUS),
        ('Panel C: Set A Control Variables', SET_A_DUMMIES),
        ('Panel D: Set B Additional ML Variables', SET_B_FEATURES)
    ]
    
    # Prepare table data
    table_data = []
    colors = []
    
    # Add data by group
    for group_name, variables in groups:
        # Add group header
        table_data.append([group_name, '', '', '', '', '', '', '', ''])
        colors.append('#D0D0D0')  # Gray for group headers
        
        # Add variables in this group
        for var in variables:
            if var in desc_stats_df.index:
                row = desc_stats_df.loc[var]
                
                # Format values
                def format_value(val):
                    if pd.isna(val):
                        return "--"
                    elif abs(val) >= 1000:
                        return f"{val:,.0f}"
                    elif abs(val) >= 1:
                        return f"{val:.3f}"
                    else:
                        return f"{val:.4f}"
                
                # Create row with formatted values
                formatted_row = [
                    var,
                    f"{int(row['N']):,}",
                    format_value(row['Mean']),
                    format_value(row['Median']),
                    format_value(row['Std Dev']),
                    format_value(row['Min']),
                    format_value(row['P25']),
                    format_value(row['P75']),
                    format_value(row['Max'])
                ]
                table_data.append(formatted_row)
                
                # Alternate row colors
                colors.append('#FFFFFF' if len(table_data) % 2 == 0 else '#F2F2F2')
    
    # Create table
    column_headers = ['Variable', 'N', 'Mean', 'Median', 'Std Dev', 'Min', 'P25', 'P75', 'Max']
    table = ax.table(
        cellText=table_data,
        colLabels=column_headers,
        loc='center',
        cellLoc='center'
    )
    
    # Style the table
    table.auto_set_font_size(False)
    table.set_fontsize(9)
    table.scale(1, 1.8)  # Make rows taller
    
    # Style header row
    for i, column in enumerate(column_headers):
        cell = table[(0, i)]
        cell.set_facecolor('#4472C4')
        cell.set_text_props(color='white', fontweight='bold')
    
    # Style data rows
    for i, color in enumerate(colors, 1):
        for j in range(len(column_headers)):
            cell = table[(i, j)]
            cell.set_facecolor(color)
            
            # Bold text for group headers
            if i-1 < len(colors) and colors[i-1] == '#D0D0D0':
                cell.set_text_props(fontweight='bold')
    
    # Add title
    sample_info = f"Final Analytical Sample: {len(df_final):,} observations ({df_final['fyear'].min():.0f}-{df_final['fyear'].max():.0f})"
    plt.title(f"{title}\n{sample_info}", fontsize=14, pad=20)
    
    # Add notes
    note_text = "Notes: All continuous variables are winsorized at the 1% and 99% levels annually."
    plt.figtext(0.1, 0.01, note_text, fontsize=9)
    
    # Save as PDF
    plt.tight_layout()
    plt.savefig(output_path, format='pdf', bbox_inches='tight')
    plt.close()
    
    return output_path

# Generate PDF table
pdf_file = tables_dir / "descriptive_statistics.pdf"
create_pdf_table(desc_stats, pdf_file)

print(f"✅ PDF table saved to: {pdf_file}")

✅ PDF table saved to: /Users/luis.m/Library/Mobile Documents/com~apple~CloudDocs/Documents ☁️/VSC Projects/Master_Thesis/tables/descriptive_statistics.pdf


In [38]:
# =============================================================================
# KEY HIGHLIGHTS FOR THESIS TEXT
# =============================================================================

print("KEY HIGHLIGHTS FOR THESIS TEXT:")
print("="*50)

# Sample size and time period
total_obs = len(df_final)
unique_firms = df_final['gvkey'].nunique()
min_year = df_final['fyear'].min()
max_year = df_final['fyear'].max()

print(f"📊 Final analytical sample: {total_obs:,} firm-year observations")
print(f"🏢 Unique firms: {unique_firms:,}")
print(f"📅 Time period: {min_year:.0f}-{max_year:.0f} ({int(max_year-min_year+1)} years)")

# Dependent variable characteristics
dep_var = lists.DEPENDENT_VARIABLE
dep_mean = desc_stats.loc[dep_var, 'Mean']
dep_std = desc_stats.loc[dep_var, 'Std Dev']
dep_median = desc_stats.loc[dep_var, 'Median']

print(f"\n💰 Dependent Variable ({dep_var}):")
print(f"   Mean: {dep_mean:.4f}, Median: {dep_median:.4f}, Std Dev: {dep_std:.4f}")

# Missing data patterns (important for Set B features and RandomForest discussion)
print(f"\n📋 Missing Data Patterns in Set B:")
missing_summary = desc_data[SET_B_FEATURES].isnull().sum().sort_values(ascending=False)
vars_with_missing = missing_summary[missing_summary > 0]

if len(vars_with_missing) > 0:
    print(f"   Variables with missing data: {len(vars_with_missing)} out of {len(SET_B_FEATURES)}")
    for var, missing_count in vars_with_missing.items():
        missing_pct = (missing_count / total_obs) * 100
        print(f"   • {var}: {missing_count:,} missing ({missing_pct:.1f}%)")
else:
    print(f"   ✅ No missing values in Set B features")

# Set A vs. Set B statistical differences
print(f"\n🔍 Notable Statistical Observations:")
print(f"   • Dependent variable distribution: Mean ({dep_mean:.4f}) vs. Median ({dep_median:.4f})")

# Check feature with highest standard deviation (relative to mean)
rel_std = desc_stats['Std Dev'] / desc_stats['Mean'].abs()
high_var_feature = rel_std.nlargest(3)
print(f"   • Highest relative variability features:")
for feature, value in high_var_feature.items():
    mean = desc_stats.loc[feature, 'Mean']
    std = desc_stats.loc[feature, 'Std Dev']
    print(f"     - {feature}: Mean={mean:.4f}, Std={std:.4f}, Rel.StdDev={value:.2f}")

KEY HIGHLIGHTS FOR THESIS TEXT:
📊 Final analytical sample: 123,449 firm-year observations
🏢 Unique firms: 14,635
📅 Time period: 2001-2022 (22 years)

💰 Dependent Variable (OCF_Scaled_t_plus_1):
   Mean: -0.1849, Median: 0.0512, Std Dev: 1.0461

📋 Missing Data Patterns in Set B:
   Variables with missing data: 9 out of 14
   • FirmAge_t: 64,336 missing (52.1%)
   • InvTurnover_t: 40,493 missing (32.8%)
   • MkBk_t: 16,538 missing (13.4%)
   • OCFtoSales_t: 14,146 missing (11.5%)
   • GPM_t: 14,146 missing (11.5%)
   • XSGA_Scaled_t: 11,947 missing (9.7%)
   • RecTurnover_t: 7,338 missing (5.9%)
   • CurrentRatio_t: 1,665 missing (1.3%)
   • CAPX_Scaled_t: 112 missing (0.1%)

🔍 Notable Statistical Observations:
   • Dependent variable distribution: Mean (-0.1849) vs. Median (0.0512)
   • Highest relative variability features:
     - MkBk_t: Mean=17.1054, Std=1449.3590, Rel.StdDev=84.73
     - Delta_Rec_Scaled_t: Mean=0.0018, Std=0.0778, Rel.StdDev=43.28
     - XSGA_Scaled_t: Mean=1.7138,

In [39]:
# Enable auto-reloading of external modules - useful during development
%load_ext autoreload
%autoreload 2

# Configure Python path to find our custom modules
import sys
from pathlib import Path

# Add project root to the Python path for proper imports
project_root = Path.cwd().parent
if project_root not in sys.path:
    sys.path.insert(0, str(project_root))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [40]:
# Import necessary libraries
import pandas as pd
import numpy as np
import src.processing as processing
import src.config as lists
from stargazer.stargazer import Stargazer

In [41]:
# Load and process data to create final analytical sample
df = processing.load_data("/Users/luis.m/Library/Mobile Documents/com~apple~CloudDocs/Documents ☁️/VSC Projects/Master_Thesis/data/raw/nvzfxcoxdvh1at7i.csv")
df_prepared = processing.prepare_data(df)
df_added_features = processing.create_all_model_features_orchestrated(df_prepared)
df_missing = processing.drop_missing_final_vars_streamlined(df_added_features, lists.final_set_A_predictor_names_and_dependent)
df_final = processing.annual_winsorize_variables(df_missing, lists.columns_to_winsorize)

print(f"Final analytical sample: {len(df_final):,} observations")
print(f"Time period: {df_final['fyear'].min():.0f} - {df_final['fyear'].max():.0f}")
print(f"Dataset shape: {df_final.shape}")

  data = pd.read_csv(file_path)


Data loaded successfully from /Users/luis.m/Library/Mobile Documents/com~apple~CloudDocs/Documents ☁️/VSC Projects/Master_Thesis/data/raw/nvzfxcoxdvh1at7i.csv
Original number of observations: 317304
Number of columns after selection: 30
Observations after year filter (2000-2023): 302751
Observations after excluding financial and utility firms: 170598
Starting feature construction. Initial df shape: (170598, 30)
  Creating lags for: ['at', 'ni', 'rect', 'invt', 'ap', 'sale']

Performing pre-calculation validity checks & preparations...
  Missing 'xrd' values filled with 0.
  'ipo_year' created from 'ipodate'.

Constructing dependent variable...
Starting feature construction. Initial df shape: (170598, 30)
  Creating lags for: ['at', 'ni', 'rect', 'invt', 'ap', 'sale']

Performing pre-calculation validity checks & preparations...
  Missing 'xrd' values filled with 0.
  'ipo_year' created from 'ipodate'.

Constructing dependent variable...
  OCF_Scaled_t_plus_1 created.

Constructing Set 

In [42]:
# =============================================================================
# DESCRIPTIVE STATISTICS FOR THESIS TABLE 4.1
# =============================================================================

# Define variable groups for the descriptive statistics table
DEPENDENT_VAR = [lists.DEPENDENT_VARIABLE]  # OCF_Scaled_t_plus_1

# Set A: Core financial predictors (8 continuous + 2 dummies)
SET_A_CONTINUOUS = lists.SET_A_FEATURES  # 8 financial variables
SET_A_DUMMIES = ['ASC606_dummy', 'TCJA_dummy']  # 2 main dummies used in models

# Set B: Additional ML predictors (14 variables)
SET_B_FEATURES = [
    'XSGA_Scaled_t', 'XRD_Scaled_t', 'CAPX_Scaled_t', 'CurrentRatio_t',
    'DebtToAssets_t', 'OCFtoSales_t', 'InvTurnover_t', 'RecTurnover_t',
    'GPM_t', 'Delta_Sales_Scaled_t', 'NI_Scaled_Lag_t',
    'CapitalIntensity_t', 'MkBk_t', 'FirmAge_t'
]

# All variables for the descriptive statistics table
ALL_DESC_VARS = DEPENDENT_VAR + SET_A_CONTINUOUS + SET_A_DUMMIES + SET_B_FEATURES

print("Variables for descriptive statistics:")
print(f"• Dependent variable: {len(DEPENDENT_VAR)} variable")
print(f"• Set A continuous: {len(SET_A_CONTINUOUS)} variables")
print(f"• Set A dummies: {len(SET_A_DUMMIES)} variables")
print(f"• Set B features: {len(SET_B_FEATURES)} variables")
print(f"• Total: {len(ALL_DESC_VARS)} variables")

Variables for descriptive statistics:
• Dependent variable: 1 variable
• Set A continuous: 8 variables
• Set A dummies: 2 variables
• Set B features: 14 variables
• Total: 25 variables


In [43]:
# Create descriptive statistics DataFrame
desc_data = df_final[ALL_DESC_VARS].copy()

# Calculate comprehensive descriptive statistics
desc_stats = desc_data.describe(percentiles=[0.25, 0.75]).T

# Add count of non-missing observations
desc_stats['N'] = desc_data.count()

# Reorder columns for thesis table format
desc_stats = desc_stats[['N', 'mean', '50%', 'std', 'min', '25%', '75%', 'max']]

# Rename columns for clarity
desc_stats.columns = ['N', 'Mean', 'Median', 'Std Dev', 'Min', 'P25', 'P75', 'Max']

print("Descriptive Statistics Summary:")
print(desc_stats.round(4))

Descriptive Statistics Summary:
                           N     Mean   Median    Std Dev       Min     P25  \
OCF_Scaled_t_plus_1   123449  -0.1849   0.0512     1.0461  -14.1542 -0.0902   
OCF_Scaled_Lag_t      123449  -0.2218   0.0512     1.2437  -16.5556 -0.0963   
NI_Scaled_t           123449  -0.5316   0.0014     2.5049  -37.5031 -0.2107   
Accruals_Scaled_t     123449  -0.3306  -0.0626     1.6044  -30.4779 -0.1397   
Delta_Rec_Scaled_t    123449   0.0018   0.0024     0.0778   -0.7255 -0.0086   
Delta_Inv_Scaled_t    123449   0.0029   0.0000     0.0431   -0.3595 -0.0006   
Delta_AP_Scaled_t     123449   0.0101   0.0025     0.1430   -1.1000 -0.0078   
DP_Scaled_t           123449   0.0456   0.0341     0.0506    0.0000  0.0165   
ln_at_t               123449   5.2067   5.3639     2.9305   -3.9120  3.3546   
ASC606_dummy          123449   0.2106   0.0000     0.4078    0.0000  0.0000   
TCJA_dummy            123449   0.2106   0.0000     0.4078    0.0000  0.0000   
XSGA_Scaled_t       

In [44]:
# =============================================================================
# GENERATE PDF TABLE FOR DESCRIPTIVE STATISTICS
# =============================================================================

import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.backends.backend_pdf import PdfPages

def create_descriptive_pdf_table(desc_stats_df, output_path, title="Descriptive Statistics"):
    """
    Create a professional PDF table for descriptive statistics.
    """
    
    # Define variable group markers and labels
    var_groups = {
        'OCF_Scaled_t_plus_1': 'Dependent Variable',
        'OCF_Scaled_Lag_t': 'Set A: Core Financial Predictors',
        'ASC606_dummy': 'Set A: Control Variables', 
        'XSGA_Scaled_t': 'Set B: Additional ML Predictors'
    }
    
    # Prepare data for table
    table_data = []
    group_rows = []
    
    current_group = None
    row_idx = 0
    
    for var_name in desc_stats_df.index:
        # Check if we need a group header
        if var_name in var_groups:
            if current_group is not None:
                row_idx += 1  # Add space
            table_data.append([var_groups[var_name], '', '', '', '', '', '', '', ''])
            group_rows.append(row_idx)
            row_idx += 1
            current_group = var_groups[var_name]
        
        # Format the variable name
        clean_name = var_name.replace('_', ' ')
        
        # Get the statistics and format them
        row = desc_stats_df.loc[var_name]
        n_val = f"{int(row['N']):,}"
        
        def format_number(val):
            if pd.isna(val):
                return "--"
            elif abs(val) >= 1000:
                return f"{val:,.0f}"
            elif abs(val) >= 1:
                return f"{val:.3f}"
            else:
                return f"{val:.4f}"
        
        formatted_row = [
            clean_name,
            n_val,
            format_number(row['Mean']),
            format_number(row['Median']),
            format_number(row['Std Dev']),
            format_number(row['Min']),
            format_number(row['P25']),
            format_number(row['P75']),
            format_number(row['Max'])
        ]
        
        table_data.append(formatted_row)
        row_idx += 1
    
    # Create the PDF
    fig, ax = plt.subplots(figsize=(16, max(12, len(table_data) * 0.4)))
    ax.axis('tight')
    ax.axis('off')
    
    # Create table
    table = ax.table(cellText=table_data,
                    colLabels=['Variable', 'N', 'Mean', 'Median', 'Std Dev', 'Min', 'P25', 'P75', 'Max'],
                    cellLoc='center',
                    loc='center',
                    bbox=[0, 0, 1, 1])
    
    # Style the table
    table.auto_set_font_size(False)
    table.set_fontsize(9)
    table.scale(1, 1.8)
    
    # Style header row
    for i in range(9):
        table[(0, i)].set_facecolor('#4472C4')
        table[(0, i)].set_text_props(weight='bold', color='white')
        table[(0, i)].set_height(0.08)
    
    # Style group header rows
    for group_row in group_rows:
        for i in range(9):
            table[(group_row + 1, i)].set_facecolor('#D9E2F3')
            table[(group_row + 1, i)].set_text_props(weight='bold')
            table[(group_row + 1, i)].set_height(0.06)
    
    # Style data rows
    for i in range(1, len(table_data) + 1):
        if i - 1 not in group_rows:  # Skip group headers
            bg_color = '#F2F2F2' if i % 2 == 0 else 'white'
            for j in range(9):
                table[(i, j)].set_facecolor(bg_color)
                table[(i, j)].set_height(0.05)
    
    # Add title
    plt.title(f'{title}\nFinal Analytical Sample: {len(df_final):,} observations ({df_final["fyear"].min():.0f}-{df_final["fyear"].max():.0f})', 
              fontsize=14, fontweight='bold', pad=20)
    
    # Save to PDF
    plt.tight_layout()
    plt.savefig(output_path, format='pdf', bbox_inches='tight', dpi=300)
    plt.close()
    
    print(f"✅ PDF table saved to: {output_path}")

# Create tables directory if it doesn't exist
tables_dir = "/Users/luis.m/Library/Mobile Documents/com~apple~CloudDocs/Documents ☁️/VSC Projects/Master_Thesis/tables"
Path(tables_dir).mkdir(parents=True, exist_ok=True)

# Generate the PDF table
output_path = f"{tables_dir}/descriptive_statistics.pdf"
create_descriptive_pdf_table(desc_stats, output_path, "Descriptive Statistics")

print("="*80)
print("PDF TABLE GENERATED FOR THESIS SECTION 4.1")
print("="*80)
print(f"File location: {output_path}")
print("This PDF can be directly included in your thesis or used as a reference.")
print("="*80)

✅ PDF table saved to: /Users/luis.m/Library/Mobile Documents/com~apple~CloudDocs/Documents ☁️/VSC Projects/Master_Thesis/tables/descriptive_statistics.pdf
PDF TABLE GENERATED FOR THESIS SECTION 4.1
File location: /Users/luis.m/Library/Mobile Documents/com~apple~CloudDocs/Documents ☁️/VSC Projects/Master_Thesis/tables/descriptive_statistics.pdf
This PDF can be directly included in your thesis or used as a reference.


In [45]:
# =============================================================================
# KEY HIGHLIGHTS FROM DESCRIPTIVE STATISTICS
# =============================================================================

print("KEY HIGHLIGHTS FOR THESIS TEXT:")
print("="*50)

# Sample size
total_obs = len(df_final)
print(f"📊 Final analytical sample: {total_obs:,} firm-year observations")

# Time span
min_year = df_final['fyear'].min()
max_year = df_final['fyear'].max()
print(f"📅 Time period: {min_year:.0f}-{max_year:.0f} ({int(max_year-min_year+1)} years)")

# Dependent variable characteristics
dep_var = lists.DEPENDENT_VARIABLE
dep_mean = desc_stats.loc[dep_var, 'Mean']
dep_std = desc_stats.loc[dep_var, 'Std Dev']
dep_median = desc_stats.loc[dep_var, 'Median']
print(f"\n💰 Dependent Variable ({dep_var}):")
print(f"   Mean: {dep_mean:.4f}, Median: {dep_median:.4f}, Std Dev: {dep_std:.4f}")

# Missing data patterns
print(f"\n📋 Missing Data Patterns:")
missing_summary = desc_data.isnull().sum().sort_values(ascending=False)
vars_with_missing = missing_summary[missing_summary > 0]
if len(vars_with_missing) > 0:
    print(f"   Variables with missing data: {len(vars_with_missing)}")
    for var, missing_count in vars_with_missing.head(5).items():
        missing_pct = (missing_count / total_obs) * 100
        print(f"   • {var}: {missing_count:,} ({missing_pct:.1f}%)")
else:
    print(f"   ✅ No missing values in final analytical sample")

# Variable ranges
print(f"\n📈 Notable Variable Characteristics:")
for var_group, vars_list in [('Set A Financial', SET_A_CONTINUOUS), ('Set B Additional', SET_B_FEATURES[:5])]:
    print(f"   {var_group} variables (showing first 5):")
    for var in vars_list:
        if var in desc_stats.index:
            mean_val = desc_stats.loc[var, 'Mean']
            std_val = desc_stats.loc[var, 'Std Dev']
            print(f"     {var}: Mean={mean_val:.4f}, Std={std_val:.4f}")

KEY HIGHLIGHTS FOR THESIS TEXT:
📊 Final analytical sample: 123,449 firm-year observations
📅 Time period: 2001-2022 (22 years)

💰 Dependent Variable (OCF_Scaled_t_plus_1):
   Mean: -0.1849, Median: 0.0512, Std Dev: 1.0461

📋 Missing Data Patterns:
   Variables with missing data: 9
   • FirmAge_t: 64,336 (52.1%)
   • InvTurnover_t: 40,493 (32.8%)
   • MkBk_t: 16,538 (13.4%)
   • GPM_t: 14,146 (11.5%)
   • OCFtoSales_t: 14,146 (11.5%)

📈 Notable Variable Characteristics:
   Set A Financial variables (showing first 5):
     OCF_Scaled_Lag_t: Mean=-0.2218, Std=1.2437
     NI_Scaled_t: Mean=-0.5316, Std=2.5049
     Accruals_Scaled_t: Mean=-0.3306, Std=1.6044
     Delta_Rec_Scaled_t: Mean=0.0018, Std=0.0778
     Delta_Inv_Scaled_t: Mean=0.0029, Std=0.0431
     Delta_AP_Scaled_t: Mean=0.0101, Std=0.1430
     DP_Scaled_t: Mean=0.0456, Std=0.0506
     ln_at_t: Mean=5.2067, Std=2.9305
   Set B Additional variables (showing first 5):
     XSGA_Scaled_t: Mean=1.7138, Std=41.7467
     XRD_Scaled_t: 

In [46]:
# =============================================================================
# SAVE LATEX TABLE TO FILE
# =============================================================================

# Save the LaTeX table to a file for easy inclusion in thesis
output_file = "/Users/luis.m/Library/Mobile Documents/com~apple~CloudDocs/Documents ☁️/VSC Projects/Master_Thesis/tables/descriptive_statistics.tex"

# Create directory if it doesn't exist
Path(output_file).parent.mkdir(parents=True, exist_ok=True)

# Write LaTeX table to file
with open(output_file, 'w') as f:
    f.write(latex_table)

print(f"✅ LaTeX table saved to: {output_file}")
print("\nThis file can be directly included in your thesis using:")
print("\\input{tables/descriptive_statistics.tex}")

✅ LaTeX table saved to: /Users/luis.m/Library/Mobile Documents/com~apple~CloudDocs/Documents ☁️/VSC Projects/Master_Thesis/tables/descriptive_statistics.tex

This file can be directly included in your thesis using:
\input{tables/descriptive_statistics.tex}


In [47]:
# =============================================================================
# GENERATE PDF TABLE OF DESCRIPTIVE STATISTICS
# =============================================================================

import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from pathlib import Path

def create_pdf_table(desc_stats_df):
    """
    Create a professional PDF table for descriptive statistics using the actual data from the notebook.
    """
    # Create figure
    fig, ax = plt.subplots(figsize=(12, 16))
    ax.axis('tight')
    ax.axis('off')
    
    # Prepare data for table
    table_data = []
    
    # Headers for sections
    group_headers = {
        'OCF_Scaled_t_plus_1': "Dependent Variable",
        'OCF_Scaled_Lag_t': "Set A: Core Financial Predictors",
        'ASC606_dummy': "Set A: Control Variables",
        'XSGA_Scaled_t': "Set B: Additional ML Predictors"
    }
    
    # Track header rows for formatting
    header_rows = []
    row_count = 0
    
    # Determine which variables belong to which group
    for var in desc_stats_df.index:
        # Add section header if this is the first variable of its group
        if var in group_headers:
            # Add section header
            header_rows.append(row_count)
            section_header = [group_headers[var], "", "", "", "", "", "", ""]  # 8 columns total
            table_data.append(section_header)
            row_count += 1
        
        # Add the variable data row
        row = desc_stats_df.loc[var]
        data_row = [
            var.replace('_', ' '),  # Clean up variable name
            f"{int(row['N']):,}",
            f"{row['Mean']:.4f}",
            f"{row['Median']:.4f}",
            f"{row['Std Dev']:.4f}",
            f"{row['Min']:.4f}",
            f"{row['P25']:.4f}",
            f"{row['P75']:.4f}"
        ]
        table_data.append(data_row)
        row_count += 1
    
    # Create table
    column_headers = ['Variable', 'N', 'Mean', 'Median', 'Std Dev', 'Min', 'P25', 'P75']
    table = ax.table(
        cellText=table_data,
        colLabels=column_headers,
        loc='center',
        cellLoc='center',
        bbox=[0, 0, 1, 1]
    )
    
    # Style the table
    table.auto_set_font_size(False)
    table.set_fontsize(10)
    table.scale(1, 1.5)
    
    # Style the column headers
    for i in range(len(column_headers)):
        table[(0, i)].set_facecolor('#4472C4')
        table[(0, i)].set_text_props(color='white', weight='bold')
    
    # Style the section headers
    for row in header_rows:
        for i in range(len(column_headers)):
            table[(row + 1, i)].set_facecolor('#D9E1F2')
            table[(row + 1, i)].set_text_props(weight='bold')
            if i == 0:
                table[(row + 1, i)].set_text_props(ha='left', weight='bold')
    
    # Style alternating rows
    for i in range(1, len(table_data) + 1):
        if i-1 not in header_rows:  # Skip headers
            color = '#F2F2F2' if i % 2 == 0 else 'white'
            for j in range(len(column_headers)):
                table[(i, j)].set_facecolor(color)
    
    # Add title
    plt.suptitle(f"Descriptive Statistics - Final Analytical Sample ({len(df_final):,} observations)", 
                fontsize=16, fontweight='bold', y=0.95)
    
    # Add footnote
    footnote = (f"Note: Sample includes {len(df_final):,} firm-year observations from "
                f"{df_final['fyear'].min():.0f}-{df_final['fyear'].max():.0f}. "
                f"All continuous variables winsorized at the 1st and 99th percentiles annually.")
    
    plt.figtext(0.1, 0.01, footnote, ha='left', fontsize=10, fontstyle='italic')
    
    return fig

# Path for saving the PDF
output_dir = "/Users/luis.m/Library/Mobile Documents/com~apple~CloudDocs/Documents ☁️/VSC Projects/Master_Thesis/tables"
tables_dir = Path(output_dir)
tables_dir.mkdir(parents=True, exist_ok=True)
pdf_path = tables_dir / "descriptive_statistics.pdf"

# Create and save the PDF
print("📊 Generating PDF table...")
fig = create_pdf_table(desc_stats)

# Save the figure as PDF
fig.savefig(pdf_path, format='pdf', bbox_inches='tight', dpi=300)
plt.close(fig)

print(f"✅ PDF table saved to: {pdf_path}")
print("\nThis PDF can be included in your thesis as a professional-looking table.")

# Generate PDF table
print("📊 Generating PDF table...")
pdf_file = create_descriptive_pdf_table(desc_stats)
print(f"✅ PDF table saved to: {pdf_file}")
print("\nThis PDF can be included in your thesis or used as a standalone reference.")

📊 Generating PDF table...
✅ PDF table saved to: /Users/luis.m/Library/Mobile Documents/com~apple~CloudDocs/Documents ☁️/VSC Projects/Master_Thesis/tables/descriptive_statistics.pdf

This PDF can be included in your thesis as a professional-looking table.
📊 Generating PDF table...
✅ PDF table saved to: /Users/luis.m/Library/Mobile Documents/com~apple~CloudDocs/Documents ☁️/VSC Projects/Master_Thesis/tables/descriptive_statistics.pdf

This PDF can be included in your thesis as a professional-looking table.
📊 Generating PDF table...


TypeError: create_descriptive_pdf_table() missing 1 required positional argument: 'output_path'