In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import tqdm


def get_best_authors_per_sector(df, start_date, end_date, max_authors=10):
    """
    Rank authors within each sector by how well their expected returns
    tracked actual returns between start_date and end_date (both inclusive).

    Only author-sector pairs with at least two rows in the window and a
    strictly positive, non-NaN Pearson correlation are kept. At most
    max_authors rows are returned per sector, ordered by descending
    correlation across the whole result.

    Returns a DataFrame with columns 'Sector', 'author' and 'correlation';
    it is empty (but still has those columns) when no pair qualifies.
    """
    dates = df['date']
    window = df[(dates >= start_date) & (dates <= end_date)].copy()

    def _positive_correlations():
        # One candidate record per (sector, author) pair that passes the filters.
        for (sector, author), picks in window.groupby(['Sector', 'author']):
            if len(picks) < 2:
                # A correlation needs at least two observations.
                continue

            rho = np.corrcoef(picks['expected_return'], picks['actual_return'])[0, 1]
            if np.isnan(rho) or rho <= 0:
                continue

            yield {'Sector': sector, 'author': author, 'correlation': rho}

    ranked = pd.DataFrame(list(_positive_correlations()))
    if ranked.empty:
        return pd.DataFrame(columns=['Sector', 'author', 'correlation'])

    # Sort globally first so that head() picks each sector's strongest authors.
    ranked = ranked.sort_values('correlation', ascending=False)
    top_per_sector = ranked.groupby('Sector').head(max_authors)
    return top_per_sector.reset_index(drop=True)

def calculate_monthly_performance(df, best_authors_df, target_month):
    """
    Score the selected author-sector pairs over one calendar month.

    Args:
        df: DataFrame with 'date', 'Sector', 'author', 'expected_return'
            and 'actual_return' columns.
        best_authors_df: DataFrame whose 'Sector' and 'author' columns list
            the pairs to keep; other columns are ignored.
        target_month: Any timestamp inside the month to evaluate. Its day
            and time of day are discarded.

    Returns:
        DataFrame with one row per (Sector, author) pair that has data in
        the month, with columns:
          - 'correlation': Pearson correlation of expected vs. actual
            returns (0 when the pair has a single row),
          - 'sum_performance': sum of sign(expected_return) * actual_return,
          - 'count': number of rows for the pair,
          - 'and_correlation_sum': 1 if both 'correlation' and
            'sum_performance' are >= 0, else 0,
          - 'month': first day of the evaluated month.
    """
    # Anchor on midnight of the first day and use a half-open interval up to
    # the first day of the next month, so rows carrying a time of day on the
    # first or last day of the month are not silently dropped.
    month_start = pd.Timestamp(target_month).normalize().replace(day=1)
    next_month_start = month_start + pd.DateOffset(months=1)

    in_month = (df['date'] >= month_start) & (df['date'] < next_month_start)
    month_df = df[in_month].copy()

    # Keep only the selected author-sector pairs. The merge also gives
    # month_df a fresh unique index, which the correlation helper relies on.
    month_df = month_df.merge(
        best_authors_df[['Sector', 'author']],
        on=['Sector', 'author'],
        how='inner'
    )

    # Directional performance: positive when the sign of the call was right.
    month_df['performance'] = np.sign(month_df['expected_return']) * month_df['actual_return']

    def _pair_correlation(expected):
        # Correlation is undefined for a single observation; report 0 instead.
        if len(expected) <= 1:
            return 0
        actual = month_df.loc[expected.index, 'actual_return']
        return np.corrcoef(expected, actual)[0, 1]

    performance_summary = month_df.groupby(['Sector', 'author']).agg(
        correlation=('expected_return', _pair_correlation),
        sum_performance=('performance', 'sum'),
        count=('performance', 'count')
    ).reset_index()

    # Flag is 1 only when both correlation and summed performance are
    # non-negative. A NaN correlation compares False and therefore yields 0.
    performance_summary['and_correlation_sum'] = (
        (performance_summary['correlation'] >= 0) & (performance_summary['sum_performance'] >= 0)
    ).astype(int)

    # Add the month column for reference
    performance_summary['month'] = month_start

    return performance_summary

def run_rolling_analysis(df, start_date, end_date, lookback_period=12):
    """
    Run the rolling analysis month by month.

    Starting at start_date and stepping one month at a time while the
    stepped date is <= end_date, each iteration:
      1. selects the best authors per sector from a training window that
         ends the day before the evaluated month begins and reaches back
         lookback_period months from there, then
      2. scores those authors on the evaluated calendar month.

    Args:
        df: DataFrame accepted by get_best_authors_per_sector and
            calculate_monthly_performance.
        start_date: First date to evaluate (string or datetime-like).
        end_date: Last date to evaluate (string or datetime-like).
        lookback_period: Length of the training window in months.

    Returns:
        The concatenated monthly summaries, or an empty DataFrame (with no
        columns) when no month had any qualifying authors.
    """
    # Convert dates if they're strings
    start_date = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)

    all_monthly_performance = []

    current_date = start_date
    while current_date <= end_date:
        # The monthly scoring covers the whole calendar month containing the
        # date it is given. Anchor the training window to that month's first
        # day so a mid-month start_date cannot leak evaluation-month rows
        # into the author selection.
        month_start = current_date.normalize().replace(day=1)
        training_end = month_start - timedelta(days=1)
        training_start = training_end - pd.DateOffset(months=lookback_period)

        # Get best authors based on the training period
        best_authors = get_best_authors_per_sector(
            df,
            training_start,
            training_end
        )

        # Calculate performance for the evaluated month
        if not best_authors.empty:
            monthly_perf = calculate_monthly_performance(
                df,
                best_authors,
                month_start
            )
            all_monthly_performance.append(monthly_perf)

        # Move to next month
        current_date = current_date + pd.DateOffset(months=1)

    # Combine all results
    if all_monthly_performance:
        return pd.concat(all_monthly_performance, ignore_index=True)
    return pd.DataFrame()

def print_monthly_sector_author_performance(results_df):
    """
    Print formatted results showing performance by month, sector, and author.

    Args:
        results_df: DataFrame with 'month', 'Sector', 'author',
            'and_correlation_sum' and 'count' columns. An empty frame
            (including one with no columns at all) prints a short notice
            instead of raising.

    Returns:
        None. Output goes to stdout: a header per month, a sub-header per
        sector, then one line per author.
    """
    # An empty result may have no columns, which would make the sort below
    # fail with a KeyError.
    if results_df.empty:
        print("No results to display.")
        return

    # Sort by month, then sector, then the combined flag (1s before 0s)
    sorted_results = results_df.sort_values(
        ['month', 'Sector', 'and_correlation_sum'],
        ascending=[True, True, False]
    )

    # Track the last header printed so each month/sector is announced once
    current_month = None
    current_sector = None

    for _, row in sorted_results.iterrows():
        month_str = row['month'].strftime('%Y-%m')

        if month_str != current_month:
            print(f"\n=== Month: {month_str} ===")
            current_month = month_str
            # Force the sector header to be re-printed under the new month
            current_sector = None

        if row['Sector'] != current_sector:
            print(f"\nSector: {row['Sector']}")
            current_sector = row['Sector']

        # int() keeps the 'd' format valid even if the count arrives as a float
        print(f"  Author: {row['author']:<30} "
              f"Correlation AND Sum of Performance: {row['and_correlation_sum']:6.3f} "
              f"Number of Predictions: {int(row['count']):3d}")

# Example usage
def main(df, start_date='2014-06-01', end_date='2024-05-31', lookback_period=12):
    """
    Run the rolling analysis on df, print the report and return the results.

    Args:
        df: Input DataFrame. Its 'date' column is converted to datetime
            in place, so the caller's frame is modified.
        start_date: First date to evaluate (string or datetime-like).
        end_date: Last date to evaluate (string or datetime-like).
        lookback_period: Training window length in months.

    Returns:
        The DataFrame produced by run_rolling_analysis.
    """
    # Convert date column to datetime if it's not already
    df['date'] = pd.to_datetime(df['date'])

    # Run the analysis
    results = run_rolling_analysis(
        df,
        start_date=start_date,
        end_date=end_date,
        lookback_period=lookback_period
    )

    # Print results
    print_monthly_sector_author_performance(results)

    return results

In [None]:
# Read the pre-filtered dataset, keeping only rows where both return
# columns are populated
df = pd.read_csv('input_data/filtered_data_2.csv').dropna(
    subset=['expected_return', 'actual_return']
)

# Preview the first rows as a quick check that the load worked
print(df.head())

In [None]:
# Print a concise summary of the loaded frame (columns, dtypes, non-null counts)
df.info()

In [None]:
# Run the rolling analysis on the loaded data; main() also prints the report
df2 = main(df)

In [None]:
# Descriptive statistics for the numeric columns of the analysis results
df2.describe()