In [29]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import tqdm


def get_best_authors_per_sector(df, start_date, end_date, max_authors=10):
    """
    Find the best performing authors per sector based on correlation between
    expected and actual returns within the specified date range.
    """
    # Filter data for the specified period
    mask = (df['date'] >= start_date) & (df['date'] <= end_date)
    period_df = df[mask].copy()
    
    # Calculate correlation for each author-sector pair
    author_sector_performance = []
    
    for (sector, author), group in period_df.groupby(['Sector', 'author']):
        if len(group) < 2:  # Skip if not enough data points
            continue
            
        correlation = np.corrcoef(group['expected_return'], group['actual_return'])[0, 1]
        
        if not np.isnan(correlation) and correlation > 0:
            author_sector_performance.append({
                'Sector': sector,
                'author': author,
                'correlation': correlation
            })
    
    # Convert to DataFrame and get top authors per sector
    performance_df = pd.DataFrame(author_sector_performance)
    if len(performance_df) == 0:
        return pd.DataFrame(columns=['Sector', 'author', 'correlation'])
        
    return (performance_df.sort_values('correlation', ascending=False)
            .groupby('Sector')
            .head(max_authors)
            .reset_index(drop=True))

def calculate_monthly_performance(df, best_authors_df, target_month):
    """
    Calculate performance for each author-sector pair for a specific month
    using sign(expected_return) * actual_return
    """
    # Filter data for the target month and best authors
    month_start = target_month.replace(day=1)
    month_end = (month_start + pd.offsets.MonthEnd(1))
    
    mask = (df['date'] >= month_start) & (df['date'] <= month_end)
    month_df = df[mask].copy()
    
    # Filter for only the best author-sector pairs
    month_df = month_df.merge(
        best_authors_df[['Sector', 'author']],
        on=['Sector', 'author'],
        how='inner'
    )
    
    # Calculate performance
    month_df['performance'] = np.sign(month_df['cumulative_expected_return']) * month_df['cumulative_actual_return']
    
    # Calculate average performance per author-sector
    performance_summary = (month_df.groupby(['Sector', 'author'])
                         ['performance']
                         .agg(['mean', 'count'])
                         .reset_index())
    
    performance_summary['month'] = month_start
    return performance_summary

def run_rolling_analysis(df, start_date, end_date, lookback_period=12):
    """
    Run the rolling analysis month by month
    """
    # Convert dates if they're strings
    start_date = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)
    
    all_monthly_performance = []
    
    current_date = start_date
    while current_date <= end_date:
        # Calculate training period
        training_end = current_date - timedelta(days=1)
        training_start = training_end - pd.DateOffset(months=lookback_period)
        
        # Get best authors based on the training period
        best_authors = get_best_authors_per_sector(
            df,
            training_start,
            training_end
        )
        
        # Calculate performance for the current month
        if not best_authors.empty:
            monthly_perf = calculate_monthly_performance(
                df,
                best_authors,
                current_date
            )
            all_monthly_performance.append(monthly_perf)
        
        # Move to next month
        current_date = current_date + pd.DateOffset(months=1)
    
    # Combine all results
    if all_monthly_performance:
        return pd.concat(all_monthly_performance, ignore_index=True)
    return pd.DataFrame()

def print_monthly_sector_author_performance(results_df):
    """
    Print formatted results showing performance by month, sector, and author
    """
    # Sort by month, sector, and mean performance
    sorted_results = results_df.sort_values(
        ['month', 'Sector', 'mean'],
        ascending=[True, True, False]
    )
    
    # Format and print results
    current_month = None
    current_sector = None
    
    for _, row in sorted_results.iterrows():
        month_str = row['month'].strftime('%Y-%m')
        
        if month_str != current_month:
            print(f"\n=== Month: {month_str} ===")
            current_month = month_str
            current_sector = None
        
        if row['Sector'] != current_sector:
            print(f"\nSector: {row['Sector']}")
            current_sector = row['Sector']
        
        print(f"  Author: {row['author']:<30} "
              f"Average Performance: {row['mean']:6.3f} "
              f"Number of Predictions: {row['count']:3d}")

# Example usage
def main(df):
    # Convert date column to datetime if it's not already
    df['date'] = pd.to_datetime(df['date'])
    
    # Run the analysis
    results = run_rolling_analysis(
        df,
        start_date='2014-06-01',
        end_date='2024-05-31',
        lookback_period=12
    )
    
    # Print results
    print_monthly_sector_author_performance(results)
    
    return results

In [32]:
# Load the filtered DataFrame from the CSV file
df = pd.read_csv('data/cumulative_performance.csv')
df = df.dropna(subset=['cumulative_expected_return', 'cumulative_actual_return'])


# Display the first few rows of the DataFrame to verify it loaded correctly
print(df.head())

   Unnamed: 0        date                    Company Name      Symbol  \
0           0  2015-11-16  speciality restaurants limited  SPECIALITY   
1           1  2015-08-13  speciality restaurants limited  SPECIALITY   
2          36  2022-11-21           eicher motors limited   EICHERMOT   
3          37  2022-11-15           eicher motors limited   EICHERMOT   
4          39  2022-11-11           eicher motors limited   EICHERMOT   

                     author         Sector  expected_return  actual_return  \
0                     Karvy     Technology         9.432421      -0.018033   
1       Reliance Securities     Technology         7.189542       0.018065   
2  BOB Capital Markets Ltd.  Manufacturing         4.865921       0.018030   
3        Geojit BNP Paribas  Manufacturing        21.200501      -0.039829   
4             Motilal Oswal  Manufacturing        17.909451      -0.041558   

   cumulative_expected_return  cumulative_actual_return  
0                    9.432421     

In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27656 entries, 0 to 27655
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Unnamed: 0                  27656 non-null  int64  
 1   date                        27656 non-null  object 
 2   Company Name                27656 non-null  object 
 3   Symbol                      27656 non-null  object 
 4   author                      27656 non-null  object 
 5   Sector                      27656 non-null  object 
 6   expected_return             27656 non-null  float64
 7   actual_return               27656 non-null  float64
 8   cumulative_expected_return  27656 non-null  float64
 9   cumulative_actual_return    27656 non-null  float64
dtypes: float64(4), int64(1), object(5)
memory usage: 2.1+ MB


In [34]:
main(df)

  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stdd


=== Month: 2014-06 ===

Sector: Consumer Goods
  Author: Chola Wealth Direct            Average Performance: -0.004 Number of Predictions:   1

Sector: Energy
  Author: Chola Wealth Direct            Average Performance:  0.085 Number of Predictions:   1

Sector: Finance
  Author: Chola Wealth Direct            Average Performance:  0.139 Number of Predictions:   1

Sector: Healthcare
  Author: Chola Wealth Direct            Average Performance:  0.012 Number of Predictions:   1

=== Month: 2014-07 ===

Sector: Finance
  Author: Chola Wealth Direct            Average Performance:  0.011 Number of Predictions:   7

Sector: Manufacturing
  Author: Chola Wealth Direct            Average Performance: -0.035 Number of Predictions:   3

Sector: Technology
  Author: Chola Wealth Direct            Average Performance:  0.037 Number of Predictions:   1

=== Month: 2014-08 ===

Sector: Consumer Goods
  Author: Chola Wealth Direct            Average Performance:  0.027 Number of Predictions:   2

Unnamed: 0,Sector,author,mean,count,month
0,Consumer Goods,Chola Wealth Direct,-0.003993,1,2014-06-01
1,Energy,Chola Wealth Direct,0.085451,1,2014-06-01
2,Finance,Chola Wealth Direct,0.138710,1,2014-06-01
3,Healthcare,Chola Wealth Direct,0.011628,1,2014-06-01
4,Finance,Chola Wealth Direct,0.010596,7,2014-07-01
...,...,...,...,...,...
2469,Transportation,Motilal Oswal,0.009701,1,2022-12-01
2470,Unknown,Motilal Oswal,0.007035,2,2022-12-01
2471,Unknown,Prabhudas Lilladhar,0.014131,1,2022-12-01
2472,Utilities,ICICI Direct,0.016698,1,2022-12-01


In [43]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# [Previous functions remain the same: get_best_authors_per_sector, calculate_monthly_performance, run_rolling_analysis, print_monthly_sector_author_performance]

def save_results_to_csv(results_df, base_filename="cumulative_performance"):
    """
    Save results to CSV files - both detailed and summary formats
    """
    # Save detailed results
    detailed_filename = f"{base_filename}_detailed.csv"
    results_df.to_csv(detailed_filename, index=False)
    print(f"\nDetailed results saved to: {detailed_filename}")
    
    # Create and save summary results
    summary_df = (results_df.groupby(['month', 'Sector', 'author'])
                 .agg({
                     'mean': 'mean',
                     'count': 'sum'
                 })
                 .reset_index()
                 .sort_values(['month', 'Sector', 'mean'], ascending=[True, True, False]))
    
    # Add cumulative performance metrics
    summary_df['cumulative_performance'] = summary_df.groupby(['author', 'Sector'])['mean'].cumsum()
    summary_df['avg_monthly_predictions'] = summary_df.groupby(['author', 'Sector'])['count'].transform('mean')
    
    summary_filename = f"{base_filename}_summary.csv"
    summary_df.to_csv(summary_filename, index=False)
    print(f"Summary results saved to: {summary_filename}")
    
    return detailed_filename, summary_filename

def main(df, base_filename="cumulative_performance"):
    # Convert date column to datetime if it's not already
    df['date'] = pd.to_datetime(df['date'])
    
    # Run the analysis
    results = run_rolling_analysis(
        df,
        start_date='2014-06-01',
        end_date='2024-05-31',
        lookback_period=12
    )
    
    # Print results to console
    print_monthly_sector_author_performance(results)
    
    # Save results to CSV files
    detailed_file, summary_file = save_results_to_csv(results, base_filename)
    
    return results, detailed_file, summary_file

In [44]:
results, detailed_file, summary_file = main(df)

  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stdd


=== Month: 2014-06 ===

Sector: Consumer Goods
  Author: Chola Wealth Direct            Average Performance: -0.004 Number of Predictions:   1

Sector: Energy
  Author: Chola Wealth Direct            Average Performance:  0.085 Number of Predictions:   1

Sector: Finance
  Author: Chola Wealth Direct            Average Performance:  0.139 Number of Predictions:   1

Sector: Healthcare
  Author: Chola Wealth Direct            Average Performance:  0.012 Number of Predictions:   1

=== Month: 2014-07 ===

Sector: Finance
  Author: Chola Wealth Direct            Average Performance:  0.011 Number of Predictions:   7

Sector: Manufacturing
  Author: Chola Wealth Direct            Average Performance: -0.035 Number of Predictions:   3

Sector: Technology
  Author: Chola Wealth Direct            Average Performance:  0.037 Number of Predictions:   1

=== Month: 2014-08 ===

Sector: Consumer Goods
  Author: Chola Wealth Direct            Average Performance:  0.027 Number of Predictions:   2

In [45]:
detailed_file

'cumulative_performance_detailed.csv'

In [49]:
cum_df = pd.read_csv('cumulative_performance_summary.csv')

In [52]:
cum_df['cumulative_performance'].mean()

0.03429635130742994

In [54]:
cum_df.describe()

Unnamed: 0,mean,count,cumulative_performance,avg_monthly_predictions
count,2474.0,2474.0,2474.0,2474.0
mean,0.003059,3.578416,0.034296,3.578416
std,0.08871,4.701923,0.156445,3.335923
min,-1.433173,1.0,-1.443296,1.0
25%,-0.019887,1.0,-0.029086,1.428571
50%,0.000828,2.0,0.018266,2.25
75%,0.020129,4.0,0.087766,4.25
max,3.538614,42.0,3.685451,23.333333


In [56]:
cum_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2474 entries, 0 to 2473
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   month                    2474 non-null   object 
 1   Sector                   2474 non-null   object 
 2   author                   2474 non-null   object 
 3   mean                     2474 non-null   float64
 4   count                    2474 non-null   int64  
 5   cumulative_performance   2474 non-null   float64
 6   avg_monthly_predictions  2474 non-null   float64
dtypes: float64(3), int64(1), object(3)
memory usage: 135.4+ KB


In [58]:
grouped = cum_df.groupby('month')['mean'].sum().reset_index()
df[:2474]['mean'].mean() # to consider only until 2022 end

KeyError: 3