In [3]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import tqdm


def get_best_authors_per_sector(df, start_date, end_date, max_authors=10):
    """
    Find the best performing authors per sector based on correlation between
    expected and actual returns within the specified date range.
    """
    # Filter data for the specified period
    mask = (df['date'] >= start_date) & (df['date'] <= end_date)
    period_df = df[mask].copy()
    
    # Calculate correlation for each author-sector pair
    author_sector_performance = []
    
    for (sector, author), group in period_df.groupby(['Sector', 'author']):
        if len(group) < 2:  # Skip if not enough data points
            continue
            
        correlation = np.corrcoef(group['expected_return'], group['actual_return'])[0, 1]
        
        if not np.isnan(correlation) and correlation > 0:
            author_sector_performance.append({
                'Sector': sector,
                'author': author,
                'correlation': correlation
            })
    
    # Convert to DataFrame and get top authors per sector
    performance_df = pd.DataFrame(author_sector_performance)
    if len(performance_df) == 0:
        return pd.DataFrame(columns=['Sector', 'author', 'correlation'])
        
    return (performance_df.sort_values('correlation', ascending=False)
            .groupby('Sector')
            .head(max_authors)
            .reset_index(drop=True))

def calculate_monthly_performance(df, best_authors_df, target_month):
    """
    Calculate performance for each author-sector pair for a specific month
    using sign(expected_return) * actual_return
    """
    # Filter data for the target month and best authors
    month_start = target_month.replace(day=1)
    month_end = (month_start + pd.offsets.MonthEnd(1))
    
    mask = (df['date'] >= month_start) & (df['date'] <= month_end)
    month_df = df[mask].copy()
    
    # Filter for only the best author-sector pairs
    month_df = month_df.merge(
        best_authors_df[['Sector', 'author']],
        on=['Sector', 'author'],
        how='inner'
    )
    
    # Calculate performance
    month_df['performance'] = np.sign(month_df['expected_return']) * month_df['actual_return']
    
    # Calculate average performance per author-sector
    performance_summary = (month_df.groupby(['Sector', 'author'])
                         ['performance']
                         .agg(['mean', 'count'])
                         .reset_index())
    
    performance_summary['month'] = month_start
    return performance_summary

def run_rolling_analysis(df, start_date, end_date, lookback_period=12):
    """
    Run the rolling analysis month by month
    """
    # Convert dates if they're strings
    start_date = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)
    
    all_monthly_performance = []
    
    current_date = start_date
    while current_date <= end_date:
        # Calculate training period
        training_end = current_date - timedelta(days=1)
        training_start = training_end - pd.DateOffset(months=lookback_period)
        
        # Get best authors based on the training period
        best_authors = get_best_authors_per_sector(
            df,
            training_start,
            training_end
        )
        
        # Calculate performance for the current month
        if not best_authors.empty:
            monthly_perf = calculate_monthly_performance(
                df,
                best_authors,
                current_date
            )
            all_monthly_performance.append(monthly_perf)
        
        # Move to next month
        current_date = current_date + pd.DateOffset(months=1)
    
    # Combine all results
    if all_monthly_performance:
        return pd.concat(all_monthly_performance, ignore_index=True)
    return pd.DataFrame()

def print_monthly_sector_author_performance(results_df):
    """
    Print formatted results showing performance by month, sector, and author
    """
    # Sort by month, sector, and mean performance
    sorted_results = results_df.sort_values(
        ['month', 'Sector', 'mean'],
        ascending=[True, True, False]
    )
    
    # Format and print results
    current_month = None
    current_sector = None
    
    for _, row in sorted_results.iterrows():
        month_str = row['month'].strftime('%Y-%m')
        
        if month_str != current_month:
            print(f"\n=== Month: {month_str} ===")
            current_month = month_str
            current_sector = None
        
        if row['Sector'] != current_sector:
            print(f"\nSector: {row['Sector']}")
            current_sector = row['Sector']
        
        print(f"  Author: {row['author']:<30} "
              f"Average Performance: {row['mean']:6.3f} "
              f"Number of Predictions: {row['count']:3d}")

# Example usage
def main(df):
    # Convert date column to datetime if it's not already
    df['date'] = pd.to_datetime(df['date'])
    
    # Run the analysis
    results = run_rolling_analysis(
        df,
        start_date='2014-06-01',
        end_date='2024-05-31',
        lookback_period=12
    )
    
    # Print results
    print_monthly_sector_author_performance(results)
    
    return results

In [4]:
# Load the filtered DataFrame from the CSV file
df = pd.read_csv('data/filtered_data_2_n1.csv')
df = df.dropna(subset=['expected_return', 'actual_return'])


# Display the first few rows of the DataFrame to verify it loaded correctly
print(df.head())

          date                    Company Name      Symbol  \
0   2015-11-16  speciality restaurants limited  SPECIALITY   
1   2015-08-13  speciality restaurants limited  SPECIALITY   
8   2024-05-17           eicher motors limited   EICHERMOT   
9   2024-05-13           eicher motors limited   EICHERMOT   
11  2024-05-13           eicher motors limited   EICHERMOT   

                      author         Sector  expected_return  actual_return  
0                      Karvy     Technology         9.432421       0.025820  
1        Reliance Securities     Technology         7.189542      -0.013226  
8   BOB Capital Markets Ltd.  Manufacturing        -1.297429       0.004935  
9               ICICI Direct  Manufacturing        18.090371       0.013530  
11               Axis Direct  Manufacturing        10.253465       0.013530  


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 35107 entries, 0 to 41819
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   date             35107 non-null  object 
 1   Company Name     35107 non-null  object 
 2   Symbol           35107 non-null  object 
 3   author           35107 non-null  object 
 4   Sector           35107 non-null  object 
 5   expected_return  35107 non-null  float64
 6   actual_return    35107 non-null  float64
dtypes: float64(2), object(5)
memory usage: 2.1+ MB


In [6]:
main(df)

  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stdd


=== Month: 2014-06 ===

Sector: Agriculture
  Author: Chola Wealth Direct            Average Performance:  0.006 Number of Predictions:   1

Sector: Energy
  Author: Chola Wealth Direct            Average Performance: -0.008 Number of Predictions:   1

Sector: Finance
  Author: Chola Wealth Direct            Average Performance:  0.058 Number of Predictions:   1

Sector: Healthcare
  Author: Chola Wealth Direct            Average Performance: -0.010 Number of Predictions:   1

=== Month: 2014-07 ===

Sector: Finance
  Author: Chola Wealth Direct            Average Performance:  0.005 Number of Predictions:   7

Sector: Healthcare
  Author: Reliance Securities            Average Performance: -0.039 Number of Predictions:   1

Sector: Technology
  Author: Chola Wealth Direct            Average Performance: -0.026 Number of Predictions:   1

=== Month: 2014-08 ===

Sector: Energy
  Author: Chola Wealth Direct            Average Performance:  0.011 Number of Predictions:   1

Sector: Fina

Unnamed: 0,Sector,author,mean,count,month
0,Agriculture,Chola Wealth Direct,0.005850,1,2014-06-01
1,Energy,Chola Wealth Direct,-0.007949,1,2014-06-01
2,Finance,Chola Wealth Direct,0.058065,1,2014-06-01
3,Healthcare,Chola Wealth Direct,-0.009884,1,2014-06-01
4,Finance,Chola Wealth Direct,0.004945,7,2014-07-01
...,...,...,...,...,...
3157,Transportation,ICICI Securities Limited,-0.009449,1,2024-05-01
3158,Unknown,BOB Capital Markets Ltd.,-0.006660,3,2024-05-01
3159,Unknown,ICICI Securities Limited,0.021709,1,2024-05-01
3160,Utilities,Axis Direct,0.018544,1,2024-05-01


In [11]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# [Previous functions remain the same: get_best_authors_per_sector, calculate_monthly_performance, run_rolling_analysis, print_monthly_sector_author_performance]

def save_results_to_csv(results_df, base_filename="cumulative_performance"):
    """
    Save results to CSV files - both detailed and summary formats
    """
    # Save detailed results
    detailed_filename = f"{base_filename}_detailed.csv"
    results_df.to_csv(detailed_filename, index=False)
    print(f"\nDetailed results saved to: {detailed_filename}")
    
    # Create and save summary results
    summary_df = (results_df.groupby(['month', 'Sector', 'author'])
                 .agg({
                     'mean': 'mean',
                     'count': 'sum'
                 })
                 .reset_index()
                 .sort_values(['month', 'Sector', 'mean'], ascending=[True, True, False]))
    
    # Add cumulative performance metrics
    summary_df['cumulative_performance'] = summary_df.groupby(['author', 'Sector'])['mean'].cumsum()
    summary_df['avg_monthly_predictions'] = summary_df.groupby(['author', 'Sector'])['count'].transform('mean')
    
    summary_filename = f"{base_filename}_summary.csv"
    summary_df.to_csv(summary_filename, index=False)
    print(f"Summary results saved to: {summary_filename}")
    
    return detailed_filename, summary_filename

def main(df, base_filename="performance_n1"):
    # Convert date column to datetime if it's not already
    df['date'] = pd.to_datetime(df['date'])
    
    # Run the analysis
    results = run_rolling_analysis(
        df,
        start_date='2014-06-01',
        end_date='2024-05-31',
        lookback_period=12
    )
    
    # Print results to console
    print_monthly_sector_author_performance(results)
    
    # Save results to CSV files
    detailed_file, summary_file = save_results_to_csv(results, base_filename)
    
    return results, detailed_file, summary_file

In [12]:
results, detailed_file, summary_file = main(df)

  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stdd


=== Month: 2014-06 ===

Sector: Agriculture
  Author: Chola Wealth Direct            Average Performance:  0.006 Number of Predictions:   1

Sector: Energy
  Author: Chola Wealth Direct            Average Performance: -0.008 Number of Predictions:   1

Sector: Finance
  Author: Chola Wealth Direct            Average Performance:  0.058 Number of Predictions:   1

Sector: Healthcare
  Author: Chola Wealth Direct            Average Performance: -0.010 Number of Predictions:   1

=== Month: 2014-07 ===

Sector: Finance
  Author: Chola Wealth Direct            Average Performance:  0.005 Number of Predictions:   7

Sector: Healthcare
  Author: Reliance Securities            Average Performance: -0.039 Number of Predictions:   1

Sector: Technology
  Author: Chola Wealth Direct            Average Performance: -0.026 Number of Predictions:   1

=== Month: 2014-08 ===

Sector: Energy
  Author: Chola Wealth Direct            Average Performance:  0.011 Number of Predictions:   1

Sector: Fina

In [13]:
detailed_file

'performance_n1_detailed.csv'

In [15]:
df = pd.read_csv("performance_n1_detailed.csv")

In [16]:
df.head()

Unnamed: 0,Sector,author,mean,count,month
0,Agriculture,Chola Wealth Direct,0.00585,1,2014-06-01
1,Energy,Chola Wealth Direct,-0.007949,1,2014-06-01
2,Finance,Chola Wealth Direct,0.058065,1,2014-06-01
3,Healthcare,Chola Wealth Direct,-0.009884,1,2014-06-01
4,Finance,Chola Wealth Direct,0.004945,7,2014-07-01


In [17]:
df.describe()

Unnamed: 0,mean,count
count,3162.0,3162.0
mean,-0.00076,3.685642
std,0.019541,4.857395
min,-0.143936,1.0
25%,-0.010507,1.0
50%,-0.001379,2.0
75%,0.007938,4.0
max,0.174286,47.0
