In [1]:
from datetime import timedelta
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
# Load the filtered DataFrame from the CSV file
# Define the file path
file_path = r'C:\Users\disch\Desktop\CiteSert\Project_1\Version2_40krows_n5\input_data\filtered_data_2.csv'

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

df = df.dropna(subset=['expected_return', 'actual_return'])


# Display the first few rows of the DataFrame to verify it loaded correctly
print(df.head())

          date                    Company Name      Symbol  \
0   2015-11-16  speciality restaurants limited  SPECIALITY   
1   2015-08-13  speciality restaurants limited  SPECIALITY   
8   2024-05-17           eicher motors limited   EICHERMOT   
9   2024-05-13           eicher motors limited   EICHERMOT   
11  2024-05-13           eicher motors limited   EICHERMOT   

                      author         Sector  expected_return  actual_return  
0                      Karvy     Technology         9.432421      -0.018033  
1        Reliance Securities     Technology         7.189542       0.018065  
8   BOB Capital Markets Ltd.  Manufacturing        -1.297429       0.030817  
9               ICICI Direct  Manufacturing        18.090371       0.001189  
11               Axis Direct  Manufacturing        10.253465       0.001189  


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 35107 entries, 0 to 41819
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   date             35107 non-null  object 
 1   Company Name     35107 non-null  object 
 2   Symbol           35107 non-null  object 
 3   author           35107 non-null  object 
 4   Sector           35107 non-null  object 
 5   expected_return  35107 non-null  float64
 6   actual_return    35107 non-null  float64
dtypes: float64(2), object(5)
memory usage: 2.1+ MB


In [4]:
def calculate_correlation(expected_returns, actual_returns):
    """
    Calculate the correlation between expected and actual returns,
    ensuring no division by zero occurs.
    
    Args:
        expected_returns (pd.Series): Expected returns.
        actual_returns (pd.Series): Actual returns.

    Returns:
        float: Correlation coefficient, or NaN if not calculable.
    """
    # Check standard deviation to avoid division by zero
    if np.std(expected_returns) == 0 or np.std(actual_returns) == 0:
        return np.nan
    
    # Calculate correlation
    return np.corrcoef(expected_returns, actual_returns)[0, 1]

In [5]:
def get_best_authors_by_sector(df, start_date, end_date, corr_threshold, perf_threshold, max_authors=10):
    """
    Identify the best-performing authors per sector within a specific date range based on 
    correlation and mean performance thresholds.
    """
    print("\n### get_best_authors_by_sector ###")
    print(f"Start Date: {start_date}, End Date: {end_date}")
    print(f"Max Authors per Sector: {max_authors}, Correlation Threshold: {corr_threshold}, Performance Threshold: {perf_threshold}")
    
    # Convert date column to datetime if necessary
    df['date'] = pd.to_datetime(df['date'])
    
    # Filter data for the specified date range
    period_df = df[(df['date'] >= start_date) & (df['date'] <= end_date)].copy()
    print(f"Filtered Data Shape: {period_df.shape}")

    if period_df.empty:
        print("No data available for the specified date range.")
        return pd.DataFrame(columns=['Sector', 'author', 'correlation', 'mean_performance'])

    # Calculate performance metrics for each sector-author pair
    author_sector_performance = []
    grouped = period_df.groupby(['Sector', 'author'])
    print(f"Number of Sector-Author Groups: {len(grouped)}")

    for (sector, author), group in grouped:
        # print(f"\nProcessing Sector: {sector}, Author: {author}")
        # print(f"Group Size: {len(group)}")

        if len(group) < 2:  # Ensure enough data points for correlation calculation
            print("Not enough data points for correlation calculation. Skipping.")
            continue

        # Compute correlation between expected and actual returns
        correlation = calculate_correlation(group['expected_return'], group['actual_return'])
        # print(f"Correlation: {correlation}")

        # Compute mean performance
        group['performance'] = np.sign(group['expected_return']) * group['actual_return']
        mean_performance = group['performance'].mean()
        # print(f"Mean Performance: {mean_performance}")

        # Store results if thresholds are met
        if not np.isnan(correlation) and correlation > corr_threshold and mean_performance > perf_threshold:
            print("Thresholds met. Adding to results.")
            author_sector_performance.append({
                'Sector': sector,
                'author': author,
                'correlation': correlation,
                'mean_performance': mean_performance
            })
        else:
            print("Thresholds not met. Skipping.")

    # Create a DataFrame from the results
    performance_df = pd.DataFrame(author_sector_performance)
    print(f"\nPerformance DataFrame Shape: {performance_df.shape}")

    if performance_df.empty:
        print("No authors met the thresholds.")
        return pd.DataFrame(columns=['Sector', 'author', 'correlation', 'mean_performance'])

    # Sort authors by correlation and mean performance, and retain top authors per sector
    print("Sorting and selecting top authors by sector.")
    top_authors_df = (performance_df
                      .sort_values(['correlation', 'mean_performance'], ascending=[False, False])
                      .groupby('Sector')
                      .head(max_authors)
                      .reset_index(drop=True))
    print(f"Top Authors DataFrame Shape: {top_authors_df.shape}")
    print(f"Top Authors DataFrame Sample:\n{top_authors_df.head()}")

    return top_authors_df

In [6]:
def calculate_monthly_performance(df, best_authors_df, target_month):
    """
    Calculate performance for each author-sector pair for a specific month
    using sign(expected_return) * actual_return
    """
    print(f"\nCalculating monthly performance for: {target_month.strftime('%Y-%m')}")

    # Define month range
    month_start = target_month.replace(day=1)
    month_end = (month_start + pd.offsets.MonthEnd(1))
    print(f"Month Start: {month_start}, Month End: {month_end}")

    # Filter data for the target month
    mask = (df['date'] >= month_start) & (df['date'] <= month_end)
    month_df = df[mask].copy()
    print(f"Filtered Data for Month: {month_df.shape[0]} rows")

    # Merge with best authors
    print(f"Best Authors DataFrame Shape: {best_authors_df.shape}")
    month_df = month_df.merge(
        best_authors_df[['Sector', 'author']],
        on=['Sector', 'author'],
        how='inner'
    )
    print(f"Data After Merging with Best Authors: {month_df.shape[0]} rows")

    # Calculate performance
    month_df['performance'] = np.sign(month_df['expected_return']) * month_df['actual_return']
    print(f"Performance Column Calculated. Sample Data:\n{month_df[['Sector', 'author', 'performance']].head()}")

    # Group and summarize performance
    performance_summary = (month_df.groupby(['Sector', 'author'])
                           ['performance']
                           .agg(mean='mean', count='count')
                           .reset_index())
    performance_summary['month'] = month_start
    print(f"Performance Summary for {target_month.strftime('%Y-%m')}:\n{performance_summary.head()}")

    return performance_summary

def run_rolling_analysis(df, start_date, end_date, lookback_period=12, a=0.5, b=0.5):
    """
    Run the rolling analysis month by month
    """
    print("\nStarting Rolling Analysis")
    print(f"Start Date: {start_date}, End Date: {end_date}, Lookback Period: {lookback_period} months")
    print(f"Thresholds: Correlation (a={a}), Performance (b={b})")

    # Convert dates if they're strings
    start_date = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)

    all_monthly_performance = []
    current_date = start_date

    while current_date <= end_date:
        print(f"\nProcessing Month: {current_date.strftime('%Y-%m')}")
        
        # Define training period
        training_end = current_date - timedelta(days=1)
        training_start = training_end - pd.DateOffset(months=lookback_period)
        print(f"Training Period: Start {training_start}, End {training_end}")

        # Get best authors
        best_authors = get_best_authors_by_sector(df, training_start, training_end, a, b)
        print(f"Best Authors DataFrame Shape: {best_authors.shape}")

        # Calculate performance for the current month
        if not best_authors.empty:
            monthly_perf = calculate_monthly_performance(df, best_authors, current_date)
            all_monthly_performance.append(monthly_perf)

            print(f"Monthly Performance for {current_date.strftime('%Y-%m')}:\n", monthly_perf.head())
        else:
            print(f"No Best Authors Found for {current_date.strftime('%Y-%m')}")

        # Move to next month
        current_date = current_date + pd.DateOffset(months=1)

    # Combine all results
    if all_monthly_performance:
        combined_results = pd.concat(all_monthly_performance, ignore_index=True)
        print(f"\nCombined Performance DataFrame Shape: {combined_results.shape}")
        return combined_results
    else:
        print("No performance data collected.")
        return pd.DataFrame()


In [7]:
def main(df):
    # Convert date column to datetime if it's not already
    df['date'] = pd.to_datetime(df['date'])
    
    # Define the grid for correlation and performance thresholds
    correlation_thresholds = np.arange(0, 1.0, 0.1)
    performance_thresholds = np.arange(0, 1.0, 0.1)
    
    # Initialize a dictionary to store results for each iteration
    all_results = {}
    iteration_results = []
    iteration = 0

    # Iterate over the grid of thresholds
    for a in tqdm(correlation_thresholds, desc="Correlation Thresholds"):
        for b in tqdm(performance_thresholds, desc="Performance Thresholds", leave=False):
            mean_performance_values = []
            
            # Run the rolling analysis
            results_df = run_rolling_analysis(
                df,
                start_date='2014-06-01',
                end_date='2022-12-31',
                lookback_period=12,
                a=a,
                b=b
            )
                
            # Calculate mean performance for the iteration
            mean_of_mean_performance = results_df['mean'].mean()
            
            # Save the mean performance value
            mean_performance_values.append(mean_of_mean_performance)
            
            # Save the results dataframe for the iteration
            iteration_key = f"iter_{iteration}_a_{a}_b_{b}"
            all_results[iteration_key] = results_df
            
            # Increment iteration counter
            iteration += 1

            
            print(f"Iteration {iteration}: Current Mean of Mean Performance = {mean_of_mean_performance:.4f}")
        
        # Store mean performance across all iterations for this grid combination
        avg_performance = np.nanmean(mean_performance_values)
        iteration_results.append({
            'correlation_threshold': a,
            'performance_threshold': b,
            'average_performance': avg_performance
        })

    # Convert iteration results to a DataFrame for analysis
    grid_performance_df = pd.DataFrame(iteration_results)
    
    return all_results, grid_performance_df


In [None]:
all_results, grid_performance_df = main(df)

Correlation Thresholds:   0%|                                                                   | 0/10 [00:00<?, ?it/s]
Performance Thresholds:   0%|                                                                   | 0/10 [00:00<?, ?it/s][A


Starting Rolling Analysis
Start Date: 2014-06-01, End Date: 2022-12-31, Lookback Period: 12 months
Thresholds: Correlation (a=0.0), Performance (b=0.0)

Processing Month: 2014-06
Training Period: Start 2013-05-31 00:00:00, End 2014-05-31 00:00:00

### get_best_authors_by_sector ###
Start Date: 2013-05-31 00:00:00, End Date: 2014-05-31 00:00:00
Max Authors per Sector: 10, Correlation Threshold: 0.0, Performance Threshold: 0.0
Filtered Data Shape: (129, 7)
Number of Sector-Author Groups: 24
Thresholds not met. Skipping.
Thresholds not met. Skipping.
Thresholds met. Adding to results.
Not enough data points for correlation calculation. Skipping.
Not enough data points for correlation calculation. Skipping.
Thresholds not met. Skipping.
Not enough data points for correlation calculation. Skipping.
Thresholds met. Adding to results.
Thresholds not met. Skipping.
Not enough data points for correlation calculation. Skipping.
Not enough data points for correlation calculation. Skipping.
Not e