In [1]:
from datetime import timedelta
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
# Load the filtered DataFrame from the CSV file
# Define the file path
file_path = r'C:\Users\disch\Desktop\CiteSert\Project_1\Version2_40krows_n5\input_data\filtered_data_2.csv'

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

df = df.dropna(subset=['expected_return', 'actual_return'])


# Display the first few rows of the DataFrame to verify it loaded correctly
print(df.head())

          date                    Company Name      Symbol  \
0   2015-11-16  speciality restaurants limited  SPECIALITY   
1   2015-08-13  speciality restaurants limited  SPECIALITY   
8   2024-05-17           eicher motors limited   EICHERMOT   
9   2024-05-13           eicher motors limited   EICHERMOT   
11  2024-05-13           eicher motors limited   EICHERMOT   

                      author         Sector  expected_return  actual_return  
0                      Karvy     Technology         9.432421      -0.018033  
1        Reliance Securities     Technology         7.189542       0.018065  
8   BOB Capital Markets Ltd.  Manufacturing        -1.297429       0.030817  
9               ICICI Direct  Manufacturing        18.090371       0.001189  
11               Axis Direct  Manufacturing        10.253465       0.001189  


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 35107 entries, 0 to 41819
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   date             35107 non-null  object 
 1   Company Name     35107 non-null  object 
 2   Symbol           35107 non-null  object 
 3   author           35107 non-null  object 
 4   Sector           35107 non-null  object 
 5   expected_return  35107 non-null  float64
 6   actual_return    35107 non-null  float64
dtypes: float64(2), object(5)
memory usage: 2.1+ MB


In [4]:
def calculate_correlation(expected_returns, actual_returns):
    """
    Calculate the correlation between expected and actual returns,
    ensuring no division by zero occurs.
    
    Args:
        expected_returns (pd.Series): Expected returns.
        actual_returns (pd.Series): Actual returns.

    Returns:
        float: Correlation coefficient, or NaN if not calculable.
    """
    # Check standard deviation to avoid division by zero
    if np.std(expected_returns) == 0 or np.std(actual_returns) == 0:
        return np.nan
    
    # Calculate correlation
    return np.corrcoef(expected_returns, actual_returns)[0, 1]

In [5]:
def get_best_authors_by_sector(df, start_date, end_date, corr_threshold, perf_threshold, max_authors=10):
    """
    Identify the best-performing authors per sector within a specific date range based on 
    correlation and mean performance thresholds.
    """
    print("\n### get_best_authors_by_sector ###")
    print(f"Start Date: {start_date}, End Date: {end_date}")
    print(f"Max Authors per Sector: {max_authors}, Correlation Threshold: {corr_threshold}, Performance Threshold: {perf_threshold}")
    
    # Convert date column to datetime if necessary
    df['date'] = pd.to_datetime(df['date'])
    
    # Filter data for the specified date range
    period_df = df[(df['date'] >= start_date) & (df['date'] <= end_date)].copy()
    # print(f"Filtered Data Shape: {period_df.shape}")

    if period_df.empty:
        print("No data available for the specified date range.")
        return pd.DataFrame(columns=['Sector', 'author', 'correlation', 'mean_performance'])

    # Calculate performance metrics for each sector-author pair
    author_sector_performance = []
    grouped = period_df.groupby(['Sector', 'author'])
    # print(f"Number of Sector-Author Groups: {len(grouped)}")

    for (sector, author), group in grouped:
        # print(f"\nProcessing Sector: {sector}, Author: {author}")
        # print(f"Group Size: {len(group)}")

        if len(group) < 2:  # Ensure enough data points for correlation calculation
            # print("Not enough data points for correlation calculation. Skipping.")
            continue

        # Compute correlation between expected and actual returns
        correlation = calculate_correlation(group['expected_return'], group['actual_return'])
        # print(f"Correlation: {correlation}")

        # Compute mean performance
        group['performance'] = np.sign(group['expected_return']) * group['actual_return']
        mean_performance = group['performance'].mean()
        # print(f"Mean Performance: {mean_performance}")

        # Store results if thresholds are met
        if not np.isnan(correlation) and correlation > corr_threshold and mean_performance > perf_threshold:
            # print("Thresholds met. Adding to results.")
            author_sector_performance.append({
                'Sector': sector,
                'author': author,
                'correlation': correlation,
                'mean_performance': mean_performance
            })
        else:
            continue
            # print("Thresholds not met. Skipping.")

    # Create a DataFrame from the results
    performance_df = pd.DataFrame(author_sector_performance)
    # print(f"\nPerformance DataFrame Shape: {performance_df.shape}")

    if performance_df.empty:
        print("No authors met the thresholds.")
        return pd.DataFrame(columns=['Sector', 'author', 'correlation', 'mean_performance'])

    # Sort authors by correlation and mean performance, and retain top authors per sector
    # print("Sorting and selecting top authors by sector.")
    top_authors_df = (performance_df
                      .sort_values(['correlation', 'mean_performance'], ascending=[False, False])
                      .groupby('Sector')
                      .head(max_authors)
                      .reset_index(drop=True))
    # print(f"Top Authors DataFrame Shape: {top_authors_df.shape}")
    # print(f"Top Authors DataFrame Sample:\n{top_authors_df}")

    return top_authors_df

In [6]:
def calculate_monthly_performance(df, best_authors_df, target_month):
    """
    Calculate performance for each author-sector pair for a specific month
    using sign(expected_return) * actual_return
    """
    print(f"\nCalculating monthly performance for: {target_month.strftime('%Y-%m')}")

    # Define month range
    month_start = target_month.replace(day=1)
    month_end = (month_start + pd.offsets.MonthEnd(1))
    # print(f"Month Start: {month_start}, Month End: {month_end}")

    # Filter data for the target month
    mask = (df['date'] >= month_start) & (df['date'] <= month_end)
    month_df = df[mask].copy()
    # print(f"Filtered Data for Month: {month_df.shape[0]} rows")

    # Merge with best authors
    # print(f"Best Authors DataFrame Shape: {best_authors_df.shape}")
    month_df = month_df.merge(
        best_authors_df[['Sector', 'author']],
        on=['Sector', 'author'],
        how='inner'
    )
    # print(f"Data After Merging with Best Authors: {month_df.shape[0]} rows")

    # Calculate performance
    month_df['performance'] = np.sign(month_df['expected_return']) * month_df['actual_return']
    # print(f"Performance Column Calculated. Sample Data:\n{month_df[['Sector', 'author', 'performance']].head()}")

    # Group and summarize performance
    performance_summary = (month_df.groupby(['Sector', 'author'])
                           ['performance']
                           .agg(mean_performance='mean', count='count')
                           .reset_index())
    performance_summary['month'] = month_start
    # print(f"Performance Summary for {target_month.strftime('%Y-%m')}:\n{performance_summary.head()}")

    return performance_summary

def run_rolling_analysis(df, start_date, end_date, a, b, lookback_period=12):
    """
    Run the rolling analysis month by month
    """
    # print("\nStarting Rolling Analysis")
    # print(f"Start Date: {start_date}, End Date: {end_date}, Lookback Period: {lookback_period} months")
    # print(f"Thresholds: Correlation (a={a}), Performance (b={b})")

    # Convert dates if they're strings
    start_date = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)

    all_monthly_performance = []
    current_date = start_date

    while current_date <= end_date:
        # print(f"\nProcessing Month: {current_date.strftime('%Y-%m')}")
        
        # Define training period
        training_end = current_date - timedelta(days=1)
        training_start = training_end - pd.DateOffset(months=lookback_period)
        # print(f"Training Period: Start {training_start}, End {training_end}")

        # Get best authors
        best_authors = get_best_authors_by_sector(df, training_start, training_end, a, b)
        # print(f"Best Authors DataFrame Shape: {best_authors.shape}")

        # Calculate performance for the current month
        if not best_authors.empty:
            monthly_perf = calculate_monthly_performance(df, best_authors, current_date)
            all_monthly_performance.append(monthly_perf)

            print(f"Monthly Performance for {current_date.strftime('%Y-%m')}:\n", monthly_perf.head())
        else:
            print(f"No Best Authors Found for {current_date.strftime('%Y-%m')}")

        # Move to next month
        current_date = current_date + pd.DateOffset(months=1)

    # Combine all results
    if all_monthly_performance:
        combined_results = pd.concat(all_monthly_performance, ignore_index=True)
        # print(f"\nCombined Performance DataFrame Shape: {combined_results.shape}")
        return combined_results
    else:
        print("No performance data collected.")
        return pd.DataFrame()


In [7]:
def main(df):
    """
    Main function to analyze performance based on correlation and performance thresholds
    using a grid search approach.
    """
    print("\n### MAIN FUNCTION STARTED ###")

    # Convert date column to datetime if it's not already
    print("\nConverting 'date' column to datetime format.")
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    
    # Define the grid for correlation and performance thresholds
    print("\nDefining thresholds grid for correlation and performance.")
    correlation_thresholds = np.arange(0, 1.0, 0.05)
    performance_thresholds = np.arange(0, 0.3, 0.05)
    print(f"Correlation Thresholds: {correlation_thresholds}")
    print(f"Performance Thresholds: {performance_thresholds}")
    
    # Initialize a dictionary to store results for each iteration
    all_results = {}
    iteration_results = []
    iteration = 0

    print("\n### Starting grid search ###")
    
    # Iterate over the grid of thresholds
    for a in tqdm(correlation_thresholds, desc="Correlation Thresholds"):
        for b in tqdm(performance_thresholds, desc="Performance Thresholds", leave=False):
            print(f"\nGrid Combination - Correlation Threshold: {a}, Performance Threshold: {b}")
            
            # Run the rolling analysis
            try:
                print("Running rolling analysis for the current thresholds.")
                results_df = run_rolling_analysis(
                    df,
                    start_date='2014-06-01',
                    end_date='2022-12-31',
                    lookback_period=12,
                    a=a,
                    b=b
                )
                print(f"Rolling Analysis Completed. Results Shape: {results_df.shape}")
            except Exception as e:
                print(f"Error during rolling analysis for a={a}, b={b}: {e}")
                results_df = pd.DataFrame()  # Empty DataFrame in case of errors
            
            # Calculate mean performance for the iteration
            if not results_df.empty and 'mean_performance' in results_df.columns:
                mean_of_mean_performance = results_df['mean_performance'].mean()
                print(f"Mean of Mean Performance for Current Thresholds: {mean_of_mean_performance:.4f}")
            else:
                print("Column 'mean_performance' not found or results_df is empty. Assigning NaN.")
                mean_of_mean_performance = np.nan
            
            # Save the results DataFrame for the iteration
            iteration_key = f"iter_{iteration}_a_{a}_b_{b}"
            all_results[iteration_key] = results_df
            print(f"Iteration {iteration_key}: Results Saved.")
            
            # Append the iteration result
            iteration_results.append({
                'correlation_threshold': a,
                'performance_threshold': b,
                'mean_of_mean_performance': mean_of_mean_performance
            })
            
            # Increment iteration counter
            iteration += 1

    # Convert iteration results to a DataFrame for analysis
    print("\nConverting iteration results to a DataFrame for final analysis.")
    grid_performance_df = pd.DataFrame(iteration_results)
    print(f"Grid Performance DataFrame Shape: {grid_performance_df.shape}")
    print(f"Grid Performance DataFrame Head:\n{grid_performance_df.head()}")

    # Handle case where no valid data is available
    if grid_performance_df.empty:
        print("\nNo valid data available. Ensure the input data and thresholds are correct.")
    else:
        print("\nGrid Performance DataFrame successfully created.")

    print("\n### MAIN FUNCTION COMPLETED ###")
    return all_results, grid_performance_df


In [None]:
all_results, grid_performance_df = main(df)


### MAIN FUNCTION STARTED ###

Converting 'date' column to datetime format.

Defining thresholds grid for correlation and performance.
Correlation Thresholds: [0.   0.05 0.1  0.15 0.2  0.25 0.3  0.35 0.4  0.45 0.5  0.55 0.6  0.65
 0.7  0.75 0.8  0.85 0.9  0.95]
Performance Thresholds: [0.   0.05 0.1  0.15 0.2  0.25]

### Starting grid search ###


Correlation Thresholds:   0%|                                                                   | 0/20 [00:00<?, ?it/s]
Performance Thresholds:   0%|                                                                    | 0/6 [00:00<?, ?it/s][A


Grid Combination - Correlation Threshold: 0.0, Performance Threshold: 0.0
Running rolling analysis for the current thresholds.

### get_best_authors_by_sector ###
Start Date: 2013-05-31 00:00:00, End Date: 2014-05-31 00:00:00
Max Authors per Sector: 10, Correlation Threshold: 0.0, Performance Threshold: 0.0

Calculating monthly performance for: 2014-06
Monthly Performance for 2014-06:
            Sector               author  mean_performance  count      month
0  Consumer Goods  Chola Wealth Direct         -0.003993      1 2014-06-01

### get_best_authors_by_sector ###
Start Date: 2013-06-30 00:00:00, End Date: 2014-06-30 00:00:00
Max Authors per Sector: 10, Correlation Threshold: 0.0, Performance Threshold: 0.0

Calculating monthly performance for: 2014-07
Monthly Performance for 2014-07:
           Sector               author  mean_performance  count      month
0  Manufacturing  Chola Wealth Direct         -0.034603      3 2014-07-01
1     Technology  Chola Wealth Direct          0.0


Performance Thresholds:  17%|██████████                                                  | 1/6 [00:32<02:41, 32.32s/it][A


Calculating monthly performance for: 2022-12
Monthly Performance for 2022-12:
        Sector               author  mean_performance  count      month
0     Finance        Motilal Oswal          0.005214      9 2022-12-01
1  Healthcare  Prabhudas Lilladhar          0.009733      1 2022-12-01
2  Technology         ICICI Direct         -0.014867      3 2022-12-01
3  Technology         IDBI Capital          0.037146      1 2022-12-01
4  Technology        Motilal Oswal         -0.008008      3 2022-12-01
Rolling Analysis Completed. Results Shape: (1908, 5)
Mean of Mean Performance for Current Thresholds: 0.0022
Iteration iter_0_a_0.0_b_0.0: Results Saved.

Grid Combination - Correlation Threshold: 0.0, Performance Threshold: 0.05
Running rolling analysis for the current thresholds.

### get_best_authors_by_sector ###
Start Date: 2013-05-31 00:00:00, End Date: 2014-05-31 00:00:00
Max Authors per Sector: 10, Correlation Threshold: 0.0, Performance Threshold: 0.05

Calculating monthly perform


Performance Thresholds:  33%|████████████████████                                        | 2/6 [01:08<02:18, 34.61s/it][A


Calculating monthly performance for: 2022-12
Monthly Performance for 2022-12:
 Empty DataFrame
Columns: [Sector, author, mean_performance, count, month]
Index: []
Rolling Analysis Completed. Results Shape: (73, 5)
Mean of Mean Performance for Current Thresholds: 0.0044
Iteration iter_1_a_0.0_b_0.05: Results Saved.

Grid Combination - Correlation Threshold: 0.0, Performance Threshold: 0.1
Running rolling analysis for the current thresholds.

### get_best_authors_by_sector ###
Start Date: 2013-05-31 00:00:00, End Date: 2014-05-31 00:00:00
Max Authors per Sector: 10, Correlation Threshold: 0.0, Performance Threshold: 0.1
No authors met the thresholds.
No Best Authors Found for 2014-06

### get_best_authors_by_sector ###
Start Date: 2013-06-30 00:00:00, End Date: 2014-06-30 00:00:00
Max Authors per Sector: 10, Correlation Threshold: 0.0, Performance Threshold: 0.1
No authors met the thresholds.
No Best Authors Found for 2014-07

### get_best_authors_by_sector ###
Start Date: 2013-07-31 00

In [None]:
grid_performance_df

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def plot_heatmap(grid_performance_df):
    # Pivot the data to prepare for the heatmap
    heatmap_data = grid_performance_df.pivot(
        index='correlation_threshold', 
        columns='performance_threshold', 
        values='mean_of_mean_performance'
    )
    
    # Plot the heatmap
    plt.figure(figsize=(10, 8))
    sns.heatmap(
        heatmap_data, 
        annot=True, 
        fmt=".4f", 
        cmap="coolwarm", 
        cbar_kws={'label': 'Average Performance'}
    )
    plt.title('Grid Search Heatmap of Performance')
    plt.xlabel('Performance Threshold')
    plt.ylabel('Correlation Threshold')
    plt.savefig("performance_heatmap.png")
    plt.show()

# Assuming grid_performance_df is already created
plot_heatmap(grid_performance_df)

In [None]:
results_df