In [24]:
import pandas as pd
from datetime import timedelta

In [25]:
# Load the filtered DataFrame from the CSV file
df = pd.read_csv('data/filtered_data_2.csv')

# Display the first few rows of the DataFrame to verify it loaded correctly
print(df.head())

         date                    Company Name      Symbol  \
0  2015-11-16  speciality restaurants limited  SPECIALITY   
1  2015-08-13  speciality restaurants limited  SPECIALITY   
2  2024-09-14           eicher motors limited   EICHERMOT   
3  2024-08-09           eicher motors limited   EICHERMOT   
4  2024-08-09           eicher motors limited   EICHERMOT   

                         author         Sector  expected_return  actual_return  
0                         Karvy     Technology         9.432421      -0.018033  
1           Reliance Securities     Technology         7.189542       0.018065  
2  Consensus Share Price Target  Manufacturing              NaN            NaN  
3                   Axis Direct  Manufacturing         4.748892            NaN  
4           Prabhudas Lilladhar  Manufacturing        10.441767            NaN  


In [26]:
# Helper function to get top authors by positive correlation
def get_top_authors_by_sector(df, start_date, end_date, top_n=10):
    # Filter data for the period
    period_data = df[(df['date'] >= start_date) & (df['date'] <= end_date)]

    # Calculate correlation for each author-sector pair within this period
    correlations = (
        period_data.groupby(['Sector', 'author'])[['expected_return', 'actual_return']]
        .corr()
        .iloc[0::2, 1]  # Gets correlation between expected_return and actual_return
        .reset_index(name='correlation')
    )

    # Filter for positive correlations only
    positive_correlations = correlations[correlations['correlation'] > 0]

    # Get top N authors per sector with highest positive correlations
    top_authors_per_sector = (
        positive_correlations.sort_values(by=['Sector', 'correlation'], ascending=[True, False])
        .groupby('Sector')
        .head(top_n)
    )
    
    return top_authors_per_sector[['Sector', 'author', 'correlation']]

In [27]:
# Step 2: Define the function to evaluate monthly predictions
def evaluate_monthly_performance(df, start_date, end_date, prediction_month):
    # Get top authors per sector for the historical period
    top_authors = get_top_authors_by_sector(df, start_date, end_date)

    # Filter data for predictions in the target month
    prediction_data = df[(df['date'].dt.to_period('M') == prediction_month)]

    # Calculate performance for each (author, sector) pair in the top authors list
    performance_data = []
    for _, row in prediction_data.iterrows():
        sector = row['Sector']
        author = row['author']

        # Check if author is in the top authors list for the sector
        if not ((top_authors['Sector'] == sector) & (top_authors['author'] == author)).any():
            continue  # Skip if the author-sector pair is not in the top authors list

        # Calculate performance (signed direction of expected_return * actual_return)
        performance = (row['expected_return'] * row['actual_return'])
        performance_data.append({
            'date': row['date'],
            'Sector': sector,
            'author': author,
            'performance': performance
        })

    return pd.DataFrame(performance_data)

In [28]:
# Step 3: Loop through each month and calculate performance
performance_results = []
df['date'] = pd.to_datetime(df['date'])
min_date = df['date'].min()
max_date = df['date'].max()

In [29]:
from tqdm import tqdm

# Calculate total number of months to iterate
total_months = ((max_date - (min_date + timedelta(days=365))).days) // 30

# Initialize current_date
current_date = min_date + timedelta(days=365)

# Loop from the earliest date (shifted by one year) to the last available month
for _ in tqdm(range(total_months), desc="Processing Months"):
    # Define the historical period (last 12 months)
    start_date = current_date - timedelta(days=365)
    end_date = current_date - timedelta(days=1)
    
    # Format current month for prediction data
    prediction_month = current_date.to_period('M')
    
    # Evaluate performance for this month
    monthly_performance = evaluate_monthly_performance(df, start_date, end_date, prediction_month)
    performance_results.append(monthly_performance)
    
    # Move to the next month
    current_date += timedelta(days=30)  # Approximate month increment


Processing Months: 100%|█████████████████████████████████████████████████████████████| 171/171 [00:33<00:00,  5.18it/s]


In [31]:
# Concatenate all results into a final DataFrame
final_performance_df = pd.concat(performance_results, ignore_index=True)

# Display or save results
print("Performance of analysts' predictions by (author, sector) for each month:")
print(final_performance_df)

Performance of analysts' predictions by (author, sector) for each month:
            date          Sector                    author  performance
0     2011-07-26         Finance       Chola Wealth Direct    -0.887305
1     2011-07-30         Finance       Chola Wealth Direct          NaN
2     2011-08-04         Finance       Chola Wealth Direct          NaN
3     2011-08-04         Finance       Chola Wealth Direct          NaN
4     2011-10-19         Finance       Chola Wealth Direct     1.842394
...          ...             ...                       ...          ...
13461 2024-07-18         Finance               Axis Direct    -0.189415
13462 2024-07-18         Finance  BOB Capital Markets Ltd.          NaN
13463 2024-07-18         Finance              IDBI Capital     0.155093
13464 2024-07-25  Consumer Goods               Anand Rathi     0.767967
13465 2024-07-25  Consumer Goods  ICICI Securities Limited     0.557215

[13466 rows x 4 columns]


In [33]:
# Remove rows with NaN values
cleaned_results = final_performance_df.dropna()

# Print cleaned results
print("Results without NaN values:")
print(cleaned_results)

Results without NaN values:
            date          Sector                    author  performance
0     2011-07-26         Finance       Chola Wealth Direct    -0.887305
4     2011-10-19         Finance       Chola Wealth Direct     1.842394
6     2012-01-24         Finance       Chola Wealth Direct     0.976929
9     2012-05-02         Finance       Chola Wealth Direct    -0.505296
11    2012-05-09         Finance       Chola Wealth Direct     0.933437
...          ...             ...                       ...          ...
13460 2024-07-18         Finance             Motilal Oswal    -0.683916
13461 2024-07-18         Finance               Axis Direct    -0.189415
13463 2024-07-18         Finance              IDBI Capital     0.155093
13464 2024-07-25  Consumer Goods               Anand Rathi     0.767967
13465 2024-07-25  Consumer Goods  ICICI Securities Limited     0.557215

[12048 rows x 4 columns]


In [34]:
# Check rows with NaN values
nan_rows = final_performance_df[final_performance_df.isna().any(axis=1)]
print("Rows with NaN values:")
print(nan_rows)

# Count NaN occurrences in each column
nan_counts = final_performance_df.isna().sum()
print("NaN counts per column:")
print(nan_counts)

Rows with NaN values:
            date      Sector                    author  performance
1     2011-07-30     Finance       Chola Wealth Direct          NaN
2     2011-08-04     Finance       Chola Wealth Direct          NaN
3     2011-08-04     Finance       Chola Wealth Direct          NaN
5     2011-11-18     Finance       Chola Wealth Direct          NaN
7     2012-02-01     Finance       Chola Wealth Direct          NaN
...          ...         ...                       ...          ...
13434 2024-07-02  Healthcare            SBI Securities          NaN
13435 2024-07-30  Technology       Prabhudas Lilladhar          NaN
13436 2024-07-30  Technology              ICICI Direct          NaN
13443 2024-07-31  Healthcare              ICICI Direct          NaN
13462 2024-07-18     Finance  BOB Capital Markets Ltd.          NaN

[1418 rows x 4 columns]
NaN counts per column:
date              0
Sector            0
author            0
performance    1418
dtype: int64
