In [130]:
import numpy as np
import pandas as pd
import math
import os
import re
from openpyxl import load_workbook

from sklearn.impute import KNNImputer

In [131]:
# df1 = pd.read_excel("data/Healthcare.xlsx", sheet_name="TT")
# df2 = pd.read_excel("data/Healthcare.xlsx", sheet_name="TL")

# df2.drop(columns=['Stock Name', 'BSE code', 'ISIN', 'Current Price', 'Industry Name'], inplace=True)
# df2.rename(columns={'NSE code': 'Ticker'}, inplace=True)

# df = pd.merge(df1, df2, on='Ticker', how='inner')

df = pd.read_excel('Sector_Universe_Data.xlsx', sheet_name='Energy')

In [132]:
def categorize_market_cap(row):
    if row['Market Cap'] > 82000:
        return 'Large'
    elif 26000 < row['Market Cap'] <= 82000:
        return 'Medium'
    else:
        return 'Small'

df['Market Cap'] = df.apply(categorize_market_cap, axis=1)

## Data Cleaning

In [133]:
df.isna().sum().sort_values(ascending=False)

Insider Trades - 3M Cumulative    33
1Y Forward EPS Growth             27
1Y Forward Revenue Growth         27
Dividend Yield                    21
Bulk Deals - 3M Cumulative        19
EV / Free Cash Flow               16
Price / CFO                        6
Cash Conversion Cycle              5
EV/EBITDA Ratio                    5
Inventory Turnover Ratio           4
Return on Equity                   3
Interest Coverage Ratio            3
Return on Investment               2
EV / Invested Capital              2
Altman Zscore                      2
1Y Historical Revenue Growth       1
Promoter Holding Change – 3M       1
MF Holding Change – 3M             1
FII Holding Change – 3M            1
DII Holding Change – 3M            1
1Y Historical EPS Growth           0
Piotroski Score                    0
Name                               0
EV / Revenue Ratio                 0
Ticker                             0
PB Ratio                           0
PE Ratio                           0
C

In [134]:
df.columns = df.columns.str.strip()

In [135]:
def clean_column_name(column_name):
    # Keep only letters, numbers, spaces, and '/'
    cleaned_name = re.sub(r'[^a-zA-Z0-9 / -]', '', column_name)
    return cleaned_name

# Applying the cleaning function to column names
df.columns = [clean_column_name(col) for col in df.columns]

In [136]:
columns_to_fill_zero = [
    'Insider Trades - 3M Cumulative',
    'Bulk Deals - 3M Cumulative',
    'FII Holding Change3M',
    'DII Holding Change3M',
    'MF Holding Change3M',
    'Promoter Holding Change3M',
    'Dividend Yield'
]

df[columns_to_fill_zero].fillna(0, inplace=True)

for col in df.columns:
    if df[col].isna().sum() >= 0.25 * len(df):
        df[col].fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[columns_to_fill_zero].fillna(0, inplace=True)


In [137]:
# Segmenting the DataFrame
small_df = df[df['Market Cap'] == 'Small']
medium_df = df[df['Market Cap'] == 'Medium']
large_df = df[df['Market Cap'] == 'Large']


def apply_knn_imputation(segment_df):
    # excluding non-numeric columns
    numeric_cols = segment_df.select_dtypes(include=[np.number]).columns
    
    imputer = KNNImputer(n_neighbors=5)
    segment_df[numeric_cols] = imputer.fit_transform(segment_df[numeric_cols])
    
    return segment_df

# Apply KNN Imputation to each segment
small_df_imputed = apply_knn_imputation(small_df)
medium_df_imputed = apply_knn_imputation(medium_df)
large_df_imputed = apply_knn_imputation(large_df)

df = pd.concat([small_df_imputed, medium_df_imputed, large_df_imputed])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  segment_df[numeric_cols] = imputer.fit_transform(segment_df[numeric_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  segment_df[numeric_cols] = imputer.fit_transform(segment_df[numeric_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  segment_df[numeric_cols] = imputer.fit_transform(segmen

In [104]:
def create_inverse(df, column):
    
    # Calculate inverse of specified column, handling zeros
    inverse_column = 1 / df[column].replace(0, np.nan)
    max_value = inverse_column.max()
    inverse_column = inverse_column.apply(lambda x: abs(x) + max_value if x < 0 else x)
    inverse_column.fillna(max_value + 1, inplace=True)
    
    df[column] = inverse_column
    return df


## Feature Engineering

In [105]:
df['Altman Zscore'] = df['Altman Zscore'] - 3

df['Fundamentals'] = df['Piotroski Score'] * df['Altman Zscore']

df.drop(columns=['Piotroski Score', 'Altman Zscore'], inplace=True)

In [106]:
investor_cols = ['FII Holding Change3M', 'DII Holding Change3M', 'MF Holding Change3M', 'Promoter Holding Change3M', 'Insider Trades - 3M Cumulative', 'Bulk Deals - 3M Cumulative']

df['Investor Sentiment'] = df[investor_cols].mean(axis=1)

insider_cols = ['Insider Trades - 3M Cumulative', 'Promoter Holding Change3M']

df['Promoter Sentiment'] = df[insider_cols].mean(axis=1)

# drop the original columns
df.drop(columns=investor_cols+insider_cols, inplace=True)

In [107]:
price_cols = ['Price / Sales', 'Price / CFO']

df['Pricing'] = df[price_cols].mean(axis=1)*df['PE Ratio']*df['PB Ratio']

# drop the original columns
df.drop(columns=price_cols+['PE Ratio', 'PB Ratio'], inplace=True)

In [108]:
valuation_cols = ['EV / Invested Capital','EV / Revenue Ratio', 'EV / Free Cash Flow', 'EV/EBITDA Ratio']

df['Valuation'] = df[valuation_cols].mean(axis=1)

# drop the original columns
df.drop(columns=valuation_cols, inplace=True)

In [109]:
# Assuming 'df' is your DataFrame
profitability_cols = ['Return on Investment', 'ROCE', 'Return on Equity', 'Return on Assets', 'Dividend Yield']

# Create a new column 'Profitability' as the mean of the selected columns
df['Profitability'] = df[profitability_cols].mean(axis=1)*df['Net Profit Margin']

# drop the original columns
df.drop(columns=profitability_cols+['Net Profit Margin'], inplace=True)

In [110]:
df = create_inverse(df, 'Cash Conversion Cycle')

turnover_cols = ['Interest Coverage Ratio', 'Asset Turnover Ratio', 'Inventory Turnover Ratio', 'Working Capital Turnover Ratio']

# Create a new column 'Financial Health' as the mean of the selected columns
df['Business Turnover'] = df[turnover_cols].mean(axis=1)*df['Current Ratio']*df['Cash Conversion Cycle']

# drop the original columns
df.drop(columns=turnover_cols +['Current Ratio', 'Cash Conversion Cycle'], inplace=True)

In [111]:
future_eps = df['1Y Forward EPS Growth'] - df['1Y Historical EPS Growth']
future_rev = df['1Y Forward Revenue Growth'] - df['1Y Historical Revenue Growth']

df['Future Growth'] = future_eps + future_rev

df.drop(columns=['1Y Forward EPS Growth', '1Y Historical EPS Growth', '1Y Forward Revenue Growth', '1Y Historical Revenue Growth'], inplace=True)

In [112]:
# Define weights for each rank
weights = {
    'Rank1': 0.2,
    'Rank2': 0.15,
    'Rank3': 0.2,
    'Rank4': 0.1,
    'Rank5': 0.1,
    'Rank6': 0.05,
    'Rank7': 0.1,
    'Rank8': 0.1
}

# Group by 'Market Cap' categories
grouped = df.groupby('Market Cap')

# Initialize an empty list to store the results
result_dfs = []

# Iterate over each group
for name, group in grouped:
    # Calculate ranks within each group
    group['Rank1'] = group['Fundamentals'].rank(method='min', ascending=False)
    group['Rank2'] = group['Investor Sentiment'].rank(method='min', ascending=False)
    group['Rank3'] = group['Promoter Sentiment'].rank(method='min', ascending=False)
    group['Rank4'] = group['Pricing'].rank(method='min', ascending=True)
    group['Rank5'] = group['Valuation'].rank(method='min', ascending=True)
    group['Rank6'] = group['Profitability'].rank(method='min', ascending=False)
    group['Rank7'] = group['Business Turnover'].rank(method='min', ascending=False)
    group['Rank8'] = group['Future Growth'].rank(method='min', ascending=False)
    
    # Calculate weighted rank
    group['Weighted_rank'] = group[list(weights.keys())].mul(list(weights.values())).sum(axis=1)
    
    # Calculate final rank
    group['Rank'] = group['Weighted_rank'].rank(method='min', ascending=True)
    
    # Reorder columns
    columns_order = ['Rank', 'Name', 'Ticker', 'Sub-Sector', 'Market Cap', 'Fundamentals', 'Investor Sentiment', 'Promoter Sentiment', 'Pricing', 'Valuation', 'Profitability', 'Business Turnover', 'Future Growth']
    group = group[columns_order]
    
    # Append the result to the list
    result_dfs.append(group)

# Concatenate the results
df = pd.concat(result_dfs)

## Final Model

In [117]:
df = df.sort_values(by='Rank', ascending=True)
df

Unnamed: 0,Rank,Name,Ticker,Sub-Sector,Market Cap,Fundamentals,Investor Sentiment,Promoter Sentiment,Pricing,Valuation,Profitability,Business Turnover,Future Growth
1,1.0,DLF Ltd,DLF,Real Estate,Large,44.0,0.042063,0.0,37684.312833,59.790962,138.01727,0.000985,7.709333
2,2.0,Macrotech Developers Ltd,LODHA,Real Estate,Large,3.5,-0.093486,-1.128313,57654.480257,45.890393,16.021454,0.000651,442.001503


In [94]:
# File path to your Excel file
excel_file_path = 'Sector Analysis.xlsx'

# Load the existing Excel workbook
wb = load_workbook(excel_file_path)

# Create a new sheet called 'Healthcare'
ws = wb.create_sheet(title='Healthcare')

# Group the data by Market Cap and save them in different dataframes
grouped = df.groupby('Market Cap')

# Initialize row counter
row_counter = 1

# Loop through each group
for name, group in grouped:
    # Write the headers starting from the fourth row
    ws.cell(row=row_counter + 1, column=1)
    ws.append(group.columns.tolist())  # Write headers
    
    # Write the data
    for row in group.itertuples(index=False):
        ws.append(list(row))
    
    # Add space of 4 rows between each dataframe
    row_counter = ws.max_row + 1

# Save the workbook
wb.save(excel_file_path)

print("Data has been saved to the 'Healthcare' sheet in the Excel file.")

Data has been saved to the 'Healthcare' sheet in the Excel file.
