In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.model_selection import cross_val_score, KFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, StandardScaler, RobustScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import VarianceThreshold, SelectKBest, mutual_info_regression, SelectFromModel
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestRegressor

In [2]:
df = pd.read_csv("../data/raw/numerical.csv")

In [3]:
df.head()

Unnamed: 0,date,Tax Effect Of Unusual Items,Tax Rate For Calcs,Normalized EBITDA,Total Unusual Items,Total Unusual Items Excluding Goodwill,Net Income From Continuing Operation Net Minority Interest,Reconciled Depreciation,Reconciled Cost Of Revenue,EBITDA,...,militaryContract,nuclear,pesticides,palmOil,coal,tobacco,percentAumCovered,sustainRank,responsible,sustainScore
0,2024-12-31,0.0,0.237,1690800000.0,,,1150600000.0,175400000.0,4144100000.0,1690800000.0,...,False,False,False,False,False,False,,,,
1,2023-12-31,0.0,0.241,1710100000.0,,,1155000000.0,177300000.0,3992200000.0,1710100000.0,...,False,False,False,False,False,False,,,,
2,2022-12-31,0.0,0.245,1630900000.0,,,1086900000.0,176600000.0,3764800000.0,1630900000.0,...,False,False,False,False,False,False,,,,
3,2021-12-31,0.0,0.234,1388200000.0,,,925000000.0,170700000.0,3233700000.0,1388200000.0,...,False,False,False,False,False,False,,,,
4,2020-12-31,,,,1400000.0,1400000.0,,,,,...,False,False,False,False,False,False,,,,


In [4]:
front_columns = ['symbol', 'date', 'companySize', 'region', 'totalEsg', 'shortName', 'longName', 'marketCap', 'cumulativeMarketCapPercentage', 'ratingYear', 'ratingMonth']

all_columns = df.columns.tolist()
new_column_order = front_columns + [col for col in all_columns if col not in front_columns] 

# Reindex the DataFrame with the new column order
df = df[new_column_order]


In [5]:
df.head()

Unnamed: 0,symbol,date,companySize,region,totalEsg,shortName,longName,marketCap,cumulativeMarketCapPercentage,ratingYear,...,militaryContract,nuclear,pesticides,palmOil,coal,tobacco,percentAumCovered,sustainRank,responsible,sustainScore
0,FAST,2024-12-31,Mid-Cap,north_america,25.04,Fastenal Company,Fastenal Company,43679760000.0,70.076539,2025,...,False,False,False,False,False,False,,,,
1,FAST,2023-12-31,Mid-Cap,north_america,25.04,Fastenal Company,Fastenal Company,43679760000.0,70.076539,2025,...,False,False,False,False,False,False,,,,
2,FAST,2022-12-31,Mid-Cap,north_america,25.04,Fastenal Company,Fastenal Company,43679760000.0,70.076539,2025,...,False,False,False,False,False,False,,,,
3,FAST,2021-12-31,Mid-Cap,north_america,25.04,Fastenal Company,Fastenal Company,43679760000.0,70.076539,2025,...,False,False,False,False,False,False,,,,
4,FAST,2020-12-31,Mid-Cap,north_america,25.04,Fastenal Company,Fastenal Company,43679760000.0,70.076539,2025,...,False,False,False,False,False,False,,,,


In [6]:
df.shape

(4646, 392)

In [7]:
df['symbol'].nunique()

1012

In [8]:
financial_features = [
    'EBITDA',
    'EBIT',
    'Total Expenses',
    'Diluted EPS',
    'Basic EPS',
    'Net Income',
    'Operating Income',
    'Operating Expense',
    'Gross Profit',
    'Cost Of Revenue',
    'Total Revenue',
    'Total Debt',
    'Net Debt',
    'Working Capital',
    'Total Assets',
    'Stockholders Equity',
    'Total Expenses',
    'Operating Cash Flow',
    'Free Cash Flow',
    'Capital Expenditure',
    'Research And Development',
    'Common Stock Dividend Paid',
    'Ordinary Shares Number',
    'Current Assets',
    'Current Liabilities',
]

In [9]:
df.groupby('symbol')[financial_features].mean().reset_index()

Unnamed: 0,symbol,EBITDA,EBIT,Total Expenses,Diluted EPS,Basic EPS,Net Income,Operating Income,Operating Expense,Gross Profit,...,Stockholders Equity,Total Expenses.1,Operating Cash Flow,Free Cash Flow,Capital Expenditure,Research And Development,Common Stock Dividend Paid,Ordinary Shares Number,Current Assets,Current Liabilities
0,000270.KS,1.257615e+13,1.018991e+13,8.177509e+13,18107.50000,18107.50000,7.179952e+12,9.144556e+12,1.044240e+13,1.958696e+13,...,4.415798e+13,8.177509e+13,1.013844e+13,7.185500e+12,-2.952938e+12,1.381538e+12,,3.974759e+08,3.565410e+13,2.489798e+13
1,0005.HK,,,,0.66750,0.67000,1.924700e+10,,3.516700e+10,,...,1.865962e+11,,5.702075e+10,5.313200e+10,-3.888750e+09,,-1.066225e+10,1.944447e+10,,
2,000660.KS,2.079492e+13,9.253642e+12,3.585934e+13,2733.00000,2734.75000,6.696097e+12,9.503738e+12,7.210090e+12,1.671383e+13,...,6.346129e+13,3.585934e+13,1.699140e+13,2.737830e+12,-1.425357e+13,3.935340e+12,,6.881123e+08,2.891765e+13,1.893558e+13
3,005380.KS,1.880374e+13,1.409453e+13,1.379466e+14,35204.50000,35204.50000,9.198782e+12,1.146759e+13,1.839141e+13,2.985900e+13,...,8.973403e+13,1.379466e+14,3.176235e+11,-7.385793e+12,-7.703416e+12,1.957906e+12,,2.176674e+08,5.865108e+13,7.283625e+13
4,005490.KS,8.868070e+12,5.183215e+12,6.888582e+13,41762.50000,42833.25000,3.260918e+12,4.998186e+12,2.657448e+12,7.655635e+12,...,5.032426e+13,6.888582e+13,6.824890e+12,1.910905e+12,-4.913986e+12,1.485615e+11,-1.000980e+12,7.584161e+07,4.469990e+13,2.109632e+13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1007,ZED.JO,1.880000e+08,1.880000e+08,8.000000e+06,0.44675,0.48375,7.752500e+08,6.000000e+06,-1.500000e+06,8.500000e+06,...,5.415750e+09,8.000000e+06,3.675000e+07,3.675000e+07,,,-1.508000e+09,1.539077e+09,1.617250e+09,8.300000e+07
1008,ZEEL.BO,1.241600e+10,9.494000e+09,7.031175e+10,5.08500,5.08500,4.884750e+09,1.126500e+10,2.706125e+10,3.832625e+10,...,1.063802e+11,7.031175e+10,6.677500e+09,4.577000e+09,-2.100500e+09,,-1.393250e+09,9.605148e+08,1.104150e+11,2.413875e+10
1009,ZFSVF,,7.288000e+09,5.672250e+10,32.76000,33.03500,4.832750e+09,,,,...,2.628400e+10,5.672250e+10,5.789250e+09,5.314500e+09,-4.747500e+08,,-3.971000e+09,1.455248e+08,,
1010,ZTS.MX,3.505750e+09,3.030500e+09,5.366000e+09,4.82500,4.83750,2.245250e+09,3.048000e+09,2.856750e+09,5.904750e+09,...,4.678750e+09,5.366000e+09,2.357750e+09,1.745250e+09,-6.125000e+08,5.867500e+08,-6.407500e+08,4.608056e+08,6.691250e+09,2.566250e+09


In [10]:
# Create a latest_date feature to align different EOY financial dates
df['latest_date'] = df.groupby('symbol')['date'].transform('max')

In [12]:
df['latest_date'].value_counts()

latest_date
2024-12-31    2603
2024-03-31     925
2023-12-31     444
2024-06-30     180
2024-09-30     156
2025-01-31      74
2024-02-29      58
2024-08-31      44
2024-07-31      34
2024-10-31      33
2024-01-31      28
2024-05-31      25
2024-04-30      15
2023-03-31      11
2022-12-31      11
2024-11-30       5
Name: count, dtype: int64

In [18]:
df['latest_year'] = df['latest_date'].apply(lambda x: x.split('-')[0])

In [None]:
sentiments = pd.read_csv("../data/processed/news_sentiment.csv")
sentiments.head()

In [None]:
sentiments['symbol'].nunique()

In [None]:
filtered_df = latest_entries[latest_entries['symbol'].isin(sentiments['symbol'])]
filtered_df[financial_features].head()
filtered_df.columns.tolist()

## Data Cleaning

### Assessing Missingness Patterns

In [19]:
df.shape

(4646, 394)

In [20]:
# Drop features with more than 70% missing values, as they offer too little information to be imputed
pct_null = df.isnull().mean()
df = df.drop(columns=pct_null[pct_null > 0.7].index.tolist())
df.shape

(4646, 233)

### Assessing constant features

Features with zero or very low variance provide little information and can be removed.

In [21]:
# Calculate variance for each numerical column
feature_variance = df.var(numeric_only=True)
feature_variance.sort_values(ascending=True).head(10)

maxAge                             0.000000
Tax Rate For Calcs                 0.006051
ratingYear                         0.513186
highestControversy                 1.181631
ratingMonth                        4.228918
governanceScore                    8.557575
socialScore                       12.425297
environmentScore                  33.442564
totalEsg                          69.034408
cumulativeMarketCapPercentage    368.737453
dtype: float64

In [22]:
vt = VarianceThreshold(threshold=0.1)
vt.fit(df.select_dtypes(include=[np.number]).fillna(0))
constant_feats = df.select_dtypes(include=[np.number]).columns[~vt.get_support()]
constant_feats

Index(['Tax Rate For Calcs', 'maxAge'], dtype='object')

In [23]:
df = df.drop(columns=constant_feats)

In [24]:
df.shape

(4646, 231)

### Missing Value Imputation

Step 1: Company‑level ffill/bfill (and/or rolling)

Step 2: Peer‑group imputation (region + company size + year)

Step 3: Global/statistical imputer (median) for any stragglers

In [25]:
# defining columns that should be imputed
df_sorted = df.sort_values(by=['symbol', 'ratingYear'], ascending=True)
meta_numeric = ['date', 'latest_date', 'latest_year', 'ratingYear', 'ratingMonth', 'totalEsg', 'environmentScore', 'socialScore', 'governanceScore']

cols_to_impute = df_sorted.select_dtypes(include=np.number).columns.tolist()
# remove identifiers/targets if they are numeric and shouldn't be imputed this way
cols_to_impute = [col for col in cols_to_impute if col not in meta_numeric]

In [None]:
# Step 1: company-level historical imputation
df_filled = df_sorted.copy()
df_filled[cols_to_impute] = df_filled.groupby('symbol')[cols_to_impute].ffill()
df_filled[cols_to_impute] = df_filled.groupby('symbol')[cols_to_impute].bfill()

In [27]:
len(df_filled[cols_to_impute].columns[df_filled[cols_to_impute].isnull().any()].tolist())

161

In [None]:
# Step 2
cols_to_impute_final = [
    col for col in df_filled.select_dtypes(include=np.number).columns.tolist()
    if col not in meta_numeric
    and df_filled[col].isnull().any() # Only consider columns with NaNs remaining
]

In [34]:
df_filled.groupby(['region', 'companySize', 'latest_year']).count().head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,symbol,date,totalEsg,shortName,longName,marketCap,cumulativeMarketCapPercentage,ratingYear,ratingMonth,Tax Effect Of Unusual Items,...,furLeather,gambling,gmo,militaryContract,nuclear,pesticides,palmOil,coal,tobacco,latest_date
region,companySize,latest_year,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
east_asia_n_pacific,Large-Cap,2022,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
east_asia_n_pacific,Large-Cap,2023,48,48,48,48,48,48,48,48,48,48,...,48,48,48,48,48,48,48,48,48,48
east_asia_n_pacific,Large-Cap,2024,204,204,204,204,204,204,204,204,204,204,...,204,204,204,204,204,204,204,204,204,204
east_asia_n_pacific,Mid-Cap,2023,104,104,104,104,104,104,104,104,104,104,...,104,104,104,104,104,104,104,104,104,104
east_asia_n_pacific,Mid-Cap,2024,332,332,332,332,332,332,332,332,332,332,...,332,332,332,332,332,332,332,332,332,332


In [35]:
for col in cols_to_impute_final:
    # Use transform with median. It aligns results back to the original DataFrame index.
    df_filled[col] = df_filled.groupby(['region', 'companySize', 'latest_year'])[col].transform(lambda x: x.fillna(x.median()))

In [38]:
remaining_nan_counts = df_filled[cols_to_impute_final].isnull().sum()
cols_still_nan = remaining_nan_counts[remaining_nan_counts > 0].index.tolist()

In [42]:
remaining_nan_counts[remaining_nan_counts > 0].sort_values(ascending=True).head(10)

Other Properties                        2
Net Business Purchase And Sale          2
Net Investment Purchase And Sale        2
Goodwill And Other Intangible Assets    3
Operating Cash Flow                     3
Other Non Cash Items                    3
Change In Payable                       3
Change In Working Capital               3
Depreciation And Amortization           3
Other Intangible Assets                 3
dtype: int64

In [43]:
# Step 3
for col in cols_still_nan:
    global_median_val = df_filled[col].median()
    df_filled[col] = df_filled[col].fillna(global_median_val)

    df_filled[cols_to_impute_final].isnull().sum().sort_values(ascending=True).head(10)

In [47]:
df_filled.sort_index()

Unnamed: 0,symbol,date,companySize,region,totalEsg,shortName,longName,marketCap,cumulativeMarketCapPercentage,ratingYear,...,gambling,gmo,militaryContract,nuclear,pesticides,palmOil,coal,tobacco,latest_date,latest_year
0,FAST,2024-12-31,Mid-Cap,north_america,25.04,Fastenal Company,Fastenal Company,4.367976e+10,70.076539,2025,...,False,False,False,False,False,False,False,False,2024-12-31,2024
1,FAST,2023-12-31,Mid-Cap,north_america,25.04,Fastenal Company,Fastenal Company,4.367976e+10,70.076539,2025,...,False,False,False,False,False,False,False,False,2024-12-31,2024
2,FAST,2022-12-31,Mid-Cap,north_america,25.04,Fastenal Company,Fastenal Company,4.367976e+10,70.076539,2025,...,False,False,False,False,False,False,False,False,2024-12-31,2024
3,FAST,2021-12-31,Mid-Cap,north_america,25.04,Fastenal Company,Fastenal Company,4.367976e+10,70.076539,2025,...,False,False,False,False,False,False,False,False,2024-12-31,2024
4,FAST,2020-12-31,Mid-Cap,north_america,25.04,Fastenal Company,Fastenal Company,4.367976e+10,70.076539,2025,...,False,False,False,False,False,False,False,False,2024-12-31,2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4641,8411.T,2024-03-31,Mid-Cap,east_asia_n_pacific,24.36,MIZUHO FINANCIAL GROUP,"Mizuho Financial Group, Inc.",1.064007e+13,79.513237,2025,...,False,False,False,False,False,False,False,False,2024-03-31,2024
4642,8411.T,2023-03-31,Mid-Cap,east_asia_n_pacific,24.36,MIZUHO FINANCIAL GROUP,"Mizuho Financial Group, Inc.",1.064007e+13,79.513237,2025,...,False,False,False,False,False,False,False,False,2024-03-31,2024
4643,8411.T,2022-03-31,Mid-Cap,east_asia_n_pacific,24.36,MIZUHO FINANCIAL GROUP,"Mizuho Financial Group, Inc.",1.064007e+13,79.513237,2025,...,False,False,False,False,False,False,False,False,2024-03-31,2024
4644,8411.T,2021-03-31,Mid-Cap,east_asia_n_pacific,24.36,MIZUHO FINANCIAL GROUP,"Mizuho Financial Group, Inc.",1.064007e+13,79.513237,2025,...,False,False,False,False,False,False,False,False,2024-03-31,2024


In [48]:
df = df_filled

## Feature Engineering

### Feature Creation

Creating different financial ratios

In [None]:
# Defining the ratios to be calculated and their required features
ratios_1 = {
    'profit_margin': (['Net Income From Continuing And Discontinued Operation'], 'Operating Revenue'),
    'roa': (['Net Income From Continuing And Discontinued Operation'], 'Total Assets'),
    'roe': (['Net Income From Continuing And Discontinued Operation'], 'Total Equity Gross Minority Interest'),
    'debt_to_equity': (['Total Debt'], 'Total Equity Gross Minority Interest'),
    'interest_coverage': (['EBIT'], 'Interest Expense'),
    'current_ratio': (['Current Assets'], 'Current Liabilities'),
    'quick_ratio': (['Current Assets', 'Inventory'], 'Current Liabilities'),
}

ratios_2 = {
    'sales_to_assets': ['Total Revenue', 'Total Assets'],
    'EBIT_to_sales': ['EBIT', 'Total Revenue'],
    'dividend_yield': ['Cash Dividends Paid', 'marketCap'],
    'net_income_to_sales': ['Net Income', 'Total Revenue'],
    'price_to_earnings': ['marketCap', 'Ordinary Shares Number', 'Diluted EPS'],
    'liquidity_ratio': ['Current Assets', 'Current Liabilities'],
    'solvency_ratio': ['Total Debt', 'Total Assets']
}

In [None]:
# Filter the data to include only the latest entries for each symbol
df_latest = df[df['date'] == df['latest_date']]
df_latest = df_latest.drop(columns=['latest_date'])

In [None]:
# Create the first set of financial ratios
for name, (num_cols, den_col) in ratios_1.items():
    if den_col in df_latest.columns and all(c in df_latest.columns for c in num_cols):
        print(f"Calculating {name}...")
        numerator = df_latest[num_cols].sum(axis=1)
        df_latest[name] = numerator / df_latest[den_col]

  df[name] = numerator / df[den_col]
  df[name] = numerator / df[den_col]
  df[name] = numerator / df[den_col]
  df[name] = numerator / df[den_col]
  df[name] = numerator / df[den_col]


In [51]:
df

Unnamed: 0,symbol,date,companySize,region,totalEsg,shortName,longName,marketCap,cumulativeMarketCapPercentage,ratingYear,...,palmOil,coal,tobacco,latest_date,latest_year,profit_margin,roa,interest_coverage,current_ratio,quick_ratio
4570,000270.KS,2024-12-31,Large-Cap,east_asia_n_pacific,24.55,KIA CORP.,Kia Corporation,3.772784e+13,62.003060,2023,...,False,False,False,2024-12-31,2024,0.090955,0.105362,134.021323,1.549352,2.009704
4571,000270.KS,2023-12-31,Large-Cap,east_asia_n_pacific,24.55,KIA CORP.,Kia Corporation,3.772784e+13,62.003060,2023,...,False,False,False,2024-12-31,2024,0.087938,0.108858,70.675644,1.459303,1.898379
4572,000270.KS,2022-12-31,Large-Cap,east_asia_n_pacific,24.55,KIA CORP.,Kia Corporation,3.772784e+13,62.003060,2023,...,False,False,False,2024-12-31,2024,0.062494,0.073387,33.199744,1.345552,1.704284
4573,000270.KS,2021-12-31,Large-Cap,east_asia_n_pacific,24.55,KIA CORP.,Kia Corporation,3.772784e+13,62.003060,2023,...,False,False,False,2024-12-31,2024,0.068140,0.071211,38.633721,1.354449,1.683151
3894,0005.HK,2024-12-31,Small-Cap,east_asia_n_pacific,24.22,HSBC HOLDINGS,HSBC HOLDINGS,1.586861e+12,91.787839,2025,...,False,False,False,2024-12-31,2024,0.354687,0.007948,1.363844,1.653200,1.940697
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
329,ZYDUSLIFE.NS,2024-03-31,Large-Cap,south_asia,38.30,ZYDUS LIFESCIENCES LTD,Zydus Lifesciences Limited,8.919223e+11,62.998521,2023,...,False,False,False,2024-03-31,2024,0.202902,0.131810,110.293182,2.153054,2.797375
330,ZYDUSLIFE.NS,2023-03-31,Large-Cap,south_asia,38.30,ZYDUS LIFESCIENCES LTD,Zydus Lifesciences Limited,8.919223e+11,62.998521,2023,...,False,False,False,2024-03-31,2024,0.116147,0.076109,21.635060,1.811055,2.428210
331,ZYDUSLIFE.NS,2022-03-31,Large-Cap,south_asia,38.30,ZYDUS LIFESCIENCES LTD,Zydus Lifesciences Limited,8.919223e+11,62.998521,2023,...,False,False,False,2024-03-31,2024,0.302632,0.161440,25.808566,1.564074,2.038523
332,ZYDUSLIFE.NS,2021-03-31,Large-Cap,south_asia,38.30,ZYDUS LIFESCIENCES LTD,Zydus Lifesciences Limited,8.919223e+11,62.998521,2023,...,False,False,False,2024-03-31,2024,0.151051,0.089329,16.389352,1.108722,1.520385
