In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Global plot styling: dark background, GnBu_d palette, no top/right spines.
# The spines are switched off through the style's rc overrides because a bare
# `sns.despine()` only acts on the current figure: called here, before anything
# is drawn, it just creates an empty figure and leaves later plots untouched.
sns.set_style("dark", {"axes.spines.top": False, "axes.spines.right": False})
sns.set_palette("GnBu_d")

In [None]:
# Load the processed numerical dataset; the relative path is resolved against
# the kernel's current working directory.
df = pd.read_csv("../data/processed/numerical.csv")

In [None]:
# (number of rows, number of columns) of the loaded frame
df.shape

In [None]:
# All column names as a plain Python list.
list(df.columns)

In [None]:
# Columns with no missing values at all (every entry is non-null).
df.columns[df.notna().all()]

Basically all of the financial features have missing values somewhere, which needs to be analyzed further.

In [None]:
# Names of the columns that contain at least one missing value.
has_missing = df.isna().any(axis=0)
null_columns = df.columns[has_missing]

In [None]:
# Number of missing entries in each incomplete column.
df.isna().sum().loc[null_columns]

In [None]:
# Fraction of missing values per incomplete feature, largest first.
missing_proportions = (
    df[null_columns]
    .isna()
    .mean()
    .sort_values(ascending=False)
)
missing_proportions

In [None]:
# How many features are missing in more than 70% of the rows.
int((missing_proportions > 0.7).sum())

In [None]:
# Drop every feature whose missing-value fraction exceeds 50%.
over_half_missing = missing_proportions.loc[lambda s: s > 0.5].index
df_dropped = df.drop(columns=over_half_missing)

In [None]:
# Features with at most 5% missing values.
missing_proportions.loc[lambda s: s <= 0.05].index

In [None]:
# Non-null entry count of every remaining column, per 'Ticker' group.
df_dropped.groupby('Ticker').count()

In [None]:
# Display the frame after dropping the mostly-missing features.
df_dropped

In [None]:
# Backward-fill the remaining missing values (temporary band-aid for all
# features; could become permanent for some). `DataFrame.bfill` replaces
# `fillna(method='bfill')`, whose `method` argument is deprecated in pandas.
# NOTE(review): the fill runs down the whole frame, so where rows of different
# tickers are adjacent a gap can be filled from another ticker's row — consider
# filling within `groupby('Ticker')` if that is not intended. TODO confirm.
df_dropped = df_dropped.bfill(axis=0)

In [None]:
# NaN counts for the originally-incomplete columns that survived the drop,
# showing only the columns that still have at least one NaN.
surviving_null_columns = null_columns.intersection(df_dropped.columns)
remaining_nulls = df_dropped[surviving_null_columns].isna().sum()
remaining_nulls[remaining_nulls > 0]

In [None]:
# Preview the first five rows after the fill.
df_dropped.iloc[:5]

# Visualizations

In [None]:
# Number of rows recorded for each value of 'Date'.
date_counts = df.groupby('Date').size().reset_index(name='count')

# One palette shade per bar; hue mirrors x so each bar gets its own colour.
colors = sns.color_palette("GnBu_d", len(date_counts))

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    data=date_counts,
    x='Date',
    y='count',
    hue='Date',
    palette=colors,
    legend=False,
    ax=ax,
)

# Rotate the tick labels so they stay readable.
plt.setp(ax.get_xticklabels(), rotation=90)
ax.set_xlabel("Date")
ax.set_ylabel("Data Points")
ax.set_title("Data Count Per Record Date")
fig.tight_layout()
plt.show()

In [None]:
# Number of rows for each value of 'ratingYear'.
# NOTE: this rebinds `date_counts` and `colors` to the per-year data.
date_counts = df.groupby('ratingYear').size().reset_index(name='count')

# One palette shade per bar; hue mirrors x so each bar gets its own colour.
colors = sns.color_palette("GnBu_d", len(date_counts))

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    data=date_counts,
    x='ratingYear',
    y='count',
    hue='ratingYear',
    palette=colors,
    legend=False,
    ax=ax,
)

# Rotate the tick labels so they stay readable.
plt.setp(ax.get_xticklabels(), rotation=90)
ax.set_xlabel("ratingYear")
ax.set_ylabel("Data Points")
ax.set_title("Data Count Per ESG Rating Year")
fig.tight_layout()
plt.show()

In [None]:
# All column names as a plain Python list.
list(df.columns)

In [None]:
# Important financial features, hand-picked based on domain knowledge.
# Each name appears exactly once: a repeated entry would be plotted twice in
# per-feature loops and would produce a duplicate column in
# `df[financial_features]`.
financial_features = [
    'EBITDA',
    'EBIT',
    'Total Expenses',
    'Diluted EPS',
    'Basic EPS',
    'Net Income',
    'Operating Income',
    'Operating Expense',
    'Gross Profit',
    'Cost Of Revenue',
    'Total Revenue',
    'Total Debt',
    'Net Debt',
    'Working Capital',
    'Total Assets',
    'Stockholders Equity',
    'Operating Cash Flow',
    'Free Cash Flow',
    'Capital Expenditure',
    'Research And Development',
    'Common Stock Dividend Paid',
    'Ordinary Shares Number',
    'Current Assets',
    'Current Liabilities',
]

In [None]:
# Identifier and ESG-score columns.
# NOTE(review): "environtmentScore" looks like a misspelling of
# "environmentScore" — verify against the CSV's column names before using it.
base_features = [
    "Ticker",
    "Date",
    "totalEsg",
    "ratingDate",
    "environtmentScore",
    "socialScore",
    "governanceScore",
]

# EDA

## Feature Characteristics

### Checking data types & distributions

In [None]:
# How many columns there are of each dtype.
df.dtypes.value_counts()

In [None]:
# Names of the object-dtype columns.
object_frame = df.select_dtypes(include="object")
categorical_features = object_frame.columns

In [None]:
# Show the object-dtype column names collected above.
categorical_features

In [None]:
# dtype of each selected financial feature.
df.dtypes.loc[financial_features]

In [None]:
# Histogram (with KDE) of the first financial feature.
feature = financial_features[0]
print(f"Feature: {feature}")
sns.histplot(df[feature], kde=True)
plt.title(f"Distribution of {feature}")

# Clip the view (not the data) to hide extreme values. 0 stays visible for
# non-negative features, but the lower limit follows the 10th percentile when
# it is negative: a fixed lower limit of 0 would hide negative values and would
# invert the axis whenever the 90th percentile itself is negative.
lower, upper = df[feature].quantile([0.10, 0.90])
lower = min(0, lower)
if lower < upper:  # also False for NaN quantiles (no data), which xlim rejects
    plt.xlim(lower, upper)
plt.show()

In [None]:
# Histogram (with KDE) of every selected financial feature, one figure each.
for feature in financial_features:
    print(f"\nFeature: {feature}")
    sns.histplot(df[feature], kde=True)
    plt.title(f"Distribution of {feature}")

    # Clip the view (not the data) to hide extreme values. 0 stays visible for
    # non-negative features, but the lower limit follows the 10th percentile
    # when it is negative: a fixed lower limit of 0 would hide negative values
    # and would invert the axis whenever the 90th percentile is negative.
    lower, upper = df[feature].quantile([0.10, 0.90])
    lower = min(0, lower)
    if lower < upper:  # also False for NaN quantiles, which xlim rejects
        plt.xlim(lower, upper)
    plt.show()

In [None]:
# Trying out a log transformation to handle skewness.
# A plain log1p(x) is NaN for x < -1 and -inf at x == -1, so any negative EPS
# values (losses) would be lost. The signed form sign(x) * log1p(|x|) equals
# log1p(x) for x >= 0 and mirrors it for x < 0, so every finite value stays finite.
basic_eps = df["Basic EPS"]
df["Basic_EPS_Log"] = np.sign(basic_eps) * np.log1p(basic_eps.abs())

plt.figure(figsize=(8, 5))
sns.histplot(df["Basic_EPS_Log"], kde=True, bins=30)
plt.title("Log-Transformed Distribution of Basic EPS")
plt.xlabel("Log(Basic EPS)")
plt.ylabel("Frequency")
plt.show()

In [None]:
# Alternative treatment for skewness: robust scaling of Basic EPS.
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
# Fit on the single-column frame, then transform that same frame.
scaler.fit(df[["Basic EPS"]])
df["Basic_EPS_Scaled"] = scaler.transform(df[["Basic EPS"]])


In [None]:
# Histogram (with KDE) of the robust-scaled Basic EPS.
plt.figure(figsize=(8, 5))
sns.histplot(df["Basic_EPS_Scaled"], kde=True, bins=30)
plt.title("Robust Scaled Distribution of Basic EPS")

# RobustScaler (default settings) subtracts the median, so about half of the
# scaled values are negative. Clip the view to the 1st-99th percentile range
# rather than starting at 0, which would hide the whole lower half.
lower, upper = df["Basic_EPS_Scaled"].quantile([0.01, 0.99])
if lower < upper:  # also False for NaN quantiles (no data), which xlim rejects
    plt.xlim(lower, upper)
plt.xlabel("Basic EPS (Robust Scaled)")
plt.ylabel("Frequency")
plt.show()

### Missing Values

## Assessing Feature Relevance

### Correlation Analysis

## Identifying Redundant Features

### Multicollinearity Check

## Feature Selection Techniques

## Feature Relationships

### Pair Plots & Heatmaps

## Feature Importance

***