# Factor IC Analysis

The **Information Coefficient (IC)** measures the cross-sectional correlation between a factor’s signal and subsequent returns. In factor research, IC quantifies predictive power on each rebalancing date and helps compare factor efficacy.

**Spearman rank correlation** is standard because it is robust to outliers and captures monotonic relationships without assuming linearity or normal distributions, which is common for financial cross sections.

In [9]:
# Load libraries
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.stats import spearmanr

ROOT = Path.cwd().parents[0]

DATA_PATH = ROOT / "00_data" / "features" / "factors.parquet"

if not DATA_PATH.exists():
    raise FileNotFoundError(f"Data file not found: {DATA_PATH}")

df = pd.read_parquet(DATA_PATH)

print("Project root:", ROOT)
print("Data path:", DATA_PATH)
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())

# Identify key columns (defensive, but should match exactly)
date_col = "date"
ticker_col = "ticker"

df[date_col] = pd.to_datetime(df[date_col])

print("Date range:", df[date_col].min(), "→", df[date_col].max())
print("Number of tickers:", df[ticker_col].nunique())


Project root: /Users/pablodiazgonzalez/Documents/MachineLearning/EquityFactorResearch
Data path: /Users/pablodiazgonzalez/Documents/MachineLearning/EquityFactorResearch/00_data/features/factors.parquet
Shape: (70122, 10)
Columns: ['date', 'ticker', 'ret_1d', 'ret_fwd_5d', 'mom_20d', 'mom_60d', 'vol_20d', 'zscore_20d_price', 'adv_20d', 'dollar_vol_20d']
Date range: 2015-03-31 00:00:00 → 2025-12-17 00:00:00
Number of tickers: 26


In [10]:
# Compute daily IC time series per factor
factors = ['mom_20d', 'mom_60d', 'vol_20d', 'zscore_20d_price']
target = 'ret_fwd_5d'

ic_records = []

for date, group in df.groupby(date_col):
    row = {date_col: date}
    for factor in factors:
        sub = group[[factor, target]].dropna()
        # Require a minimum cross section to avoid noisy ICs
        if len(sub) >= 10:
            ic = spearmanr(sub[factor], sub[target]).correlation
        else:
            ic = np.nan
        row[factor] = ic
    ic_records.append(row)

ic_daily = pd.DataFrame(ic_records).set_index(date_col).sort_index()
ic_daily.head()

Unnamed: 0_level_0,mom_20d,mom_60d,vol_20d,zscore_20d_price
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-03-31,-0.409231,0.070769,0.544615,-0.504957
2015-04-01,-0.419487,-0.010598,0.36547,-0.215043
2015-04-02,-0.534359,-0.090598,0.576068,-0.340171
2015-04-06,-0.411282,0.083761,0.553504,-0.323761
2015-04-07,0.085812,0.109744,0.207521,0.20547


In [11]:
# Aggregate IC statistics
summary = []

for factor in factors:
    series = ic_daily[factor].dropna()
    n_obs = series.shape[0]
    mean_ic = series.mean()
    std_ic = series.std(ddof=1)
    t_stat = mean_ic / (std_ic / np.sqrt(n_obs)) if n_obs > 1 and std_ic != 0 else np.nan
    summary.append({
        'factor': factor,
        'mean_ic': mean_ic,
        'std_ic': std_ic,
        't_stat': t_stat,
        'n_obs': n_obs,
    })

ic_overall = pd.DataFrame(summary)
ic_overall = ic_overall.reindex(ic_overall['mean_ic'].abs().sort_values(ascending=False).index)
ic_overall

Unnamed: 0,factor,mean_ic,std_ic,t_stat,n_obs
2,vol_20d,0.035428,0.341222,5.392061,2697
1,mom_60d,0.017453,0.306893,2.953345,2697
0,mom_20d,0.009467,0.295917,1.661393,2697
3,zscore_20d_price,-0.000451,0.2775,-0.084478,2697


In [12]:
# Temporal stability: monthly average ICs
ic_monthly = ic_daily.resample('ME').mean()
ic_monthly.index = ic_monthly.index.to_period('M').astype(str)
ic_monthly.head()

Unnamed: 0_level_0,mom_20d,mom_60d,vol_20d,zscore_20d_price
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-03,-0.409231,0.070769,0.544615,-0.504957
2015-04,0.021929,-0.089068,0.105185,0.037851
2015-05,0.108308,0.179487,0.144615,0.012889
2015-06,0.249169,0.154654,-0.045688,0.254452
2015-07,0.151919,0.164693,-0.004103,0.059394


In [13]:
# Save outputs (PARQUET only)
output_overall = ROOT / '05_reports' / 'ic_overall.parquet'
output_monthly = ROOT / '05_reports' / 'ic_by_month.parquet'

ic_overall.to_parquet(output_overall, index=False)
ic_monthly.to_parquet(output_monthly)

print('Saved:', output_overall)
print('Saved:', output_monthly)

Saved: /Users/pablodiazgonzalez/Documents/MachineLearning/EquityFactorResearch/05_reports/ic_overall.parquet
Saved: /Users/pablodiazgonzalez/Documents/MachineLearning/EquityFactorResearch/05_reports/ic_by_month.parquet


## Interpretation

- **Strongest factor:** The factor with the largest absolute mean IC in the summary table above.
- **Weakest factor:** The factor with the smallest absolute mean IC in the summary table above.
- **Sign consistency & stability:** Review monthly ICs to see whether signs are consistent through time or if they drift/flip, indicating instability.
- **Limitations:** These ICs ignore transaction costs, implementation constraints, and portfolio construction effects; they measure only raw cross-sectional predictiveness.