In [2]:
import polars as pl
import numpy as np

In [3]:
# Open the CRSP parquet dataset as a LazyFrame.
df = pl.scan_parquet('./Datasets/crsp')
# The schema does not depend on a row limit, so resolve it on the lazy frame directly.
df.collect_schema()

Schema([('PERMNO', Int64),
        ('date', String),
        ('NAMEENDT', String),
        ('SHRCD', Int64),
        ('EXCHCD', Int64),
        ('NCUSIP', String),
        ('TICKER', String),
        ('COMNAM', String),
        ('SHRCLS', String),
        ('PRIMEXCH', String),
        ('TRDSTAT', String),
        ('SECSTAT', String),
        ('PERMCO', Int64),
        ('CUSIP', String),
        ('DCLRDT', String),
        ('DLAMT', Float64),
        ('DLPDT', String),
        ('DLSTCD', Float64),
        ('PAYDT', String),
        ('RCRDDT', String),
        ('SHRFLG', Float64),
        ('DISTCD', Float64),
        ('DIVAMT', Float64),
        ('FACPR', Float64),
        ('FACSHR', Float64),
        ('ACPERM', Float64),
        ('ACCOMP', Float64),
        ('SHRENDDT', String),
        ('NWPERM', Float64),
        ('DLRETX', String),
        ('DLPRC', Float64),
        ('DLRET', String),
        ('BIDLO', Float64),
        ('ASKHI', Float64),
        ('PRC', Float64),
        ('VOL', F

In [4]:
# Daily log returns for Apple, computed lazily from CRSP prices.

# Resolve the ticker to its PERMNO. `.item()` only succeeds on a 1x1 frame, so
# this raises if 'AAPL' maps to zero or to several PERMNOs instead of silently
# picking one.
aapl_permno = (
    df.filter(pl.col('TICKER') == 'AAPL')
    .select('PERMNO')
    .unique()
    .collect()
    .item()
)

# CRSP stores PRC as a negative number when it is a bid/ask average rather than
# a traded close. The sign is only a marker, so returns must use the magnitude;
# otherwise a sign flip between consecutive rows makes the ratio negative and
# its log NaN.
abs_price = pl.col('PRC').abs()

# NOTE(review): shift(1) uses the row order produced by the scan. This assumes
# the rows of one PERMNO are stored in ascending date order with one row per
# trading day -- confirm (the 'date' column is a String, so it is not sorted on
# here). PRC is also presumably unadjusted for splits and distributions -- verify
# before using LOG_RET as a total return.
log_returns = (
    df.filter(pl.col('PERMNO') == aapl_permno)
    .select(['PERMNO', 'PRC'])
    .with_columns(
        (abs_price / abs_price.shift(1))
        .log()
        .alias('LOG_RET')
    )
    .collect()
)

# Keep a descriptive alias for the CRSP lazy frame.
df_crsp = df

In [42]:
# Compustat quarterly fundamentals: lazy scan, then projection onto a fixed
# subset of columns (the rolling sum itself is computed in a later cell).
df_cstat = pl.scan_parquet('./Datasets/compustat/fundamentals_quarterly_all')
cstat_clean = df_cstat.select(
    [
        'gvkey', 'datadate', 'gsector', 'tic', 'cusip', 'conm', 'cik',
        'fyearq', 'fqtr', 'rdq', 'indfmt', 'datafmt', 'consol', 'curcdq',
        'costat', 'prccq', 'cshoq', 'ajexq', 'mkvaltq', 'atq', 'actq',
        'cheq', 'rectq', 'invtq', 'ppentq', 'ltq', 'lctq', 'dlcq', 'dlttq',
        'apq', 'txdbq', 'seqq', 'ceqq', 'pstkq', 'saleq', 'cogsq', 'xsgaq',
        'xrdq', 'dpq', 'xintq', 'oiadpq', 'piq', 'txtq', 'niq', 'ibq',
        'epsfxq', 'oancfy', 'capxy', 'dvy', 'aqcy', 'epspxq', 'opepsq',
    ]
)
# Preview the last rows among those that carry a non-null sector code.
(
    cstat_clean
    .filter(pl.col('gsector').is_not_null())
    .tail()
    .collect()
)

gvkey,datadate,gsector,tic,cusip,conm,cik,fyearq,fqtr,rdq,indfmt,datafmt,consol,curcdq,costat,prccq,cshoq,ajexq,mkvaltq,atq,actq,cheq,rectq,invtq,ppentq,ltq,lctq,dlcq,dlttq,apq,txdbq,seqq,ceqq,pstkq,saleq,cogsq,xsgaq,xrdq,dpq,xintq,oiadpq,piq,txtq,niq,ibq,epsfxq,oancfy,capxy,dvy,aqcy,epspxq,opepsq
str,date,str,str,str,str,str,i32,i16,date,str,str,str,str,str,"decimal[24,12]","decimal[18,4]","decimal[24,12]","decimal[18,4]","decimal[18,4]","decimal[18,4]","decimal[18,4]","decimal[18,4]","decimal[18,4]","decimal[18,4]","decimal[18,4]","decimal[18,4]","decimal[18,4]","decimal[18,4]","decimal[18,4]","decimal[18,4]","decimal[18,4]","decimal[18,4]","decimal[18,4]","decimal[18,4]","decimal[18,4]","decimal[18,4]","decimal[18,4]","decimal[18,4]","decimal[18,4]","decimal[18,4]","decimal[18,4]","decimal[18,4]","decimal[18,4]","decimal[18,4]","decimal[18,4]","decimal[18,4]","decimal[18,4]","decimal[18,4]","decimal[18,4]","decimal[18,4]","decimal[18,4]"
"""266160""",2025-12-31,"""40""","""CZWI""","""174903104""","""CITIZENS COMMUNITY BANCORP""","""0001367859""",2025,4,2026-01-26,"""INDL""","""STD""","""C""","""USD""","""A""",17.82,9.617,1.0,171.3749,1781.755,,118.853,1317.924,5.811,16.357,1593.816,,,,1524.099,0.0,187.939,187.939,0.0,24.463,8.906,7.155,,,,8.402,4.885,0.614,4.271,4.271,0.44,,,,,0.44,0.44
"""266214""",2025-12-31,"""40""","""CNS""","""19247A100""","""COHEN & STEERS INC""","""0001284812""",2025,4,2026-01-22,"""INDL""","""STD""","""C""","""USD""","""A""",62.78,51.004,1.0,3202.0311,,,,,,,,,,,,,562.0,562.0,0.0,143.803,100.958,,,2.535,0.0,40.31,42.288,11.585,34.879,34.879,0.68,,,,,0.68,0.68
"""294524""",2025-12-31,"""15""","""LYB""","""N53745100""","""LYONDELLBASELL INDUSTRIES NV""","""0001489393""",2025,4,2026-01-30,"""INDL""","""STD""","""C""","""USD""","""A""",43.3,322.0,1.0,13942.6,34003.0,10868.0,3449.0,2517.0,3533.0,17347.0,23796.0,6129.0,814.0,13451.0,2694.0,2316.0,10082.0,10082.0,0.0,7091.0,6335.0,397.0,33.0,385.0,132.0,-26.0,-142.0,-8.0,-142.0,-136.0,-0.43,2262.0,1878.0,1764.0,,-0.43,-0.26
"""317264""",2025-12-31,"""10""","""LPG""","""Y2106R110""","""DORIAN LPG LTD""","""0001596993""",2025,3,2026-02-05,"""INDL""","""STD""","""C""","""USD""","""A""",24.34,42.744,1.0,1040.389,1777.739,392.286,294.492,73.35,2.251,1328.03,692.11,162.462,145.559,528.068,6.691,0.0,1085.629,1085.629,0.0,120.65,42.317,10.78,,16.242,6.83,51.311,47.189,0.0,47.189,47.189,1.11,127.996,29.665,75.249,0.0,1.11,1.11
"""326688""",2025-12-31,"""20""","""NVT""","""G6700G107""","""NVENT ELECTRIC PLC""","""0001720635""",2025,4,2026-02-06,"""INDL""","""STD""","""C""","""USD""","""A""",101.97,161.36,1.0,16453.8792,6851.9,1639.6,237.5,693.0,,434.5,3121.7,1003.5,13.8,1546.0,358.9,232.0,3730.2,3730.2,0.0,1066.7,677.8,219.7,21.3,,19.1,169.2,156.3,40.5,118.8,115.8,0.71,465.2,93.3,130.4,975.7,0.72,0.74


In [None]:
# Earnings TTM (trailing twelve months): per company, the sum of `epspxq` over
# the current and the three preceding quarterly rows.
#
# The window is computed before the projection so that `datadate` is still
# available to order each company's rows; the output keeps the same four
# columns and the same row order as before.
#
# NOTE(review): a 4-row window equals four consecutive quarters only if each
# gvkey has exactly one row per quarter with no gaps -- confirm, e.g. by
# checking for duplicate (gvkey, datadate) pairs across indfmt/datafmt/consol.
result = (
    cstat_clean
    .with_columns(
        pl.col('epspxq')
        .cast(pl.Float64)  # stored as decimal; cast so TTM comes out as f64
        .rolling_sum(window_size=4)
        # order_by ties the window to report dates within each gvkey instead
        # of relying on whatever row order the parquet scan happens to yield.
        .over('gvkey', order_by='datadate')
        .alias('TTM')
    )
    .select(['tic', 'gvkey', 'epspxq', 'TTM'])
)

result.collect()

tic,gvkey,epspxq,TTM
str,str,"decimal[18,4]",f64
"""ADCT.1""","""001013""",0.0200,
"""SERV.1""","""001082""",-0.2000,
"""AIM.1""","""001173""",0.1700,
"""IDAI.""","""001183""",-0.1900,
"""ABS.1""","""001240""",0.3500,
…,…,…,…
"""FEX""","""177422""",,
"""PIN""","""179400""",,
"""UTG""","""260849""",,
"""SCD""","""260850""",,
