In [1]:
import polars as pl

In [4]:
import glob

queries = []
for file in glob.glob('data/*.csv'):
    splitted_file_name = file.split('-')
    month = int(splitted_file_name[1].split('.')[0]) if len(splitted_file_name) > 1 else 1
    q = pl.scan_csv(file).with_columns(pl.lit(month).alias('file date'))
    queries.append(q)

df = pl.concat(pl.collect_all(queries))
df.filter(pl.col('id')==588).head()

id,first_name,last_name,Ticker,Sector,Market,Stock Name,Market Cap,Purchase Price,file date
i64,str,str,str,str,str,str,str,str,i32
588,"""Caterina""","""McHardy""","""BSAC""","""Finance""","""NYSE""","""Banco Santande…","""$11.96B""","""$24647.37""",1
588,"""Job""","""Burchett""","""TOPS""","""Transportation…","""NASDAQ""","""TOP Ships Inc.…","""$392729.7""","""$18062.27""",11
588,"""Garfield""","""Cookney""","""SYX""","""Consumer Servi…","""NYSE""","""Systemax Inc.""","""$685.51M""","""$91588.34""",3
588,"""Ephrem""","""Ricards""","""GJV""","""Finance""","""NYSE""","""Synthetic Fixe…","""n/a""","""$43250.18""",2
588,"""Tilda""","""De Paoli""","""JNPR""","""Technology""","""NYSE""","""Juniper Networ…","""$10.95B""","""$83211.49""",10


In [8]:
def clean_market_cap() -> pl.Expr:
    '''
    Convert symbols 'B' and 'M' to zeros while removing '$' symbol
    '''
    conversion = {'M': 1_000_000, 'B': 1_000_000_000}
    col_expr = pl.col('Market Cap').str.replace('$', '', literal=True)
    mutiplier_expr = col_expr.str.slice(-1).replace(conversion).cast(pl.Int64)

    cleaned = (
        pl.when(col_expr.str.contains('B')).then(col_expr.str.replace('B', '', literal=True))
        .when(col_expr.str.contains('M')).then(col_expr.str.replace('M', '', literal=True))
        .otherwise(col_expr)
        .cast(pl.Float64) * mutiplier_expr
    ).cast(pl.Int64)

    return cleaned

(
    df
    .filter(pl.col('Market Cap') != 'n/a')
    .with_columns(
        pl.col('Purchase Price').str.replace('$', '', literal=True).cast(pl.Float64),
        clean_market_cap() 
    )
    .with_columns(
        pl.when(pl.col('Purchase Price') < 25_000).then(pl.lit('Low'))
        .when(pl.col('Purchase Price') < 50_000).then(pl.lit('Medium'))
        .when(pl.col('Purchase Price') < 75_000).then(pl.lit('High'))
        .when(pl.col('Purchase Price') < 100_000).then(pl.lit('Very High'))
        .alias('Purchase Price Category'),
        pl.when(pl.col('Market Cap') < 100_000_000).then(pl.lit('Small'))
        .when(pl.col('Market Cap') < 1_000_000_000).then(pl.lit('Medium'))
        .when(pl.col('Market Cap') < 100_000_000_000).then(pl.lit('Large'))
        .when(pl.col('Market Cap') >= 100_000_000_000).then(pl.lit('Huge'))
        .alias('Market Cap Category')
    )
    .with_columns(
        pl.col('Purchase Price')
        .rank('ordinal', descending=True).over('file date', 'Purchase Price Category', 'Market Cap Category')
        .alias('Rank')
    )
    .filter(pl.col('Rank')<=5)
)

id,first_name,last_name,Ticker,Sector,Market,Stock Name,Market Cap,Purchase Price,file date,Purchase Price Category,Market Cap Category,Rank
i64,str,str,str,str,str,str,i64,f64,i32,str,str,u32
14,"""Erminie""","""Lis""","""JHD""","""n/a""","""NYSE""","""Nuveen High In…",277100000,24418.39,1,"""Low""","""Medium""",2
22,"""Davin""","""Rusling""","""LINK""","""Technology""","""NASDAQ""","""Interlink Elec…",60660000,23502.42,1,"""Low""","""Small""",4
54,"""Jany""","""Hancke""","""CSOD""","""Technology""","""NASDAQ""","""Cornerstone On…",2089999999,74079.42,1,"""High""","""Large""",5
107,"""Chico""","""De Maria""","""CBA …","""n/a""","""NYSE""","""ClearBridge Am…",508590000,48507.84,1,"""Medium""","""Medium""",5
120,"""Nealson""","""Hosburn""","""VSTM""","""Health Care""","""NASDAQ""","""Verastem, Inc.…",74720000,74416.07,1,"""High""","""Small""",3
125,"""Margret""","""Harry""","""SHOP""","""Technology""","""NYSE""","""Shopify Inc.""",7350000000,49796.86,1,"""Medium""","""Large""",1
138,"""Luciano""","""Girauld""","""PANL""","""Transportation…","""NASDAQ""","""Pangaea Logist…",99120000,24071.47,1,"""Low""","""Small""",1
148,"""Mickie""","""Brack""","""AR""","""Energy""","""NYSE""","""Antero Resourc…",7130000000,49504.15,1,"""Medium""","""Large""",3
154,"""Sherline""","""Vowels""","""IZEA""","""Consumer Servi…","""NASDAQ""","""IZEA Inc.""",12380000,48496.19,1,"""Medium""","""Small""",1
161,"""Rudiger""","""Mathieson""","""FLKS""","""Health Care""","""NASDAQ""","""Flex Pharma, I…",64330000,96663.47,1,"""Very High""","""Small""",3
