In [198]:
from glob import glob
from pathlib import Path
import polars as pl
import re

# Resolve the input directory relative to the notebook's working directory.
CWD = Path.cwd()
input_dir = CWD.joinpath('input')

# Collect every CSV in the input directory (as plain string paths).
# glob() returns matches in arbitrary, filesystem-dependent order, so the list
# is sorted to keep the concatenation order - and therefore the row order of
# the final output - reproducible between runs and machines.
data_sets = sorted(glob(f'{input_dir}/*.csv'))

# Load every input file, tag it with the month encoded in its file name,
# parse the price columns and stack all files into a single frame.
def _read_monthly_file(file):
    """Read one input CSV and return it with parsed helper columns.

    Rows whose 'Market Cap' equals 'n/a' are dropped. Columns added:

    * ``month``          - 2023-<month>-01 as a Date, where <month> is taken
                           from the file name.
    * ``cap_scale``      - 'Market Cap' with its digits and first '.' removed
                           (e.g. '$B' for '$2.09B').
    * ``market_cap``     - numeric part of 'Market Cap' as Float64, rounded to
                           2 decimals; not yet multiplied by its scale.
    * ``purchase_price`` - 'Purchase Price' without the '$' as Float64.

    Parameters
    ----------
    file : str
        Path of the CSV file to read.
    """
    # Everything after the first ten characters of the file stem is used as
    # the month number; an empty remainder means month 1.
    # NOTE(review): presumably the first ten characters are a fixed file-name
    # prefix - confirm against the actual input file names.
    # Path(...).stem is used instead of splitting on '/' so the slice also
    # works when the path uses a different separator (e.g. on Windows).
    month_number = Path(file).stem[10:] or '1'

    return (
        pl.read_csv(file)
        .with_columns([
            pl.lit(f'2023-{month_number}-01')
              .str.strptime(pl.Date, '%Y-%m-%d')
              .alias('month')
        ])
        .filter(pl.col('Market Cap') != 'n/a')
        .with_columns([
            pl.col('Market Cap')
              .str.replace_all(r"\d+", "", literal=False)
              .str.replace('.', '', literal=True)
              .alias('cap_scale'),
            # Strip only the leading '$' and a trailing alphabetic scale
            # suffix. A pattern that removes *any* last character would also
            # cut the final digit off values that carry no suffix.
            pl.col('Market Cap')
              .str.replace_all(r"^\$|[A-Za-z]+$", "", literal=False)
              .cast(pl.Float64)
              .round(2)
              .alias('market_cap'),
            pl.col('Purchase Price')
              .str.replace('$', '', literal=True)
              .cast(pl.Float64)
              .alias('purchase_price'),
        ])
    )


df = pl.concat([_read_monthly_file(file) for file in data_sets])


#df.select(pl.col('cap_scale').value_counts())

# Multiplier that turns the abbreviated market-cap figure into plain dollars.
# There is no otherwise() branch, so any scale other than '$B', '$M' or '$'
# is left without a multiplier.
cap_multiplier = (
    pl.when(pl.col('cap_scale') == '$B').then(1_000_000_000)
      .when(pl.col('cap_scale') == '$M').then(1_000_000)
      .when(pl.col('cap_scale') == '$').then(1)
      .alias('multiply')
)

# Purchase-price bands (upper bounds are exclusive).
price_bucket = (
    pl.when(pl.col('purchase_price') < 25_000).then('Low')
      .when(pl.col('purchase_price') < 50_000).then('Medium')
      .when(pl.col('purchase_price') < 75_000).then('High')
      .otherwise('Very High')
      .cast(pl.Categorical)
      .alias('purchase_price_category')
)

# Market-cap bands (upper bounds are exclusive).
cap_bucket = (
    pl.when(pl.col('market_cap') < 100_000_000).then('Small')
      .when(pl.col('market_cap') < 1_000_000_000).then('Medium')
      .when(pl.col('market_cap') < 100_000_000_000).then('Large')
      .otherwise('Huge')
      .cast(pl.Categorical)
      .alias('market_cap_category')
)

# Dense rank of the purchase price, highest first, computed separately for
# every (month, market-cap band, purchase-price band) combination.
price_rank = (
    pl.col('purchase_price')
      .rank('dense', reverse=True)
      .over(['month', 'market_cap_category', 'purchase_price_category'])
      .alias('purchase_price_rank')
)

df = (
    df.with_columns([cap_multiplier])
      # scale the market cap to its full value
      .with_columns([(pl.col('market_cap') * pl.col('multiply')).alias('market_cap')])
      # the raw / helper columns are no longer needed
      .drop(['multiply', 'Market Cap', 'cap_scale', 'Purchase Price'])
      .with_columns([price_bucket, cap_bucket])
      .with_columns([price_rank])
      # keep only ranks 1-5 within each group
      .filter(pl.col('purchase_price_rank') <= 5)
)


# Final tidy-up: drop the columns that are not part of the output and expose
# the month column as 'file_date'.
df = df.drop(['id', 'first_name', 'last_name']).rename({'month': 'file_date'})

# Normalise the remaining headers to lower snake_case. The mapping is built
# from the columns that exist *after* the drop/rename above; deriving it from
# the earlier frame would include keys ('id', 'first_name', 'last_name',
# 'month') that are no longer present, which rename() can reject.
df = df.rename({col: col.lower().replace(' ', '_') for col in df.columns})


ticker,sector,market,stock_name,file_date,market_cap,purchase_price,purchase_price_category,market_cap_category,purchase_price_rank
str,str,str,str,date,f64,f64,cat,cat,u32
"""JHD""","""n/a""","""NYSE""","""Nuveen High In...",2023-01-01,2.771e8,24418.39,"""Low""","""Medium""",2
"""LINK""","""Technology""","""NASDAQ""","""Interlink Elec...",2023-01-01,6.066e7,23502.42,"""Low""","""Small""",4
"""CSOD""","""Technology""","""NASDAQ""","""Cornerstone On...",2023-01-01,2.0900e9,74079.42,"""High""","""Large""",5
"""CBA ...","""n/a""","""NYSE""","""ClearBridge Am...",2023-01-01,5.0859e8,48507.84,"""Medium""","""Medium""",5
"""VSTM""","""Health Care""","""NASDAQ""","""Verastem, Inc....",2023-01-01,7.472e7,74416.07,"""High""","""Small""",3
"""SHOP""","""Technology""","""NYSE""","""Shopify Inc.""",2023-01-01,7.3500e9,49796.86,"""Medium""","""Large""",1
"""PANL""","""Transportation...","""NASDAQ""","""Pangaea Logist...",2023-01-01,9.912e7,24071.47,"""Low""","""Small""",1
"""AR""","""Energy""","""NYSE""","""Antero Resourc...",2023-01-01,7.1300e9,49504.15,"""Medium""","""Large""",3
"""IZEA""","""Consumer Servi...","""NASDAQ""","""IZEA Inc.""",2023-01-01,1.238e7,48496.19,"""Medium""","""Small""",1
"""FLKS""","""Health Care""","""NASDAQ""","""Flex Pharma, I...",2023-01-01,6.433e7,96663.47,"""Very High""","""Small""",3


##### Output the data to a CSV file

In [205]:
output_dir = CWD.joinpath('output')

# mkdir(exist_ok=True) is already a no-op when the directory exists, so a
# separate exists() check is unnecessary (and would leave a gap between the
# check and the creation). parents=False: only 'output' itself is created.
output_dir.mkdir(parents=False, exist_ok=True)

# Write the result as a comma-separated file with a header row; dates are
# formatted as YYYY-MM-DD.
df.write_csv(f'{output_dir}/py-solution.csv', sep=',', has_header=True, date_format='%Y-%m-%d')