In [2]:
import polars as pl
from pathlib import Path

In [4]:
data_dir = Path('../data/raw')

# List all CSV files in the data directory
csv_files = list(data_dir.glob('*.csv'))
print("Found CSV files:")
for file in csv_files:
    size_mb = file.stat().st_size / (1024 * 1024)
    print(f"  - {file.name}: size = {size_mb:.1f} MB")

Found CSV files:
  - taiwan_credit.csv: size = 2.7 MB
  - lending_club.csv: size = 1597.5 MB
  - corporate_credit_rating.csv: size = 0.7 MB


In [6]:
cc_file = data_dir / 'corporate_credit_rating.csv'

print("\nLoading Corporate Credit Rating dataset...")
cc_df = pl.read_csv(cc_file)
print(f"Dataset shape: {cc_df.shape[0]:,} rows, {cc_df.shape[1]} columns")
print("Columns:", list(cc_df.columns))
print(f"Memory usage: {cc_df.estimated_size() / 1024 / 1024:.1f} MB")


Loading Corporate Credit Rating dataset...
Dataset shape: 2,029 rows, 31 columns
Columns: ['Rating', 'Name', 'Symbol', 'Rating Agency Name', 'Date', 'Sector', 'currentRatio', 'quickRatio', 'cashRatio', 'daysOfSalesOutstanding', 'netProfitMargin', 'pretaxProfitMargin', 'grossProfitMargin', 'operatingProfitMargin', 'returnOnAssets', 'returnOnCapitalEmployed', 'returnOnEquity', 'assetTurnover', 'fixedAssetTurnover', 'debtEquityRatio', 'debtRatio', 'effectiveTaxRate', 'freeCashFlowOperatingCashFlowRatio', 'freeCashFlowPerShare', 'cashPerShare', 'companyEquityMultiplier', 'ebitPerRevenue', 'enterpriseValueMultiple', 'operatingCashFlowPerShare', 'operatingCashFlowSalesRatio', 'payablesTurnover']
Memory usage: 0.5 MB


In [7]:
len(cc_df)

2029

In [8]:
cc_df['Rating'].value_counts()

Rating,count
str,u32
"""CCC""",64
"""BB""",490
"""CC""",5
"""AA""",89
"""C""",2
"""AAA""",7
"""D""",1
"""A""",398
"""BBB""",671
"""B""",302


In [10]:
cc_df.filter(pl.col("Rating") == "D").height / cc_df.height

0.0004928536224741252

In [12]:
total = cc_df.height

rating_stats = (
    cc_df.group_by("Rating")
         .agg(pl.count().alias("count"))
         .with_columns((pl.col("count") / total).alias("share"))
         .sort("Rating")
)

for rating, _, share in rating_stats.iter_rows():
    print(f"{rating}: {share:.1%}")

A: 19.6%
AA: 4.4%
AAA: 0.3%
B: 14.9%
BB: 24.1%
BBB: 33.1%
C: 0.1%
CC: 0.2%
CCC: 3.2%
D: 0.0%


(Deprecated in version 0.20.5)
  .agg(pl.count().alias("count"))


In [13]:
cc_df.collect_schema()

Schema([('Rating', String),
        ('Name', String),
        ('Symbol', String),
        ('Rating Agency Name', String),
        ('Date', String),
        ('Sector', String),
        ('currentRatio', Float64),
        ('quickRatio', Float64),
        ('cashRatio', Float64),
        ('daysOfSalesOutstanding', Float64),
        ('netProfitMargin', Float64),
        ('pretaxProfitMargin', Float64),
        ('grossProfitMargin', Float64),
        ('operatingProfitMargin', Float64),
        ('returnOnAssets', Float64),
        ('returnOnCapitalEmployed', Float64),
        ('returnOnEquity', Float64),
        ('assetTurnover', Float64),
        ('fixedAssetTurnover', Float64),
        ('debtEquityRatio', Float64),
        ('debtRatio', Float64),
        ('effectiveTaxRate', Float64),
        ('freeCashFlowOperatingCashFlowRatio', Float64),
        ('freeCashFlowPerShare', Float64),
        ('cashPerShare', Float64),
        ('companyEquityMultiplier', Float64),
        ('ebitPerRevenue', Fl