In [1]:
import pandas as pd
from pathlib import Path

In [2]:
data_dir = Path('../data/external')
corporate_credit_dir = data_dir / 'corporate-credit-rating'

# List all CSV files in the corporate-credit-rating directory.
# glob() yields entries in arbitrary, filesystem-dependent order, so the
# result is sorted to keep the listing stable across machines and re-runs.
csv_files = sorted(corporate_credit_dir.glob('*.csv'))
print("Found CSV files:")
for file in csv_files:
    print(f"  - {file.name}")

# Report each file's on-disk size in MB (bytes / 1024**2)
print("\nFile sizes:")
for file in csv_files:
    size_mb = file.stat().st_size / (1024 * 1024)
    print(f"  - {file.name}: {size_mb:.1f} MB")

Found CSV files:
  - corporate_rating.csv

File sizes:
  - corporate_rating.csv: 0.7 MB


In [3]:
cc_file = corporate_credit_dir / 'corporate_rating.csv'

# Load the corporate credit rating CSV and report its basic dimensions.
print("\nLoading corporate credit rating dataset...")
cc_df = pd.read_csv(cc_file)
print(f"Dataset shape: {cc_df.shape[0]:,} rows, {cc_df.shape[1]} columns")
print("Columns:", list(cc_df.columns))
# deep=True makes pandas inspect object (string) columns for their real
# memory consumption; the byte total is converted to MB for display.
print(f"Memory usage: {cc_df.memory_usage(deep=True).sum() / 1024 / 1024:.1f} MB")


Loading Taiwan credit dataset...
Dataset shape: 2,029 rows, 31 columns
Columns: ['Rating', 'Name', 'Symbol', 'Rating Agency Name', 'Date', 'Sector', 'currentRatio', 'quickRatio', 'cashRatio', 'daysOfSalesOutstanding', 'netProfitMargin', 'pretaxProfitMargin', 'grossProfitMargin', 'operatingProfitMargin', 'returnOnAssets', 'returnOnCapitalEmployed', 'returnOnEquity', 'assetTurnover', 'fixedAssetTurnover', 'debtEquityRatio', 'debtRatio', 'effectiveTaxRate', 'freeCashFlowOperatingCashFlowRatio', 'freeCashFlowPerShare', 'cashPerShare', 'companyEquityMultiplier', 'ebitPerRevenue', 'enterpriseValueMultiple', 'operatingCashFlowPerShare', 'operatingCashFlowSalesRatio', 'payablesTurnover']
Memory usage: 1.1 MB


In [4]:
# Number of rows in the loaded frame
cc_df.shape[0]

2029

In [5]:
# Row count per distinct value of the 'Rating' column, most frequent first
rating_column = cc_df['Rating']
rating_column.value_counts()

Rating
BBB    671
BB     490
A      398
B      302
AA      89
CCC     64
AAA      7
CC       5
C        2
D        1
Name: count, dtype: int64

In [6]:
# Fraction of all rows whose 'Rating' is 'D'
rows_rated_d = cc_df.loc[cc_df['Rating'] == 'D']
rows_rated_d.shape[0] / cc_df.shape[0]

0.0004928536224741252

In [None]:
# Share of each rating among all rows, listed in order of first appearance
# (the order Series.unique() returns).
# Two decimals are used because the rarest ratings are well below 0.1% of
# the rows; with a single decimal a share under 0.05% would print as 0.0%.
rating_col = cc_df['Rating']
for value in rating_col.unique():
    # The mean of the boolean mask equals matching rows / total rows, without
    # materialising a filtered copy of the whole frame for every rating.
    print(f"{value}: {(rating_col == value).mean():.2%}")

A: 19.62%
BBB: 33.07%
AA: 4.39%
BB: 24.15%
B: 14.88%
CCC: 3.15%
D: 0.05%
CC: 0.25%
AAA: 0.34%
C: 0.10%
