In [1]:
import polars as pl
from polars import datatypes
from pathlib import Path

In [2]:
# Directory holding the raw input CSVs (relative to the current working directory).
data_dir = Path('../data/raw')

# List all CSV files in the data directory.
# Path.glob yields entries in arbitrary, filesystem-dependent order, so sort
# them to keep the listing identical across machines and re-runs.
csv_files = sorted(data_dir.glob('*.csv'))
print("Found CSV files:")
if not csv_files:
    # Make a wrong path or an empty directory obvious instead of a bare header.
    print(f"  (none found in {data_dir})")
for file in csv_files:
    # st_size is in bytes; convert to mebibytes for display.
    size_mb = file.stat().st_size / (1024 * 1024)
    print(f"  - {file.name}: size = {size_mb:.1f} MB")

Found CSV files:
  - taiwan_credit.csv: size = 2.7 MB
  - lending_club.csv: size = 1597.5 MB
  - corporate_credit_rating.csv: size = 0.7 MB


In [None]:
# Read the Taiwan credit CSV into an eager DataFrame and report its size.
taiwan_credit_file = data_dir.joinpath('taiwan_credit.csv')

print("\nLoading Taiwan credit dataset...")
# infer_schema_length=None lets polars look at the whole file when inferring dtypes.
taiwan_credit_df = pl.read_csv(taiwan_credit_file, infer_schema_length=None)
print(f"Dataset shape: {taiwan_credit_df.height:,} rows, {taiwan_credit_df.width} columns")
print("Columns:", list(taiwan_credit_df.columns))
# estimated_size() is in bytes; convert to mebibytes for display.
print(f"Memory usage: {taiwan_credit_df.estimated_size() / (1024 * 1024):.1f} MB")



Loading Taiwan credit dataset...
Dataset shape: 30,000 rows, 25 columns
Columns: ['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'default.payment.next.month']
Memory usage: 5.7 MB


In [14]:
# Show each column's name and inferred dtype (cell output is the Schema repr).
taiwan_credit_df.schema

Schema([('ID', Int64),
        ('LIMIT_BAL', Float64),
        ('SEX', Int64),
        ('EDUCATION', Int64),
        ('MARRIAGE', Int64),
        ('AGE', Int64),
        ('PAY_0', Int64),
        ('PAY_2', Int64),
        ('PAY_3', Int64),
        ('PAY_4', Int64),
        ('PAY_5', Int64),
        ('PAY_6', Int64),
        ('BILL_AMT1', Float64),
        ('BILL_AMT2', Float64),
        ('BILL_AMT3', Float64),
        ('BILL_AMT4', Float64),
        ('BILL_AMT5', Float64),
        ('BILL_AMT6', Float64),
        ('PAY_AMT1', Float64),
        ('PAY_AMT2', Float64),
        ('PAY_AMT3', Float64),
        ('PAY_AMT4', Float64),
        ('PAY_AMT5', Float64),
        ('PAY_AMT6', Float64),
        ('default.payment.next.month', Int64)])

In [13]:
# Fraction of all rows whose target column equals 1, then printed as a percentage.
default_rate = (
    (taiwan_credit_df["default.payment.next.month"] == 1).sum()
    / taiwan_credit_df.height
)
default_rate_percentage = 100 * default_rate
print(f"\nDefault rate: {default_rate_percentage:.2f}%")


Default rate: 22.12%
