In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
# Locate the Lending Club CSV exports relative to the notebook's working directory.
data_dir = Path('../data/external')
lending_club_dir = data_dir / 'lending-club'

# glob() makes no promise about the order in which matches are yielded, so the
# result is sorted to keep the listing (and any later loop over csv_files)
# identical from one machine or run to the next.
csv_files = sorted(lending_club_dir.glob('*.csv'))
print("Found CSV files:")
for csv_path in csv_files:
    print(f"  - {csv_path.name}")

# Report each file's on-disk size, converting bytes to units of 1024 * 1024 bytes.
print("\nFile sizes:")
for csv_path in csv_files:
    size_mb = csv_path.stat().st_size / (1024 * 1024)
    print(f"  - {csv_path.name}: {size_mb:.1f} MB")

Found CSV files:
  - accepted_2007_to_2018Q4.csv
  - rejected_2007_to_2018Q4.csv

File sizes:
  - accepted_2007_to_2018Q4.csv: 1597.5 MB
  - rejected_2007_to_2018Q4.csv: 1699.7 MB


In [3]:
# Read the accepted-loans CSV. Only the loan_status column is requested via
# usecols, so the resulting frame has a single column even though the progress
# messages below speak of the "full" dataset.
print("Loading accepted loans dataset...")
accepted_file = lending_club_dir.joinpath('accepted_2007_to_2018Q4.csv')

print("\nLoading full accepted dataset...")
accepted_df = pd.read_csv(accepted_file, usecols=['loan_status'])

# deep=True makes memory_usage count the Python string objects themselves,
# not just the pointers stored in the column.
loaded_bytes = accepted_df.memory_usage(deep=True).sum()
print(f"Full dataset shape: {accepted_df.shape}")
print(f"Memory usage: {loaded_bytes / 1024 / 1024:.1f} MB")

Loading accepted loans dataset...

Loading full accepted dataset...
Full dataset shape: (2260701, 1)
Memory usage: 125.3 MB


# Dataset Summary Analysis

This notebook provides a summary of the Lending Club datasets found in the `data/external/lending-club` folder.

In [4]:
# Overview of the accepted-loans frame: size, dtypes, missing values, numeric stats
print("=== ACCEPTED LOANS DATASET SUMMARY ===")
n_rows, n_cols = accepted_df.shape
print(f"Dataset shape: {n_rows:,} rows, {n_cols} columns")

# The issue-date range can only be reported when that column was loaded
if 'issue_d' in accepted_df.columns:
    issue_dates = accepted_df['issue_d']
    print(f"Date range: {issue_dates.min()} to {issue_dates.max()}")
else:
    print("No issue_d column found")

print("\n--- Data Types ---")
print(accepted_df.dtypes.value_counts())

print("\n--- Missing Values ---")
missing_values = accepted_df.isna().sum()
missing_pct = missing_values / len(accepted_df) * 100
missing_summary = pd.DataFrame(
    {'Missing_Count': missing_values, 'Missing_Percentage': missing_pct}
).sort_values('Missing_Count', ascending=False)
# Show at most the ten columns with the most missing entries; columns without
# any missing entry are left out entirely
has_missing = missing_summary['Missing_Count'] > 0
print(missing_summary.loc[has_missing].head(10))

print("\n--- Numerical Columns Summary ---")
numeric_cols = accepted_df.select_dtypes(include=[np.number]).columns
# describe() is skipped when the frame holds no numeric column at all
if not numeric_cols.empty:
    print(accepted_df[numeric_cols].describe())

=== ACCEPTED LOANS DATASET SUMMARY ===
Dataset shape: 2,260,701 rows, 1 columns
No issue_d column found

--- Data Types ---
object    1
Name: count, dtype: int64

--- Missing Values ---
             Missing_Count  Missing_Percentage
loan_status             33             0.00146

--- Numerical Columns Summary ---


In [5]:
# Inspect the distinct loan_status labels (missing values included) before filtering
status_column = accepted_df['loan_status']
status_column.unique()

array(['Fully Paid', 'Current', 'Charged Off', 'In Grace Period',
       'Late (31-120 days)', 'Late (16-30 days)', 'Default', nan,
       'Does not meet the credit policy. Status:Fully Paid',
       'Does not meet the credit policy. Status:Charged Off'],
      dtype=object)

In [14]:
# Restrict to loans whose status is Charged Off, Fully Paid or Current.
# Note: despite the _df suffix, ls_df is a Series (the loan_status column only).
kept_statuses = ['Charged Off', 'Fully Paid', 'Current']
status_mask = accepted_df['loan_status'].isin(kept_statuses)
ls_df = accepted_df.loc[status_mask, 'loan_status'].copy()

In [16]:
# Share of each loan status among the filtered loans.
# value_counts(normalize=True) computes every share in one pass instead of
# re-filtering the whole Series once per status. Rows come back ordered by
# descending share, and missing values are excluded from both the counts and
# the total (an equality test against NaN would otherwise always report 0.0%).
status_shares = ls_df.value_counts(normalize=True)
for value, share in status_shares.items():
    print(f"{value}: {share:.1%}")

Fully Paid: 48.4%
Current: 39.5%
Charged Off: 12.1%
