In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

# Load and preprocess the transaction data
def preprocess_data(transactions):
    """Normalise raw transaction columns in place and tag each row with a type.

    The ``timestamp`` column is replaced by ``datetime`` objects built from
    integer epoch values (``datetime.fromtimestamp`` without a timezone, so
    the result is expressed in the machine's local time), ``amount`` and
    ``amountUSD`` are coerced with ``pd.to_numeric``, and a ``type`` column
    is added by running ``classify_transaction`` over every row.

    Args:
        transactions: DataFrame with ``timestamp``, ``amount`` and
            ``amountUSD`` columns plus whatever ``classify_transaction``
            reads from a row.

    Returns:
        The same DataFrame object that was passed in, after modification.
    """
    def _epoch_to_datetime(raw_value):
        # int() accepts both numeric and numeric-string epoch values
        return datetime.fromtimestamp(int(raw_value))

    # Timestamps: epoch value -> datetime
    transactions['timestamp'] = transactions['timestamp'].apply(_epoch_to_datetime)

    # Amounts: make sure both columns are numeric
    transactions['amount'] = pd.to_numeric(transactions['amount'])
    transactions['amountUSD'] = pd.to_numeric(transactions['amountUSD'])

    # Row-wise transaction type label
    row_types = transactions.apply(classify_transaction, axis=1)
    transactions['type'] = row_types

    return transactions

def classify_transaction(row):
    """Map a transaction row to a coarse action label based on its ``id``.

    The ``id`` value is searched for the substrings ``deposits``,
    ``withdraws``, ``borrows``, ``repays`` and ``liquidates`` in that order;
    the first one found decides the label.

    Args:
        row: Mapping-like object (for example a DataFrame row) that supports
            ``row['id']``.

    Returns:
        One of ``'deposit'``, ``'withdraw'``, ``'borrow'``, ``'repay'``,
        ``'liquidate'``, or ``'other'`` when no marker is found.
    """
    # Order matters: an earlier marker wins when an id contains several.
    marker_to_label = (
        ('deposits', 'deposit'),
        ('withdraws', 'withdraw'),
        ('borrows', 'borrow'),
        ('repays', 'repay'),
        ('liquidates', 'liquidate'),
    )

    tx_id = row['id']
    for marker, label in marker_to_label:
        if marker in tx_id:
            return label
    return 'other'

In [2]:
def calculate_features(wallet_transactions):
    """Derive credit-scoring features from one wallet's transactions.

    Args:
        wallet_transactions: DataFrame holding the rows of a single wallet
            with columns ``type`` (labels such as ``'deposit'``, ``'borrow'``,
            ``'repay'``, ``'liquidate'``), ``timestamp`` (datetime-like),
            ``amount`` and ``amountUSD`` (numeric).

    Returns:
        dict mapping feature name to a numeric value.

        Always present: ``liquidation_count``, ``repay_count``,
        ``liquidation_ratio``, ``current_borrowed``, ``current_collateral``,
        ``health_ratio``, ``tx_count``, ``tx_volume``, ``credit_debit_ratio``.

        Present only when the data allows it: ``time_weighted_risk`` (at
        least one liquidation), ``avg_utilization`` (at least one borrow and
        one deposit), ``avg_time_btwn_loans`` and ``loan_frequency`` (at
        least two borrows).
    """
    eps = 1e-6                   # keeps every ratio denominator away from zero
    recency_midpoint_days = 30   # age at which a liquidation's weight is 0.5

    features = {}

    tx_type = wallet_transactions['type']
    liquidations = wallet_transactions[tx_type == 'liquidate']
    repays = wallet_transactions[tx_type == 'repay']
    borrows = wallet_transactions[tx_type == 'borrow']
    deposits = wallet_transactions[tx_type == 'deposit']

    # Historical credit risk
    features['liquidation_count'] = len(liquidations)
    features['repay_count'] = len(repays)
    features['liquidation_ratio'] = (
        features['liquidation_count'] /
        (features['liquidation_count'] + features['repay_count'] + eps))

    # Time-weighted liquidation risk: each liquidation contributes a weight
    # in (0, 1) that is close to 1 when it happened near the wallet's latest
    # transaction and decays towards 0 as it gets older than the midpoint.
    if not liquidations.empty:
        current_time = wallet_transactions['timestamp'].max()
        age_days = np.array(
            [(current_time - t).days for t in liquidations['timestamp']],
            dtype=float)
        # Cap the exponent so very old liquidations do not overflow np.exp.
        exponent = np.minimum(age_days - recency_midpoint_days, 700.0)
        recency_weights = 1.0 / (1.0 + np.exp(exponent))
        features['time_weighted_risk'] = np.sum(recency_weights)

    # Current risk profile. These are gross totals over every borrow and
    # deposit row; repays and withdrawals are not netted out.
    features['current_borrowed'] = borrows['amountUSD'].sum()
    features['current_collateral'] = deposits['amountUSD'].sum()
    features['health_ratio'] = (features['current_borrowed'] /
                                (features['current_collateral'] + eps))

    # Credit utilization (simplified): mean borrow size over mean deposit size
    if not borrows.empty and not deposits.empty:
        features['avg_utilization'] = (borrows['amountUSD'].mean() /
                                       (deposits['amountUSD'].mean() + eps))

    # Transaction behavior
    features['tx_count'] = len(wallet_transactions)
    features['tx_volume'] = wallet_transactions['amountUSD'].sum()
    features['credit_debit_ratio'] = (
        len(wallet_transactions[wallet_transactions['amount'] > 0]) /
        (len(wallet_transactions[wallet_transactions['amount'] < 0]) + eps))

    # New credit behavior
    if len(borrows) > 1:
        time_diffs = borrows['timestamp'].sort_values().diff().dt.days.dropna()
        avg_gap_days = time_diffs.mean()
        features['avg_time_btwn_loans'] = avg_gap_days
        # Whole-day gaps are 0 when all borrows fall within the same day;
        # floor the interval so the frequency stays finite.
        features['loan_frequency'] = 1 / max(avg_gap_days, eps)

    return features

In [3]:
class CreditScorer:
    """Weighted rule-based scorer that turns wallet features into a 0-100 score.

    Five subscores are computed from a feature dict (as produced by
    ``calculate_features``) and combined with ``self.weights``. Risk-type
    subscores (``historical_risk``, ``current_risk``, ``new_credit``) are
    inverted before weighting, so a higher final score means lower risk.

    Attributes:
        weights: dict of subscore name -> weight used by ``calculate_score``.
        thresholds: dict of reference levels. ``calculate_subscores`` reads
            only ``avg_utilization`` and ``loan_frequency``; the remaining
            entries are kept for reference.
    """

    # Initialize weights based on research paper
    DEFAULT_WEIGHTS = {
        'historical_risk': 0.35,
        'current_risk': 0.25,
        'credit_utilization': 0.15,
        'transaction_behavior': 0.15,
        'new_credit': 0.10
    }

    # Initialize thresholds based on empirical data
    DEFAULT_THRESHOLDS = {
        'liquidation_ratio': 0.2,  # >20% is risky
        'health_ratio': 0.8,       # >80% is risky
        'avg_utilization': 0.7,    # >70% is risky
        'tx_count': 10,            # Minimum for active wallet
        'loan_frequency': 1/7      # More than 1 loan/week is risky
    }

    def __init__(self, weights=None, thresholds=None):
        """Create a scorer.

        Args:
            weights: optional dict overriding individual entries of
                ``DEFAULT_WEIGHTS``; omitted keys keep their defaults.
            thresholds: optional dict overriding individual entries of
                ``DEFAULT_THRESHOLDS``; omitted keys keep their defaults.
        """
        # Copy so that per-instance tweaks never leak into the class defaults.
        self.weights = {**self.DEFAULT_WEIGHTS, **(weights or {})}
        self.thresholds = {**self.DEFAULT_THRESHOLDS, **(thresholds or {})}

    @staticmethod
    def _clamp01(value):
        """Limit ``value`` to the closed interval [0, 1] (NaN becomes 0)."""
        return min(1.0, max(0.0, value))

    def calculate_subscores(self, features):
        """Compute the five subscores, each within [0, 1].

        Args:
            features: dict of feature name -> number. Missing features are
                treated as 0.

        Returns:
            dict with keys ``historical_risk``, ``current_risk``,
            ``credit_utilization``, ``transaction_behavior`` and
            ``new_credit``.
        """
        subscores = {}

        # Historical risk subscore (lower is better)
        subscores['historical_risk'] = min(
            1, features.get('liquidation_ratio', 0) +
            0.5 * features.get('time_weighted_risk', 0))

        # Current risk subscore (lower is better)
        subscores['current_risk'] = min(1, features.get('health_ratio', 0))

        # Credit utilization subscore (higher is better)
        utilization = features.get('avg_utilization', 0)
        subscores['credit_utilization'] = max(
            0, 1 - (utilization / self.thresholds['avg_utilization']))

        # Transaction behavior subscore (higher is better).
        # Each component is clamped to [0, 1] before averaging; without the
        # clamp the log terms exceed 1 for busy or high-volume wallets and
        # the subscore would carry more than its configured weight.
        # Negative inputs are floored at 0 so log1p never sees a value < 0.
        count_part = self._clamp01(
            np.log1p(max(0, features.get('tx_count', 0))) / 5)
        volume_part = self._clamp01(
            np.log1p(max(0, features.get('tx_volume', 0))) / 15)
        ratio_part = self._clamp01(features.get('credit_debit_ratio', 0))
        subscores['transaction_behavior'] = (
            count_part + volume_part + ratio_part) / 3

        # New credit subscore (lower is better)
        freq = features.get('loan_frequency', 0)
        subscores['new_credit'] = min(1, freq / self.thresholds['loan_frequency'])

        return subscores

    def calculate_score(self, features):
        """Combine the subscores into a final score on a 0-100 scale.

        Args:
            features: dict of feature name -> number.

        Returns:
            dict with ``final_score`` (clamped to [0, 100]), ``subscores``
            (the dict from ``calculate_subscores``) and ``features`` (the
            input dict, passed through unchanged).
        """
        subscores = self.calculate_subscores(features)

        # Calculate weighted score (0-1 scale); risk subscores are inverted
        weighted_score = (
            self.weights['historical_risk'] * (1 - subscores['historical_risk']) +
            self.weights['current_risk'] * (1 - subscores['current_risk']) +
            self.weights['credit_utilization'] * subscores['credit_utilization'] +
            self.weights['transaction_behavior'] * subscores['transaction_behavior'] +
            self.weights['new_credit'] * (1 - subscores['new_credit'])
        )

        # Convert to 0-100 scale
        final_score = min(100, max(0, weighted_score * 100))

        return {
            'final_score': final_score,
            'subscores': subscores,
            'features': features
        }

In [None]:
# Read the raw transaction export and run it through preprocessing
DATA_PATH = '/content/compoundV2_transactions_ethereum_chunk.json'
transactions = pd.read_json(DATA_PATH)
processed_data = preprocess_data(transactions)

# One group of rows per distinct 'account.id' value
wallet_groups = processed_data.groupby('account.id')

# Scorer with its default weights and thresholds
scorer = CreditScorer()

# Build {wallet_id: score_result}, one wallet at a time
wallet_scores = {}
for wallet_id, wallet_txs in wallet_groups:
    features = calculate_features(wallet_txs)
    wallet_scores[wallet_id] = score_result = scorer.calculate_score(features)

# One row per wallet; columns are the keys of each score_result dict
scores_df = pd.DataFrame.from_dict(wallet_scores, orient='index')