In [5]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# transaction retrieval
class CompoundTransactionRetriever:
    def __init__(self, wallet_file: str = "/content/Wallet id .csv"):
        self.wallet_file = wallet_file
        self.transactions_df = None

# Load wallet addresses from CSV file
    def load_wallet_addresses(self):
        try:
            df = pd.read_csv(self.wallet_file)
# Getting first column and converting to lowercase
            wallets = df.iloc[:, 0].str.lower().tolist()
            print(f"Successfully loaded {len(wallets)} wallet addresses")
            return wallets
        except Exception as e:
            print(f"Error loading wallet file: {e}")
            return []

# Generate realistic transaction data for training model
    def generate_ml_optimized_transactions(self, wallets):
        np.random.seed(42)
        transactions = []
        compound_tokens = ['cUSDC', 'cETH', 'cDAI', 'cUSDT', 'cWBTC']
        actions = ['supply', 'withdraw', 'borrow', 'repay']

        print("Generating realistic transaction data ")

# Creating distinct risk profiles
        for i, wallet in enumerate(wallets):
            risk_profile = np.random.choice(['low_risk', 'medium_risk', 'high_risk'],
                                          p=[0.3, 0.5, 0.2])

# Number of transactions based on risk profile
            if risk_profile == 'high_risk':
                num_txs = np.random.poisson(12) + 5  # 5-25 transactions
                success_rate = 0.65
                amount_multiplier = 2.0
                gas_multiplier = 1.8
            elif risk_profile == 'medium_risk':
                num_txs = np.random.poisson(8) + 3  # 3-18 transactions
                success_rate = 0.82
                amount_multiplier = 1.0
                gas_multiplier = 1.0
            else:  # low_risk
                num_txs = np.random.poisson(6) + 2  # 2-15 transactions
                success_rate = 0.95  # High success rate
                amount_multiplier = 0.7
                gas_multiplier = 0.6

# Generate transaction based on risk profile
            for _ in range(num_txs):
                base_amount = np.random.lognormal(6, 1)  # Log-normal distribution for realistic amounts
                amount_usd = base_amount * amount_multiplier
                amount_usd = np.clip(amount_usd, 50, 50000)

# Gas usage correlated with amount and risk
                gas_used = np.random.randint(50000, 300000)
                gas_price = np.random.uniform(10, 100) * gas_multiplier

# Success based on risk profile
                success = np.random.random() < success_rate

# Random timestamp within last 6 months
                days_back = np.random.randint(1, 180)
                timestamp = datetime.now() - timedelta(days=days_back)

                transaction = {
                    'wallet_address': wallet,
                    'hash': f"0x{''.join(np.random.choice(list('0123456789abcdef'), 64))}",
                    'compound_token': np.random.choice(compound_tokens),
                    'action': np.random.choice(actions),
                    'amount_usd': round(amount_usd, 2),
                    'gas_used': gas_used,
                    'gas_price_gwei': round(gas_price, 2),
                    'success': success,
                    'timestamp': int(timestamp.timestamp()),
                    'datetime': timestamp,
                    'block_number': np.random.randint(16000000, 18500000),
                    'wallet_risk_profile': risk_profile  # Ground truth for ML
                }

                transactions.append(transaction)

# Progress indicator
            if (i + 1) % 20 == 0:
                print(f"Processed {i + 1}/{len(wallets)} wallets...")

# Convert to DataFrame
        df = pd.DataFrame(transactions)

        print(f"Generated {len(df)} transactions with realistic risk patterns")
        print(f"Risk profile distribution:")
        profile_counts = df.groupby('wallet_address')['wallet_risk_profile'].first().value_counts()
        for profile, count in profile_counts.items():
            pct = (count / len(wallets)) * 100
            print(f"  {profile}: {count} wallets ({pct:.1f}%)")

        return df

# Checking generated data quality
    def validate_transaction_data(self, df):
        print("\nValidating transaction data quality")

        validation_results = {
            'total_transactions': len(df),
            'unique_wallets': df['wallet_address'].nunique(),
            'date_range': f"{df['datetime'].min().date()} to {df['datetime'].max().date()}",
            'avg_txs_per_wallet': len(df) / df['wallet_address'].nunique(),
            'success_rate': df['success'].mean(),
            'token_diversity': df['compound_token'].nunique(),
            'action_distribution': df['action'].value_counts().to_dict(),
            'amount_range': f"${df['amount_usd'].min():.2f} - ${df['amount_usd'].max():.2f}",
            'avg_amount': f"${df['amount_usd'].mean():.2f}"
        }

        print("Data Quality Report:")
        print(f"  Total transactions: {validation_results['total_transactions']}")
        print(f"  Unique wallets: {validation_results['unique_wallets']}")
        print(f"  Date range: {validation_results['date_range']}")
        print(f"  Avg transactions per wallet: {validation_results['avg_txs_per_wallet']:.1f}")
        print(f"  Overall success rate: {validation_results['success_rate']:.2%}")
        print(f"  Token diversity: {validation_results['token_diversity']} tokens")
        print(f"  Amount range: {validation_results['amount_range']}")
        print(f"  Average amount: {validation_results['avg_amount']}")

        print("\nAction distribution:")
        for action, count in validation_results['action_distribution'].items():
            pct = (count / validation_results['total_transactions']) * 100
            print(f"  {action}: {count} ({pct:.1f}%)")

        return validation_results

# Saving transaction data
    def save_transaction_data(self, df, filename="compound_transactions_ml.csv"):
        save_df = df.copy()
        save_df['datetime'] = save_df['datetime'].dt.strftime('%Y-%m-%d %H:%M:%S') #datetime to string for csv formatt

        save_df.to_csv(filename, index=False)
        print(f"\nTransaction data saved to: {filename}")

        return filename

    def retrieve_transaction_data(self):
        print(" Transaction Data Retrieval ")
        wallets = self.load_wallet_addresses()

        if not wallets:
            print("No wallets loaded. Please check your Wallet-id.csv file.")
            return None

# Generating ML-optimized transaction data
        self.transactions_df = self.generate_ml_optimized_transactions(wallets)

# Data quality validation
        validation_results = self.validate_transaction_data(self.transactions_df)
        filename = self.save_transaction_data(self.transactions_df)

        print(f"Done")

        return self.transactions_df, validation_results

# Display of data
    def display_sample_data(self, n_samples=10):
        if self.transactions_df is None:
            print("No transaction data available. Run retrieve_transaction_data() first.")
            return

        print(f"\nSample Transaction Data ({n_samples} rows):")
        sample_cols = ['wallet_address', 'compound_token', 'action', 'amount_usd',
                      'success', 'gas_price_gwei', 'wallet_risk_profile']

        sample_df = self.transactions_df[sample_cols].head(n_samples).copy()
        sample_df['wallet_address'] = sample_df['wallet_address'].str[:12] + '...'

        print(sample_df.to_string(index=False))

# Main function
def main():

    retriever = CompoundTransactionRetriever()
    transaction_data, validation_results = retriever.retrieve_transaction_data()
    retriever.display_sample_data()
    return retriever, transaction_data, validation_results

if __name__ == "__main__":
    retriever, transaction_data, validation_results = main()


 Transaction Data Retrieval 
Successfully loaded 103 wallet addresses
Generating realistic transaction data 
Processed 20/103 wallets...
Processed 40/103 wallets...
Processed 60/103 wallets...
Processed 80/103 wallets...
Processed 100/103 wallets...
Generated 1144 transactions with realistic risk patterns
Risk profile distribution:
  medium_risk: 53 wallets (51.5%)
  low_risk: 29 wallets (28.2%)
  high_risk: 21 wallets (20.4%)

Validating transaction data quality
Data Quality Report:
  Total transactions: 1144
  Unique wallets: 103
  Date range: 2025-01-29 to 2025-07-26
  Avg transactions per wallet: 11.1
  Overall success rate: 81.56%
  Token diversity: 5 tokens
  Amount range: $50.00 - $12101.58
  Average amount: $818.87

Action distribution:
  repay: 322 (28.1%)
  supply: 290 (25.3%)
  borrow: 278 (24.3%)
  withdraw: 254 (22.2%)

Transaction data saved to: compound_transactions_ml.csv
Done

Sample Transaction Data (10 rows):
 wallet_address compound_token   action  amount_usd  succe

In [6]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import StandardScaler, LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# Feature Engineering def
class CompoundMLFeatureEngineer:
    def __init__(self, transaction_file: str = "compound_transactions_ml.csv"):
        self.transaction_file = transaction_file
        self.transactions_df = None
        self.features_df = None
        self.label_encoder = LabelEncoder()

# Transaction data retrieval
    def load_transaction_data(self):
        try:
            self.transactions_df = pd.read_csv(self.transaction_file)
            self.transactions_df['datetime'] = pd.to_datetime(self.transactions_df['datetime'])
            print(f"Loaded {len(self.transactions_df)} transactions for feature engineering")
            return True
        except Exception as e:
            print(f"Error loading transaction data: {e}")
            return False

# Calculate comprehensive features for each wallet
    def calculate_wallet_features(self):
        print("Engineering ML features for each wallet ")

        wallet_features = []

        for wallet in self.transactions_df['wallet_address'].unique():
            wallet_data = self.transactions_df[self.transactions_df['wallet_address'] == wallet].copy()

# Sort by datetime for time-series features
            wallet_data = wallet_data.sort_values('datetime')

            total_transactions = len(wallet_data)
            successful_transactions = wallet_data['success'].sum()
            failed_transactions = total_transactions - successful_transactions
            success_rate = successful_transactions / total_transactions

# Volume and amount features
            total_volume = wallet_data['amount_usd'].sum()
            avg_transaction_amount = wallet_data['amount_usd'].mean()
            max_transaction_amount = wallet_data['amount_usd'].max()
            min_transaction_amount = wallet_data['amount_usd'].min()
            amount_std = wallet_data['amount_usd'].std()
            amount_cv = amount_std / avg_transaction_amount if avg_transaction_amount > 0 else 0

# Action-based features
            action_counts = wallet_data['action'].value_counts()
            supply_count = action_counts.get('supply', 0)
            borrow_count = action_counts.get('borrow', 0)
            withdraw_count = action_counts.get('withdraw', 0)
            repay_count = action_counts.get('repay', 0)

# Volume by action
            supply_volume = wallet_data[wallet_data['action'] == 'supply']['amount_usd'].sum()
            borrow_volume = wallet_data[wallet_data['action'] == 'borrow']['amount_usd'].sum()
            withdraw_volume = wallet_data[wallet_data['action'] == 'withdraw']['amount_usd'].sum()
            repay_volume = wallet_data[wallet_data['action'] == 'repay']['amount_usd'].sum()

# Risk ratios - K feature
            borrow_to_supply_ratio = borrow_volume / supply_volume if supply_volume > 0 else 0
            repay_to_borrow_ratio = repay_volume / borrow_volume if borrow_volume > 0 else 1
            withdraw_to_supply_ratio = withdraw_volume / supply_volume if supply_volume > 0 else 0

# Gas usage patterns
            avg_gas_price = wallet_data['gas_price_gwei'].mean()
            max_gas_price = wallet_data['gas_price_gwei'].max()
            gas_price_std = wallet_data['gas_price_gwei'].std()
            high_gas_transactions = len(wallet_data[wallet_data['gas_price_gwei'] > wallet_data['gas_price_gwei'].quantile(0.75)])

# Token diversity and behavior
            unique_tokens = wallet_data['compound_token'].nunique()
            most_used_token = wallet_data['compound_token'].mode().iloc[0]
            token_concentration = (wallet_data['compound_token'] == most_used_token).sum() / total_transactions

# Time-based features
            first_transaction = wallet_data['datetime'].min()
            last_transaction = wallet_data['datetime'].max()
            activity_span_days = (last_transaction - first_transaction).days + 1
            transactions_per_day = total_transactions / activity_span_days if activity_span_days > 0 else 0

# Recent activity (last 30 days)
            recent_cutoff = datetime.now() - pd.Timedelta(days=30)
            recent_transactions = len(wallet_data[wallet_data['datetime'] >= recent_cutoff])
            recent_activity_ratio = recent_transactions / total_transactions

# Behavioral pattern features
            large_transaction_threshold = avg_transaction_amount * 2
            large_transaction_count = len(wallet_data[wallet_data['amount_usd'] > large_transaction_threshold])
            large_transaction_ratio = large_transaction_count / total_transactions

# Failed transaction patterns
            failed_transaction_ratio = failed_transactions / total_transactions
            consecutive_failures = self.calculate_consecutive_failures(wallet_data)

# Advanced features
            transaction_frequency_std = self.calculate_frequency_variance(wallet_data)
            amount_trend = self.calculate_amount_trend(wallet_data)
            success_rate_trend = self.calculate_success_trend(wallet_data)
            risk_profile = wallet_data['wallet_risk_profile'].iloc[0]

# Compile all features
            features = {
                'wallet_address': wallet,
                'total_transactions': total_transactions,
                'success_rate': success_rate,
                'failed_transaction_ratio': failed_transaction_ratio,
                'total_volume_usd': total_volume,
                'avg_transaction_amount': avg_transaction_amount,
                'max_transaction_amount': max_transaction_amount,
                'min_transaction_amount': min_transaction_amount,
                'amount_std': amount_std,
                'amount_cv': amount_cv,
                'supply_count': supply_count,
                'borrow_count': borrow_count,
                'withdraw_count': withdraw_count,
                'repay_count': repay_count,
                'supply_volume': supply_volume,
                'borrow_volume': borrow_volume,
                'withdraw_volume': withdraw_volume,
                'repay_volume': repay_volume,
                'borrow_to_supply_ratio': borrow_to_supply_ratio,
                'repay_to_borrow_ratio': repay_to_borrow_ratio,
                'withdraw_to_supply_ratio': withdraw_to_supply_ratio,
                'avg_gas_price': avg_gas_price,
                'max_gas_price': max_gas_price,
                'gas_price_std': gas_price_std,
                'high_gas_transactions': high_gas_transactions,
                'unique_tokens': unique_tokens,
                'token_concentration': token_concentration,
                'activity_span_days': activity_span_days,
                'transactions_per_day': transactions_per_day,
                'recent_activity_ratio': recent_activity_ratio,
                'large_transaction_ratio': large_transaction_ratio,
                'consecutive_failures': consecutive_failures,
                'transaction_frequency_std': transaction_frequency_std,
                'amount_trend': amount_trend,
                'success_rate_trend': success_rate_trend,
                'risk_profile': risk_profile,  # Ground truth for ML
                'most_used_token': most_used_token
            }

            wallet_features.append(features)

        return pd.DataFrame(wallet_features)

# Calculating maximum consecutive failed transa
    def calculate_consecutive_failures(self, wallet_data):
        failures = (~wallet_data['success']).astype(int)
        max_consecutive = 0
        current_consecutive = 0

        for failure in failures:
            if failure == 1:
                current_consecutive += 1
                max_consecutive = max(max_consecutive, current_consecutive)
            else:
                current_consecutive = 0

        return max_consecutive

    def calculate_frequency_variance(self, wallet_data):
        if len(wallet_data) < 2:
            return 0

        wallet_data = wallet_data.sort_values('datetime')
        time_diffs = wallet_data['datetime'].diff().dt.total_seconds() / 3600  # Hours
        time_diffs = time_diffs.dropna()

        return time_diffs.std() if len(time_diffs) > 0 else 0

# Calculate trend in transaction amounts over time
    def calculate_amount_trend(self, wallet_data):
        if len(wallet_data) < 2:
            return 0

        wallet_data = wallet_data.sort_values('datetime').reset_index(drop=True)
        x = np.arange(len(wallet_data))
        y = wallet_data['amount_usd'].values

# Simple linear regression slope
        if len(x) > 1:
            slope = np.polyfit(x, y, 1)[0]
            return slope
        return 0

# Trend in success rate/time
    def calculate_success_trend(self, wallet_data):
        if len(wallet_data) < 3:
            return 0
        wallet_data = wallet_data.sort_values('datetime').reset_index(drop=True)

# Rolling success rate
        window_size = min(5, len(wallet_data) // 2)
        if window_size < 2:
            return 0

        rolling_success = wallet_data['success'].rolling(window=window_size).mean()
        rolling_success = rolling_success.dropna()

        if len(rolling_success) < 2:
            return 0

# Calculate trend
        x = np.arange(len(rolling_success))
        y = rolling_success.values
        slope = np.polyfit(x, y, 1)[0]

        return slope

# Encoding features
    def encode_categorical_features(self, features_df):
        features_df['risk_label'] = self.label_encoder.fit_transform(features_df['risk_profile'])
        token_dummies = pd.get_dummies(features_df['most_used_token'], prefix='token') # One-hot encode
        features_df = pd.concat([features_df, token_dummies], axis=1)

        return features_df

# Creating binary risk indicator features
    def create_risk_indicators(self, features_df):
        features_df['high_leverage_risk'] = (features_df['borrow_to_supply_ratio'] > 0.7).astype(int)
        features_df['poor_repayment_risk'] = (features_df['repay_to_borrow_ratio'] < 0.8).astype(int)
        features_df['high_failure_risk'] = (features_df['failed_transaction_ratio'] > 0.2).astype(int)
        features_df['low_activity_risk'] = (features_df['transactions_per_day'] < 0.1).astype(int)
        gas_threshold = features_df['avg_gas_price'].quantile(0.75)
        features_df['high_gas_risk'] = (features_df['avg_gas_price'] > gas_threshold).astype(int)
        features_df['concentration_risk'] = (features_df['token_concentration'] > 0.8).astype(int)
        features_df['large_tx_risk'] = (features_df['large_transaction_ratio'] > 0.3).astype(int)
        features_df['volatility_risk'] = (features_df['amount_cv'] > features_df['amount_cv'].quantile(0.75)).astype(int)

        return features_df

    def prepare_ml_features(self):
        print(" ML Feature Engineering Pipeline \n")

        if not self.load_transaction_data():
            return None

        self.features_df = self.calculate_wallet_features()
        self.features_df = self.encode_categorical_features(self.features_df)
        self.features_df = self.create_risk_indicators(self.features_df)

# Handle missing values
        numeric_columns = self.features_df.select_dtypes(include=[np.number]).columns
        self.features_df[numeric_columns] = self.features_df[numeric_columns].fillna(0)

# Save processed features
        self.features_df.to_csv("ml_wallet_features.csv", index=False)
        print("Saved ML features to: ml_wallet_features.csv")

        return self.features_df

# Display features
    def display_feature_summary(self):
        if self.features_df is None:
            print("No features available. Run prepare_ml_features() first.")
            return

        print(f"\n Feature Engineering Summary ")
        print(f"\nDataset Overview:")
        print(f"Total wallets: {len(self.features_df)}")
        print(f"Total features: {len(self.features_df.columns) - 3}")  # Exclude wallet_address, risk_profile, risk_label

# Risk distribution
        print(f"\nRisk Label Distribution:")
        risk_dist = self.features_df['risk_profile'].value_counts()
        for risk_type, count in risk_dist.items():
            pct = (count / len(self.features_df)) * 100
            print(f"  {risk_type}: {count} wallets ({pct:.1f}%)")

        print(f"\nKey Feature Statistics:")
        key_features = ['success_rate', 'borrow_to_supply_ratio', 'repay_to_borrow_ratio',
                       'failed_transaction_ratio', 'unique_tokens', 'transactions_per_day']

        for feature in key_features:
            if feature in self.features_df.columns:
                mean_val = self.features_df[feature].mean()
                std_val = self.features_df[feature].std()
                print(f"  {feature}: {mean_val:.3f} ± {std_val:.3f}")

# Risk indicators
        print(f"\nRisk Indicator Distribution:")
        risk_indicators = [col for col in self.features_df.columns if col.endswith('_risk')]
        for indicator in risk_indicators:
            count = self.features_df[indicator].sum()
            pct = (count / len(self.features_df)) * 100
            print(f"  {indicator}: {count} wallets ({pct:.1f}%)")

# Sample data
        print(f"\nSample Processed Features:")
        display_cols = ['wallet_address', 'risk_profile', 'success_rate', 'borrow_to_supply_ratio',
                       'high_leverage_risk', 'high_failure_risk', 'total_transactions']

        sample_df = self.features_df[display_cols].head(8).copy()
        sample_df['wallet_address'] = sample_df['wallet_address'].str[:12] + '...'

        print(sample_df.to_string(index=False))

# Main execution
def main():

    engineer = CompoundMLFeatureEngineer()
    features_df = engineer.prepare_ml_features()

    if features_df is not None:
        engineer.display_feature_summary()

        print(f"Output file: ml_wallet_features.csv")

    return engineer, features_df

if __name__ == "__main__":
    engineer, features_df = main()


 ML Feature Engineering Pipeline 

Loaded 1144 transactions for feature engineering
Engineering ML features for each wallet 
Saved ML features to: ml_wallet_features.csv

 Feature Engineering Summary 

Dataset Overview:
Total wallets: 103
Total features: 48

Risk Label Distribution:
  medium_risk: 53 wallets (51.5%)
  low_risk: 29 wallets (28.2%)
  high_risk: 21 wallets (20.4%)

Key Feature Statistics:
  success_rate: 0.841 ± 0.141
  borrow_to_supply_ratio: 1.786 ± 3.980
  repay_to_borrow_ratio: 2.801 ± 4.756
  failed_transaction_ratio: 0.159 ± 0.141
  unique_tokens: 4.369 ± 0.741
  transactions_per_day: 0.078 ± 0.025

Risk Indicator Distribution:
  high_leverage_risk: 52 wallets (50.5%)
  poor_repayment_risk: 37 wallets (35.9%)
  high_failure_risk: 37 wallets (35.9%)
  low_activity_risk: 84 wallets (81.6%)
  high_gas_risk: 26 wallets (25.2%)
  concentration_risk: 0 wallets (0.0%)
  large_tx_risk: 0 wallets (0.0%)
  volatility_risk: 26 wallets (25.2%)

Sample Processed Features:
 walle

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, brier_score_loss
import warnings
warnings.filterwarnings('ignore')

# Logistic regression risk scoring model def
class LogisticRegressionRiskModel:
    def __init__(self, features_file: str = "ml_wallet_features.csv"):
        self.features_file = features_file
        self.features_df = None
        self.X = None
        self.y = None
        self.scaler = StandardScaler()
        self.model = None
        self.calibrated_model = None

# Load features, selecting feature columns, preparing features and target
    def load_and_prepare_data(self):

        self.features_df = pd.read_csv(self.features_file)
        print(f"Loaded {len(self.features_df)} wallets")
        exclude_cols = ['wallet_address', 'risk_profile', 'risk_label', 'most_used_token']
        feature_cols = [col for col in self.features_df.columns if col not in exclude_cols]

        self.X = self.features_df[feature_cols].fillna(0)
        self.y = self.features_df['risk_label']

        print(f"Features: {self.X.shape[1]}, Samples: {self.X.shape[0]}")

        return feature_cols

# Training the model
    def train_logistic_regression(self):
        print("Training Logistic Regression model...")

# Split data
        X_train, X_test, y_train, y_test = train_test_split(
            self.X, self.y, test_size=0.25, random_state=42, stratify=self.y
        )

# Scale features
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

# Using different regularization strengths
        best_score = 0
        best_C = 1.0

        for C in [0.1, 0.5, 1.0, 2.0, 5.0]:
            model = LogisticRegression(
                C=C,
                random_state=42,
                class_weight='balanced',
                max_iter=1000,
                multi_class='multinomial',
                solver='lbfgs'
            )

# Cross-validation
            cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='accuracy')
            mean_score = cv_scores.mean()

            print(f"C={C}: CV Score = {mean_score:.4f}")

            if mean_score > best_score:
                best_score = mean_score
                best_C = C

        self.model = LogisticRegression(
            C=best_C,
            random_state=42,
            class_weight='balanced',
            max_iter=1000,
            multi_class='multinomial',
            solver='lbfgs'
        )

        self.model.fit(X_train_scaled, y_train)

# Test accuracy
        test_accuracy = self.model.score(X_test_scaled, y_test)

        print(f"Best C: {best_C}")
        print(f"Best CV Score: {best_score:.4f}")
        print(f"Test Accuracy: {test_accuracy:.4f}")

        return X_test_scaled, y_test

# Calibrating model probabilities for better score distribution
    def calibrate_probabilities(self, X_test, y_test):
        print("Calibrating probabilities...")

# isotonic calibration
        self.calibrated_model = CalibratedClassifierCV(
            self.model,
            method='isotonic',
            cv=5
        )

# Fit calibration on full scaled dataset
        X_full_scaled = self.scaler.fit_transform(self.X)
        self.calibrated_model.fit(X_full_scaled, self.y)

# Compare calibrated vs uncalibrated probabilities
        uncalibrated_probs = self.model.predict_proba(X_test)
        calibrated_probs = self.calibrated_model.predict_proba(X_test)

# Calculate Brier scores (lower is better)
        uncal_brier = brier_score_loss(y_test == 2, uncalibrated_probs[:, 2])
        cal_brier = brier_score_loss(y_test == 2, calibrated_probs[:, 2])

        print(f"Uncalibrated Brier Score: {uncal_brier:.4f}")
        print(f"Calibrated Brier Score: {cal_brier:.4f}")
        print(f"Calibration Improvement: {((uncal_brier - cal_brier) / uncal_brier * 100):.1f}%")

        return calibrated_probs


    def generate_improved_scores(self):
        print("Generating calibrated risk scores...")

        X_full_scaled = self.scaler.transform(self.X)
        calibrated_probs = self.calibrated_model.predict_proba(X_full_scaled)

        high_risk_probs = calibrated_probs[:, 2]  # P(high_risk)
        percentile_ranks = np.argsort(np.argsort(high_risk_probs)) / len(high_risk_probs)
        risk_scores = np.round(percentile_ranks * 1000).astype(int)
        alternative_scores = np.round(
            high_risk_probs * 600 +
            calibrated_probs[:, 1] * 300 +
            calibrated_probs[:, 0] * 100
        ).astype(int)

        final_scores = risk_scores

# output dataframe
        wallet_scores = pd.DataFrame({
            'wallet_id': self.features_df['wallet_address'],
            'score': final_scores,
            'prob_high_risk': high_risk_probs,
            'actual_risk': self.features_df['risk_profile']
        })

        return wallet_scores

# Logistic regression coefficients
    def analyze_feature_importance(self):
        print("\nLogistic Regression Feature Importance:")
        print("="*50)

        exclude_cols = ['wallet_address', 'risk_profile', 'risk_label', 'most_used_token']
        feature_names = [col for col in self.features_df.columns if col not in exclude_cols]

        high_risk_coefs = self.model.coef_[2]  # Class 2 = high_risk

        feature_importance = pd.DataFrame({
            'feature': feature_names,
            'coefficient': high_risk_coefs,
            'abs_coefficient': np.abs(high_risk_coefs)
        }).sort_values('abs_coefficient', ascending=False)

        print("Top 10 Most Important Features (for High Risk):")
        for i, row in feature_importance.head(10).iterrows():
            direction = "increases" if row['coefficient'] > 0 else "decreases"
            print(f"{row['feature']:30} | {row['coefficient']:8.4f} ({direction} risk)")

        return feature_importance


    def validate_score_distribution(self, wallet_scores):

        print(f"\nScore Distribution Analysis:")
        print("="*40)

        scores = wallet_scores['score']
        print(f"Mean Score: {scores.mean():.1f}")
        print(f"Median Score: {scores.median():.1f}")
        print(f"Standard Deviation: {scores.std():.1f}")
        print(f"Range: {scores.min()} - {scores.max()}")

        ranges = [
            (0, 200, "Low Risk"),
            (201, 400, "Low-Medium Risk"),
            (401, 600, "Medium Risk"),
            (601, 800, "High Risk"),
            (801, 1000, "Very High Risk")
        ]

        print(f"\nScore Range Distribution:")
        for min_score, max_score, label in ranges:
            count = len(scores[(scores >= min_score) & (scores <= max_score)])
            pct = (count / len(scores)) * 100
            print(f"{label:15} ({min_score:3d}-{max_score:3d}): {count:3d} wallets ({pct:5.1f}%)")

        print(f"\nValidation Against Actual Risk Profiles:")
        for risk_profile in ['low_risk', 'medium_risk', 'high_risk']:
            subset = wallet_scores[wallet_scores['actual_risk'] == risk_profile]
            if len(subset) > 0:
                avg_score = subset['score'].mean()
                print(f"{risk_profile:12}: Average Score = {avg_score:.1f}")


    def run_complete_pipeline(self):
        print("=== Logistic Regression Risk Scoring Pipeline ===")

        feature_cols = self.load_and_prepare_data()
        X_test, y_test = self.train_logistic_regression()
        self.calibrate_probabilities(X_test, y_test)
        wallet_scores = self.generate_improved_scores()
        feature_importance = self.analyze_feature_importance()
        self.validate_score_distribution(wallet_scores)
        wallet_scores[['wallet_id', 'score']].to_csv("logistic_wallet_scores.csv", index=False)
        print(f"\nFinal scores saved to: logistic_wallet_scores.csv")

        return wallet_scores, feature_importance

    def display_final_scores(self, wallet_scores, n_display=15):
        print(f"\nFinal Logistic Regression Wallet Scores:")
        print("="*65)
        print(f"{'wallet_id':<45} | {'score'}")
        print("-"*65)
        sorted_scores = wallet_scores.sort_values('score', ascending=False)

        for _, row in sorted_scores.head(n_display).iterrows():
            print(f"{row['wallet_id']:<45} | {row['score']:4d}")

# Main function
def main():

    lr_model = LogisticRegressionRiskModel()
    wallet_scores, feature_importance = lr_model.run_complete_pipeline()
    lr_model.display_final_scores(wallet_scores)

    return lr_model, wallet_scores, feature_importance

if __name__ == "__main__":
    lr_model, wallet_scores, feature_importance = main()


=== Logistic Regression Risk Scoring Pipeline ===
Loaded 103 wallets
Features: 47, Samples: 103
Training Logistic Regression model...
C=0.1: CV Score = 0.8025
C=0.5: CV Score = 0.8675
C=1.0: CV Score = 0.8808
C=2.0: CV Score = 0.8817
C=5.0: CV Score = 0.8950
Best C: 5.0
Best CV Score: 0.8950
Test Accuracy: 0.9615
Calibrating probabilities...
Uncalibrated Brier Score: 0.0463
Calibrated Brier Score: 0.0415
Calibration Improvement: 10.3%
Generating calibrated risk scores...

Logistic Regression Feature Importance:
Top 10 Most Important Features (for High Risk):
unique_tokens                  |  -0.6404 (decreases risk)
high_gas_risk                  |  -0.6180 (decreases risk)
gas_price_std                  |   0.5813 (increases risk)
avg_gas_price                  |   0.5589 (increases risk)
consecutive_failures           |   0.4004 (increases risk)
borrow_count                   |  -0.4001 (decreases risk)
token_cDAI                     |   0.3916 (increases risk)
max_gas_price         