In [2]:
import os
import pandas as pd
import numpy as np
import logging
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from concurrent.futures import ThreadPoolExecutor, as_completed
import cupy as cp  # For GPU-based computations (optional)

# Module-wide logging: timestamped messages at INFO level and above.
logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

# On-disk cache locations used by create_features (labels first, then features).
LABELS_FILE = "labels.npy"
FEATURES_FILE = "features.csv"

# Function to construct features for a single transaction
def process_transaction(transaction, providers, rate_dict):
    """Build feature rows and binary labels for one transaction.

    A provider is eligible when its CURRENCY equals the transaction's ``cur``,
    its LIMIT_MAX is at least the transaction amount, and the amount lies
    within [MIN_SUM, MAX_SUM]. One feature dict and one label are produced per
    eligible provider.

    Args:
        transaction: Row-like object (e.g. a Series from ``iterrows``) exposing
            ``'amount'`` and ``'cur'``; its ``.name`` is stored as
            ``transaction_id``.
        providers: DataFrame with columns ID, CURRENCY, CONVERSION, AVG_TIME,
            COMMISSION, MIN_SUM, MAX_SUM, LIMIT_MIN and LIMIT_MAX.
        rate_dict: Mapping from currency code to a conversion rate; a rate of
            1 is used for currencies that are not present.

    Returns:
        ``(features, labels)``: two parallel lists. Both are empty when no
        provider is eligible. A label is 1 when CONVERSION > 0.5,
        AVG_TIME < 19, COMMISSION < 0.04 and the amount fits in LIMIT_MAX,
        otherwise 0.
    """
    amount = transaction['amount']

    # Eligibility mask, one named term per business rule.
    same_currency = providers['CURRENCY'] == transaction['cur']
    fits_limit = providers['LIMIT_MAX'] >= amount
    above_min_sum = providers['MIN_SUM'] <= amount
    below_max_sum = amount <= providers['MAX_SUM']
    candidates = providers[same_currency & fits_limit & above_min_sum & below_max_sum]

    rows, targets = [], []
    if candidates.empty:
        return rows, targets

    # These values depend only on the transaction, so compute them once.
    amount_converted = amount * rate_dict.get(transaction['cur'], 1)
    transaction_id = transaction.name

    for _, candidate in candidates.iterrows():
        limit_min = candidate['LIMIT_MIN']
        if limit_min > 0:
            # 1% of the provider's LIMIT_MIN, scaled by its currency rate.
            penalty = max(0, 0.01 * limit_min * rate_dict.get(candidate['CURRENCY'], 1))
        else:
            penalty = 0

        rows.append({
            'conversion': candidate['CONVERSION'],
            'avg_time': candidate['AVG_TIME'],
            'commission': candidate['COMMISSION'] * amount,
            'penalty': penalty,
            'amount_in_usd': amount_converted,
            'limits_ratio': amount / candidate['LIMIT_MAX'],
            'provider_id': candidate['ID'],
            'transaction_id': transaction_id
        })

        meets_thresholds = (
            candidate['CONVERSION'] > 0.5
            and candidate['AVG_TIME'] < 19
            and candidate['COMMISSION'] < 0.04
            and amount <= candidate['LIMIT_MAX']
        )
        targets.append(1 if meets_thresholds else 0)

    return rows, targets

# Parallelized Feature Engineering
def create_features(providers, transactions, rate_dict):
    """Build (or load from cache) the feature table and label vector.

    If both FEATURES_FILE and LABELS_FILE exist they are loaded and returned
    as-is; the arguments are not inspected in that case, so delete the files
    to force a rebuild after the inputs change.

    Otherwise ``process_transaction`` is run for every row of ``transactions``
    on a thread pool. Results are gathered in submission order, so the output
    rows always follow the order of ``transactions`` regardless of which
    worker finishes first. The result is written to FEATURES_FILE (CSV) and
    LABELS_FILE (NumPy ``.npy``) before being returned.

    Args:
        providers: Provider DataFrame passed through to ``process_transaction``.
        transactions: DataFrame whose rows are processed one by one.
        rate_dict: Currency-to-rate mapping passed through to
            ``process_transaction``.

    Returns:
        ``(features_df, labels)``: a DataFrame with one row per
        transaction/provider pairing and a NumPy array of matching labels.
    """
    if os.path.exists(FEATURES_FILE) and os.path.exists(LABELS_FILE):
        logger.info("Features and labels found. Loading from disk...")
        features = pd.read_csv(FEATURES_FILE)
        labels = np.load(LABELS_FILE)
        return features, labels

    features = []
    labels = []
    logger.info("Starting parallelized feature construction...")

    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(process_transaction, transaction, providers, rate_dict)
            for _, transaction in transactions.iterrows()
        ]

        # Iterate in submission order rather than completion order: completion
        # order depends on thread scheduling, which would make the row order
        # (and any seeded split computed from it) differ between runs.
        for future in futures:
            transaction_features, transaction_labels = future.result()
            features.extend(transaction_features)
            labels.extend(transaction_labels)

    logger.info("Feature construction completed.")

    # Save features and labels locally
    features_df = pd.DataFrame(features)
    labels_array = np.array(labels)
    features_df.to_csv(FEATURES_FILE, index=False)
    np.save(LABELS_FILE, labels_array)
    logger.info(f"Features saved to {FEATURES_FILE} and labels saved to {LABELS_FILE}.")

    return features_df, labels_array

# Adjusted greedy_provider_selection
def greedy_provider_selection(transactions, providers, weight_factors):
    """Greedily assign each transaction to its best-scoring eligible provider.

    Transactions are handled in row order. A provider is eligible when its
    CURRENCY equals the transaction's ``cur``, its remaining LIMIT_MAX covers
    the amount, and the amount lies within [MIN_SUM, MAX_SUM]. Each eligible
    provider is scored as::

        w['conversion'] * CONVERSION
        - w['avg_time'] * AVG_TIME
        + w['commission'] / (COMMISSION + 1e-9)
        - w['penalty'] * LIMIT_MIN

    The highest score wins. The transaction amount is then subtracted from the
    LIMIT_MIN and LIMIT_MAX of every row sharing the winner's ID, floored at 0,
    so later transactions see the reduced limits.

    Args:
        transactions: DataFrame with ``payment``, ``amount`` and ``cur`` columns.
        providers: DataFrame with ID, CURRENCY, CONVERSION, AVG_TIME,
            COMMISSION, MIN_SUM, MAX_SUM, LIMIT_MIN and LIMIT_MAX columns.
            It is not modified.
        weight_factors: Mapping with ``conversion``, ``avg_time``,
            ``commission`` and ``penalty`` weights.

    Returns:
        DataFrame with columns ``payment``, ``chain`` (the chosen provider ID
        as a string, or ``''`` when none was eligible) and ``score`` (0 when
        none was eligible). The columns are present even for empty input.
    """
    results = []
    logger.info("Starting greedy provider selection...")

    limit_columns = ['LIMIT_MIN', 'LIMIT_MAX']
    # astype returns a new frame, so the caller's data is never modified. The
    # limit columns are made float up front because fractional amounts are
    # subtracted from them below; writing floats into an integer column in
    # place relies on silent upcasting, which pandas has deprecated.
    providers = providers.astype({column: 'float64' for column in limit_columns})

    for i, transaction in transactions.iterrows():
        amount = transaction['amount']

        # Explicit copy: a score column is added to this filtered frame below,
        # and writing into a bare slice triggers SettingWithCopyWarning.
        valid_providers = providers[
            (providers['LIMIT_MAX'] >= amount) &
            (providers['CURRENCY'] == transaction['cur']) &
            (providers['MIN_SUM'] <= amount) &
            (amount <= providers['MAX_SUM'])
        ].copy()

        if valid_providers.empty:
            logger.warning(f"Transaction {i}: No valid providers found. Skipping...")
            results.append({'payment': transaction['payment'], 'chain': '', 'score': 0})
            continue

        # Compute scores for providers. The 1e-9 guards against division by a
        # zero commission.
        valid_providers['score'] = (
            weight_factors['conversion'] * valid_providers['CONVERSION'] -
            weight_factors['avg_time'] * valid_providers['AVG_TIME'] +
            weight_factors['commission'] * (1 / (valid_providers['COMMISSION'] + 1e-9)) -
            weight_factors['penalty'] * valid_providers['LIMIT_MIN']
        )

        # Sort by score (descending) and pick the provider with the highest score
        best_provider = valid_providers.sort_values(by='score', ascending=False).iloc[0]

        # Consume the winner's limits in one step: subtract the amount and
        # floor at zero so the limits never go negative.
        chosen_rows = providers['ID'] == best_provider['ID']
        providers.loc[chosen_rows, limit_columns] = (
            providers.loc[chosen_rows, limit_columns] - amount
        ).clip(lower=0)

        results.append({
            'payment': transaction['payment'],
            'chain': str(best_provider['ID']),
            'score': best_provider['score']
        })

        logger.info(f"Transaction {i}: Selected Provider {best_provider['ID']} with score {best_provider['score']:.2f}")

    logger.info("Greedy provider selection completed.")
    # Fixed column list keeps the output schema stable when there are no rows.
    return pd.DataFrame(results, columns=['payment', 'chain', 'score'])


# Read the raw inputs: two provider snapshots, two payment batches and the
# exchange-rate table.
providers1, providers2 = (
    pd.read_csv(path) for path in ('providers_1.csv', 'providers_2.csv')
)
transactions1, transactions2 = (
    pd.read_csv(path) for path in ('payments_1.csv', 'payments_2.csv')
)
exchange_rates = pd.read_csv('ex_rates.csv')

# Stack the two halves of each dataset under a fresh 0..n-1 index; the
# LIMIT_BY_CARD column is not used by this notebook and is dropped.
providers_combined = pd.concat([providers1, providers2], ignore_index=True)
providers = providers_combined.drop(columns=['LIMIT_BY_CARD'])
del providers_combined
transactions = pd.concat([transactions1, transactions2], ignore_index=True)

# Lookup table: exchange_rates 'destination' value -> 'rate' value.
rate_dict = {
    destination: rate
    for destination, rate in zip(exchange_rates['destination'], exchange_rates['rate'])
}

# Example weights applied to each scoring term in greedy_provider_selection.
weight_factors = dict(
    conversion=0.5,
    avg_time=0.3,
    commission=0.1,
    penalty=0.1,
)



In [3]:
# Build the feature table and label vector for every transaction/provider
# pairing. create_features returns the cached copies instead when both
# FEATURES_FILE and LABELS_FILE already exist on disk.
logger.info("Starting the pipeline...")
features, labels = create_features(providers, transactions, rate_dict)
logger.info("Feature construction finished.")



2024-12-21 13:10:26,475 - Starting the pipeline...
2024-12-21 13:10:26,476 - Starting parallelized feature construction...


ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
# Train a random-forest classifier on the engineered features.
# The two identifier columns are removed so they are not used as model inputs.
X = features.drop(columns=['provider_id', 'transaction_id'])
y = labels
# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


logger.info("Model training completed.")
# Evaluate on the held-out 20% and print the per-class metrics report.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))



In [None]:
# Greedy algorithm for provider selection: pick one provider per transaction
# using the weighted score, then write the result table to output.csv
# (without the DataFrame index).
result = greedy_provider_selection(transactions, providers, weight_factors)
result.to_csv('output.csv', index=False)
logger.info("Pipeline completed successfully.")