In [5]:
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
from joblib import load

# Logging setup: INFO-level records, each prefixed with its timestamp.
logging.basicConfig(
    format="%(asctime)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Default artifact paths used by this script.
MODEL_FILE = "trained_model.joblib"  # pre-trained model, read with joblib.load
RESULTS_FILE = "results.csv"         # per-payment routing results written by main()
FEATURES_FILE = "features_test.csv"  # cached feature table
LABELS_FILE = "labels_test.npy"      # cached labels matching the feature table

# Helper function: Safe datetime parsing
def safe_parse_datetime(date_str: str) -> Optional[datetime]:
    """Parse an ISO-format string into a ``datetime`` without raising.

    Args:
        date_str: Text in a format accepted by ``datetime.fromisoformat``.

    Returns:
        The parsed ``datetime``, or ``None`` when ``date_str`` is malformed or
        is not a string at all (for example ``None`` or the float NaN that
        pandas uses for a missing cell). Every failure is logged.
    """
    try:
        return datetime.fromisoformat(date_str)
    except (TypeError, ValueError):
        # fromisoformat raises ValueError for malformed text and TypeError for
        # non-string input; both mean "no usable timestamp" for the caller.
        logger.error(f"Invalid datetime format: {date_str}")
        return None

# Preprocess providers with logic for dynamic LIMIT_MIN and LIMIT_MAX updates
def preprocess_providers(providers: pd.DataFrame) -> pd.DataFrame:
    """Give every provider row the limits of that provider's earliest row of the day.

    Rows are sorted by ``ID`` and ``TIME`` and grouped by ``(ID, calendar date
    of TIME)``. Within each group, ``LIMIT_MIN`` and ``LIMIT_MAX`` are
    overwritten with the values from the group's first (earliest) row.

    The input frame is not modified; a new frame is returned.

    Args:
        providers: Provider data with at least the columns ``ID``, ``TIME``,
            ``LIMIT_MIN`` and ``LIMIT_MAX``. ``TIME`` must be convertible by
            ``pd.to_datetime``.

    Returns:
        A new DataFrame ordered by group, with ``TIME`` converted to datetime
        and a fresh 0..n-1 index. An input that yields no groups (for example
        an empty frame) produces an empty frame with the same columns.

    Raises:
        KeyError: If ``LIMIT_MIN`` or ``LIMIT_MAX`` is missing.
    """
    if 'LIMIT_MIN' not in providers.columns or 'LIMIT_MAX' not in providers.columns:
        logger.error("LIMIT_MIN or LIMIT_MAX column is missing in providers file.")
        raise KeyError("Missing required columns in providers data.")

    # Work on a copy so the caller's DataFrame keeps its original TIME column.
    providers = providers.copy()
    providers["TIME"] = pd.to_datetime(providers["TIME"])
    providers = providers.sort_values(by=["ID", "TIME"], ascending=[True, True])

    updated_groups = []
    for _, group in providers.groupby(["ID", providers["TIME"].dt.date]):
        earliest_row = group.iloc[0]
        # assign() returns a new frame, avoiding writes into a groupby slice.
        updated_groups.append(
            group.assign(
                LIMIT_MIN=earliest_row["LIMIT_MIN"],
                LIMIT_MAX=earliest_row["LIMIT_MAX"],
            )
        )

    if updated_groups:
        updated_providers = pd.concat(updated_groups)
    else:
        # pd.concat([]) raises ValueError; keep the schema and return no rows.
        updated_providers = providers.iloc[0:0]

    logger.info("Providers processed with dynamic LIMIT_MIN and LIMIT_MAX updates.")
    return updated_providers.reset_index(drop=True)

def create_features(
    providers: pd.DataFrame,
    transactions: pd.DataFrame,
    rate_dict: Dict[str, float],
    features_file: str,
    labels_file: str,
) -> Tuple[pd.DataFrame, pd.Series]:
    """Load cached features and labels, or rebuild them from the transactions.

    The user is asked on stdin whether to regenerate. Answering ``0`` loads
    ``features_file`` (CSV) and ``labels_file`` (NumPy array). Answering ``1``,
    giving any unrecognized answer, or having a cache file missing rebuilds the
    features by running ``process_transaction`` on every transaction row, then
    overwrites both files.

    Args:
        providers: Provider table passed through to ``process_transaction``.
        transactions: One row per transaction.
        rate_dict: Currency-to-rate mapping passed to ``process_transaction``.
        features_file: Path of the cached feature CSV.
        labels_file: Path of the cached labels array.

    Returns:
        ``(features_df, labels)`` with one feature row and one label per
        eligible (transaction, provider) pair. Regenerated rows follow the
        order of ``transactions``.
    """
    regenerate = input("Do you want to regenerate features? (1 for yes, 0 for no): ").strip()
    if regenerate not in ("0", "1"):
        # Previously any other answer silently produced empty features.
        logger.warning("Unrecognized answer %r; regenerating features.", regenerate)
        regenerate = "1"

    if regenerate == "0":
        try:
            features_df = pd.read_csv(features_file)
            # NOTE(security): allow_pickle=True lets np.load unpickle arbitrary
            # objects; only point labels_file at files this script produced.
            labels = np.load(labels_file, allow_pickle=True)
        except FileNotFoundError:
            logger.error("Features or labels file not found. Regenerating features.")
        else:
            return features_df, pd.Series(labels)

    all_features = []
    all_labels = []
    with ThreadPoolExecutor() as executor:
        # Executor.map yields results in submission order, so the saved files
        # are reproducible from run to run (as_completed order is not).
        per_transaction = executor.map(
            lambda row: process_transaction(row, providers, rate_dict),
            (row for _, row in transactions.iterrows()),
        )
        for features_and_labels in per_transaction:
            for feature, label in features_and_labels:
                all_features.append(feature)
                all_labels.append(label)

    # Save newly generated features and labels
    features_df = pd.DataFrame(all_features)
    features_df.to_csv(features_file, index=False)
    np.save(labels_file, all_labels)
    logger.info(f"Features saved to {features_file}")
    logger.info(f"Labels saved to {labels_file}")

    return features_df, pd.Series(all_labels)


# Function to process a single transaction
def process_transaction(
    transaction: pd.Series,
    providers: pd.DataFrame,
    rate_dict: Dict[str, float],
) -> List[Tuple[Dict[str, Any], int]]:
    """Build one (feature dict, label) pair per provider eligible for a transaction.

    A provider row is eligible when its ``CURRENCY`` equals the transaction's
    ``cur`` and the transaction ``amount`` is at most ``LIMIT_MAX``, at most
    ``MAX_SUM`` and at least ``MIN_SUM``.

    The label is 1 when at least three of these five checks hold, else 0:
    ``CONVERSION > 0.5``, ``AVG_TIME < 20``, ``COMMISSION <= 0.04``,
    ``penalty < 600`` and ``amount_in_usd <= 20``.

    Args:
        transaction: Row with ``eventTimeRes``, ``amount`` and ``cur``. Its
            ``name`` (the row index label) is stored as ``transaction_id``.
        providers: Provider table with the columns referenced above plus
            ``ID`` and ``LIMIT_MIN``.
        rate_dict: Maps a currency code to the multiplier applied to
            ``amount``. Currencies missing from the mapping use 1.

    Returns:
        A list of ``(feature, success)`` tuples. The list is empty when the
        timestamp cannot be parsed or no provider is eligible.
    """
    time_of_transaction = safe_parse_datetime(transaction["eventTimeRes"])
    if time_of_transaction is None or providers.empty:
        return []

    amount = transaction["amount"]
    currency = transaction["cur"]
    transaction_amount_in_usd = amount * rate_dict.get(currency, 1)

    # Filter eligible providers in one vectorized pass so the Python loop
    # below only visits rows that will produce a feature.
    eligible = providers[
        (providers["CURRENCY"] == currency)
        & (providers["LIMIT_MAX"] >= amount)
        & (providers["MAX_SUM"] >= amount)
        & (providers["MIN_SUM"] <= amount)
    ]

    features = []
    for _, provider in eligible.iterrows():
        penalty = max(0, (provider["LIMIT_MIN"] * 0.01))
        # Booleans add up as 0/1, so this counts the satisfied checks.
        success_score = ((provider["CONVERSION"] > 0.5)
                         + (provider["AVG_TIME"] < 20)
                         + (provider["COMMISSION"] <= 0.04)
                         + (penalty < 600)
                         + (transaction_amount_in_usd <= 20))
        success = 1 if success_score >= 3 else 0

        limit_max = provider["LIMIT_MAX"]
        # A zero limit can only be eligible for a non-positive amount; record
        # the ratio as NaN rather than dividing by zero.
        limits_ratio = amount / limit_max if limit_max != 0 else float("nan")

        feature = {
            "conversion": provider["CONVERSION"],
            "avg_time": provider["AVG_TIME"],
            "commission": provider["COMMISSION"] * amount,
            "penalty": penalty,
            "amount_in_usd": transaction_amount_in_usd,
            "limits_ratio": limits_ratio,
            "provider_id": provider["ID"],
            "transaction_id": transaction.name,
        }
        features.append((feature, success))
    return features

# Updated function to predict flow and create chains
def predict_flow(model, transaction, providers, rates, features_df) -> pd.DataFrame:
    """Rank a transaction's candidate providers and try them in order.

    Candidates are the rows of ``features_df`` whose ``transaction_id`` equals
    ``transaction.name``. They are ordered by the model's probability for
    class index 1, highest first; ties keep their original order. Providers
    are tried in that order until one whose ``LIMIT_MAX`` covers the amount is
    found.

    Side effects on ``providers`` (modified in place), using the first row
    whose ``ID`` matches:
        * every tried provider has ``AVG_TIME`` seconds added to ``TIME``;
        * the capturing provider has the amount subtracted from both
          ``LIMIT_MAX`` and ``LIMIT_MIN``.

    Args:
        model: Object exposing ``predict_proba`` that returns one row per
            input row, with the probability used for ranking in column 1.
        transaction: Row with ``payment`` and ``amount``.
        providers: Provider table with ``ID``, ``TIME``, ``AVG_TIME``,
            ``LIMIT_MIN`` and ``LIMIT_MAX``. ``TIME`` must be datetime-typed.
        rates: Unused; kept for interface compatibility.
        features_df: Feature table as produced by ``create_features``.

    Returns:
        A one-row DataFrame with ``payment``, ``chain`` (tried provider IDs
        joined by ``-``, or ``"Z"`` when there are no candidates) and
        ``status`` (``"Captured"`` or ``"Failed"``). If an unexpected error
        occurs it is logged with its traceback and an empty DataFrame is
        returned, so that payment is absent from the results.
    """
    try:
        transaction_features = features_df[features_df["transaction_id"] == transaction.name].copy()
        if transaction_features.empty:
            return pd.DataFrame([{"payment": transaction["payment"], "chain": "Z", "status": "Failed"}])

        # Calculate probabilities and sort providers
        transaction_features["probability"] = model.predict_proba(
            transaction_features[["conversion", "avg_time", "commission", "penalty", "amount_in_usd"]]
        )[:, 1]
        # mergesort is stable, so equal probabilities give a reproducible chain.
        sorted_features = transaction_features.sort_values(
            "probability", ascending=False, kind="mergesort"
        )

        chain = []
        status = "Failed"
        remaining_amount = transaction["amount"]
        # Iterate the column itself: DataFrame.iterrows() builds a single-dtype
        # Series per row, which turns integer IDs into floats ("12" -> "12.0").
        for provider_id in sorted_features["provider_id"]:
            chain.append(str(provider_id))

            # Check provider limits and update dynamically
            provider = providers.loc[providers["ID"] == provider_id]
            if provider.empty:
                logger.warning(f"No data found for provider ID {provider_id}.")
                continue

            provider_idx = provider.index[0]
            # Update provider's time by adding the average processing time
            avg_time = providers.loc[provider_idx, "AVG_TIME"]
            providers.loc[provider_idx, "TIME"] += pd.Timedelta(seconds=avg_time)
            if providers.loc[provider_idx, "LIMIT_MAX"] >= remaining_amount:
                # Update provider limits dynamically
                providers.loc[provider_idx, "LIMIT_MAX"] -= remaining_amount
                providers.loc[provider_idx, "LIMIT_MIN"] -= remaining_amount
                status = "Captured"
                break

        return pd.DataFrame([{
            "payment": transaction["payment"],
            "chain": "-".join(chain) if chain else "Z",
            "status": status,
        }])

    except Exception as e:
        # Best-effort per transaction: keep the batch running, but record the
        # traceback so the failure can be diagnosed.
        logger.exception("Error in predict_flow: %s", e)
        return pd.DataFrame()

# Main function
def main(providers_file: str, payments_file: str, rates_file: str, features_file: str, labels_file: str) -> None:
    """Run the full pipeline: load inputs, build features, route payments, save results.

    Args:
        providers_file: CSV of provider rows, passed to ``preprocess_providers``.
        payments_file: CSV of transactions.
        rates_file: CSV with ``destination`` and ``rate`` columns, turned into
            a ``{destination: rate}`` mapping.
        features_file: Path of the cached feature CSV, passed to ``create_features``.
        labels_file: Path of the cached labels array, passed to ``create_features``.

    Side effects:
        Writes one row per routed payment to ``RESULTS_FILE``. ``create_features``
        may also rewrite the feature and label caches.

    Raises:
        Exception: Any error is logged and re-raised unchanged.
    """
    try:
        logger.info("Loading data...")
        providers = pd.read_csv(providers_file)
        transactions = pd.read_csv(payments_file)
        rates = pd.read_csv(rates_file).set_index("destination").to_dict()["rate"]

        logger.info("Preprocessing providers...")
        providers = preprocess_providers(providers)

        logger.info("Creating or loading features...")
        # The labels are not needed for routing; only the feature table is used.
        features_df, _ = create_features(providers, transactions, rates, features_file, labels_file)

        logger.info("Loading model...")
        model = load(MODEL_FILE)

        logger.info("Processing transactions and generating results...")
        # Order matters: predict_flow updates provider limits in place, so each
        # transaction sees the limits left by the ones before it.
        all_results = [
            predict_flow(model, transaction, providers, rates, features_df)
            for _, transaction in transactions.iterrows()
        ]

        if all_results:
            results_df = pd.concat(all_results, ignore_index=True)
        else:
            # pd.concat([]) raises ValueError; with no transactions, still
            # write a header-only results file.
            results_df = pd.DataFrame(columns=["payment", "chain", "status"])
        results_df.to_csv(RESULTS_FILE, index=False)
        logger.info(f"Results saved to {RESULTS_FILE}")

    except Exception as e:
        logger.error(f"Error in main: {e}")
        raise


if __name__ == "__main__":
    # Input files for a stand-alone run; the feature and label caches use the
    # module-level defaults.
    providers_file, payments_file, rates_file = (
        "providers_2.csv",
        "payments_2.csv",
        "ex_rates.csv",
    )
    main(
        providers_file=providers_file,
        payments_file=payments_file,
        rates_file=rates_file,
        features_file=FEATURES_FILE,
        labels_file=LABELS_FILE,
    )


2024-12-22 10:15:16,760 - Loading data...
2024-12-22 10:15:17,053 - Preprocessing providers...
2024-12-22 10:15:17,083 - Providers processed with dynamic LIMIT_MIN and LIMIT_MAX updates.
2024-12-22 10:15:17,084 - Creating or loading features...
2024-12-22 10:15:22,134 - Loading model...
2024-12-22 10:15:22,156 - Processing transactions and generating results...
2024-12-22 10:29:13,093 - Results saved to results.csv
