In [None]:
import pandas as pd
import numpy as np
import logging
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score
import joblib  # For saving and loading models

# Logging setup: timestamped messages at INFO level and above.
logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

# Input artefacts: the feature table (CSV) and the label array (NumPy file).
FEATURES_FILE = "features.csv"
LABELS_FILE = "labels.npy"

# Read both inputs from disk.
logger.info("Loading data...")
features = pd.read_csv(FEATURES_FILE)
labels = np.load(LABELS_FILE)

# The identifier and timestamp columns are removed before fitting.
X = features.drop(
    columns=['provider_id', 'transaction_id', 'eventTimeRes'],
)
y = labels

# Hold out 20% of the rows for evaluation, stratified on the labels.
logger.info("Splitting data into training and test sets...")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Base estimator; the grid search below fits clones of it.
logger.info("Training RandomForestClassifier...")
model = RandomForestClassifier(random_state=42)

# Exhaustive search over forest size, depth and split/leaf minimums,
# scored by F1 with 3-fold cross-validation on the training split.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Keep the estimator refitted with the winning parameter combination.
best_model = grid_search.best_estimator_
logger.info(f"Best parameters: {grid_search.best_params_}")

# Persist the tuned model so later cells can reload it.
MODEL_FILE = "decision_model.pkl"
joblib.dump(best_model, MODEL_FILE)
logger.info(f"Model saved to {MODEL_FILE}")

# Score the held-out split: hard predictions for the report,
# column 1 of predict_proba for ROC-AUC.
logger.info("Evaluating the model...")
y_pred = best_model.predict(X_test)
y_pred_prob = best_model.predict_proba(X_test)[:, 1]

print("Classification Report:")
print(classification_report(y_test, y_pred))
print(f"ROC-AUC Score: {roc_auc_score(y_test, y_pred_prob):.4f}")



In [None]:
# Predict for new transactions
def predict_best_provider(new_transaction, providers, model, rate_dict):
    """
    Pick the provider with the highest predicted success probability.

    Providers are first filtered down to those whose CURRENCY equals the
    transaction currency, whose LIMIT_MAX / MIN_SUM / MAX_SUM bounds admit the
    transaction amount, and whose TIME value is not later than the transaction
    time. One feature row is then built per remaining provider and scored with
    ``model.predict_proba``.

    Parameters:
        new_transaction (dict): Transaction details; the keys read here are
            'eventTimeRes', 'amount' and 'cur'.
        providers (pd.DataFrame): Providers data; the columns read here are
            'ID', 'CURRENCY', 'LIMIT_MIN', 'LIMIT_MAX', 'MIN_SUM', 'MAX_SUM',
            'TIME', 'CONVERSION', 'AVG_TIME' and 'COMMISSION'. The frame is
            not modified.
        model (sklearn model): Trained decision model exposing
            ``predict_proba``; column 1 of its output is used as the success
            probability.
        rate_dict (dict): Currency exchange rates keyed by currency code
            (presumably rates to USD, given the feature names -- confirm
            against the rates file). A missing currency falls back to 1.

    Returns:
        The 'ID' value of the top-scoring provider, or the string
        "No valid provider found" when no provider passes the filters.
    """
    amount = new_transaction['amount']
    currency = new_transaction['cur']
    transaction_time = pd.to_datetime(new_transaction['eventTimeRes'])

    # TIME is parsed value by value (not as a whole column) so that rows
    # written in different datetime formats are still accepted.
    time_ok = providers['TIME'].apply(
        lambda ts: pd.to_datetime(ts) <= transaction_time
    )
    eligible_mask = (
        (providers['LIMIT_MAX'] >= amount)
        & (providers['CURRENCY'] == currency)
        & (providers['MIN_SUM'] <= amount)
        & (amount <= providers['MAX_SUM'])
        & time_ok
    )
    # Explicit copy: a column is added below, and assigning into a filtered
    # slice of the caller's frame triggers pandas' SettingWithCopyWarning.
    valid_providers = providers[eligible_mask].copy()

    if valid_providers.empty:
        return "No valid provider found"

    # The converted amount is the same for every provider; compute it once.
    amount_in_usd = amount * rate_dict.get(currency, 1)

    # NOTE(review): these column names/order must match what the model was
    # trained on -- confirm against the training feature file.
    features = [
        {
            'conversion': provider['CONVERSION'],
            'avg_time': provider['AVG_TIME'],
            'commission': provider['COMMISSION'] * amount,
            'penalty': max(
                0,
                0.01 * provider['LIMIT_MIN'] * rate_dict.get(provider['CURRENCY'], 1),
            ),
            'amount_in_usd': amount_in_usd,
            'limits_ratio': amount / provider['LIMIT_MAX'],
        }
        for _, provider in valid_providers.iterrows()
    ]
    features_df = pd.DataFrame(features)

    # Rows of features_df are in the same order as valid_providers, so the
    # probabilities can be attached positionally.
    valid_providers['success_prob'] = model.predict_proba(features_df)[:, 1]

    # Stable sort so providers with equal probabilities are ordered
    # deterministically from run to run.
    ranked = valid_providers.sort_values(
        by='success_prob', ascending=False, kind='stable'
    )
    return ranked.iloc[0]['ID']

# Demo run: choose a provider for one sample transaction.
new_transaction = dict(
    eventTimeRes='2024-11-25 20:59:46',
    amount=50,
    cur='USD',
)

# Reference data: the provider table and the exchange-rate table.
providers = pd.read_csv('providers_1.csv')
exchange_rates = pd.read_csv('ex_rates.csv')
# Map each 'destination' value to its 'rate'.
rate_dict = {
    destination: rate
    for destination, rate in zip(exchange_rates['destination'], exchange_rates['rate'])
}

# Reload the persisted model from disk and use it to pick a provider.
trained_model = joblib.load(MODEL_FILE)
best_provider = predict_best_provider(
    new_transaction=new_transaction,
    providers=providers,
    model=trained_model,
    rate_dict=rate_dict,
)
logger.info(f"Best provider for transaction: {best_provider}")
