In [10]:
import pandas as pd
import numpy as np
import joblib
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import IsolationForest
from xgboost import XGBClassifier

# 1. Custom transformer for feature engineering
class FeatureEngineer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        if 'User_id' in X.columns:
            self.user_mean_amount = X.groupby('User_id')['amount'].mean()
        else:
            self.user_mean_amount = None
        return self

    def transform(self, X):
        X = X.copy()
        # transaction_hour
        if 'TimeStamp' in X.columns:
            X['transaction_hour'] = pd.to_datetime(X['TimeStamp']).dt.hour
        # amount_log
        X['amount_log'] = np.log1p(X['amount'])
        # amount_to_avg
        if self.user_mean_amount is not None and 'User_id' in X.columns:
            X['amount_to_avg'] = X.apply(
                lambda row: row['amount'] / self.user_mean_amount.get(row['User_id'], row['amount']),
                axis=1
            )
        else:
            X['amount_to_avg'] = 1.0
        # new_device_flag
        if 'User_id' in X.columns and 'device' in X.columns:
            X['new_device_flag'] = (
                X.groupby('User_id')['device'].apply(lambda s: s != s.shift(1)).astype(int)
            ).fillna(0).values
        else:
            X['new_device_flag'] = 0
        # hour_sin, hour_cos
        X['hour_sin'] = np.sin(2 * np.pi * X['transaction_hour'] / 24)
        X['hour_cos'] = np.cos(2 * np.pi * X['transaction_hour'] / 24)
        return X

# 2. Define features
numerical_features = [
    'account_age_days', 'amount', 'quantity', 'total_value',
    'num_trans_24h', 'num_failed_24h', 'no_of_cards_from_ip',
    'transaction_hour', 'amount_log', 'amount_to_avg', 'new_device_flag', 'hour_sin', 'hour_cos'
]
categorical_features = [
    'payment_method', 'device', 'category'
]

# 3. Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

# 4. Full pipeline
full_pipeline = Pipeline([
    ('feature_engineering', FeatureEngineer()),
    ('preprocessing', preprocessor)
])

# 5. Load and preprocess data
def load_and_preprocess_data(file_path):
    df = pd.read_csv(file_path)
    X = df.drop(['fraud_label', 'fraud_pattern', 'Transaction_id', 
                 'billing_address', 'shipping_address', 'ip_address', 'geo_location'], axis=1)
    y = df['fraud_label']
    return X, y, df

# 6. Train and save model
X, y, df = load_and_preprocess_data('fraud_transactions_new_data.csv')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)
full_pipeline.fit(X_train)
X_train_processed = full_pipeline.transform(X_train)
X_test_processed = full_pipeline.transform(X_test)

xgb_model = XGBClassifier(
    scale_pos_weight=45,
    n_estimators=500,
    max_depth=7,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='auc'
)
xgb_model.fit(X_train_processed, y_train)

iso_model = IsolationForest(
    n_estimators=500,
    contamination=0.03,
    max_samples=256,
    random_state=42
)
iso_model.fit(X_train_processed)
iso_scores = iso_model.decision_function(X_train_processed)
iso_thresh = np.percentile(iso_scores, 3)

# Save feature names for SHAP/explanation
cat_feature_names = list(full_pipeline.named_steps['preprocessing'].transformers_[1][1].get_feature_names_out(categorical_features))
feature_names = numerical_features + cat_feature_names

model_package = {
    'xgb_model': xgb_model,
    'iso_model': iso_model,
    'pipeline': full_pipeline,
    'iso_thresh': iso_thresh,
    'feature_names': feature_names   # <-- add this line
}
joblib.dump(model_package, 'hybrid_model.pkl')
cat_feature_names = list(
    full_pipeline.named_steps['preprocessing']
    .transformers_[1][1]
    .get_feature_names_out(categorical_features)
)
feature_names = numerical_features + cat_feature_names

model_package = {
    'xgb_model': xgb_model,
    'iso_model': iso_model,
    'pipeline': full_pipeline,
    'iso_thresh': iso_thresh,
    'feature_names': feature_names  # <-- This line is critical!
}
joblib.dump(model_package, 'hybrid_model.pkl')
# 7. Prediction class
import shap

class FraudPredictor:
    def __init__(self, model_path):
        self.model_data = joblib.load(model_path)
        self.xgb_model = self.model_data['xgb_model']
        self.iso_model = self.model_data['iso_model']
        self.pipeline = self.model_data['pipeline']
        self.iso_thresh = self.model_data['iso_thresh']
        self.feature_names = self.model_data['feature_names']
        self.xgb_explainer = shap.TreeExplainer(self.xgb_model)
        
    def predict(self, transaction_data):
        processed = self.pipeline.transform(pd.DataFrame([transaction_data]))
        xgb_prob = self.xgb_model.predict_proba(processed)[0][1]
        iso_score = self.iso_model.decision_function(processed)[0]
        
        # Decision logic
        if xgb_prob > 0.9:
            decision = "FRAUD"
        elif (xgb_prob > 0.6 and iso_score < self.iso_thresh):
            decision = "NEED TO TAKE FEEDBACK"
        else:
            decision = "GENUINE"
        
        # SHAP explanation
        shap_values = self.xgb_explainer.shap_values(processed)[0]
        indicators = []
        total_impact = sum(np.abs(shap_values))
        for i, val in enumerate(shap_values):
            indicators.append({
                'feature': self.feature_names[i] if i < len(self.feature_names) else f'feature_{i}',
                'value': float(processed[0][i]),
                'impact_percent': round((abs(val)/total_impact)*100, 2) if total_impact else 0.0
            })
        indicators = sorted(indicators, key=lambda x: x['impact_percent'], reverse=True)[:5]
        
        # (Optional) Use the top indicator as a "fraud pattern" for interpretability
        fraud_pattern = indicators[0]['feature'] if indicators else "unknown"
        
        return {
            'decision': decision,
            'probability': round(float(xgb_prob), 4),
            'anomaly_score': round(float(iso_score), 4),
            'fraud_indicators': indicators,
            'fraud_pattern': fraud_pattern,
            'thresholds': {
                'xgb_high': 0.9,
                'xgb_feedback': 0.6,
                'iso_threshold': round(float(self.iso_thresh), 4)
            }
        }

# Usage example (after loading the model, just give raw parameters):
fraud_predictor = FraudPredictor('hybrid_model.pkl')
sample_transaction = {
    'account_age_days': 30,
    'payment_method': 'Credit Card',
    'device': 'Laptop',
    'category': 'Electronics',
    'amount': 35000.0,
    'quantity': 1,
    'total_value': 35000.0,
    'num_trans_24h': 1,
    'num_failed_24h': 0,
    'no_of_cards_from_ip': 1,
    'User_id': 123,
    'TimeStamp': '2024-06-24 14:00:00'
}
prediction = fraud_predictor.predict(sample_transaction)
print(prediction)

{'decision': 'GENUINE', 'probability': 0.0196, 'anomaly_score': -0.0135, 'fraud_indicators': [{'feature': 'amount_to_avg', 'value': 57.23007937010643, 'impact_percent': np.float32(19.59)}, {'feature': 'num_trans_24h', 'value': -1.0290281810780284, 'impact_percent': np.float32(12.92)}, {'feature': 'total_value', 'value': 1.0461800555450884, 'impact_percent': np.float32(12.07)}, {'feature': 'account_age_days', 'value': -1.1876666572786856, 'impact_percent': np.float32(11.43)}, {'feature': 'quantity', 'value': -0.7966284471819609, 'impact_percent': np.float32(7.63)}], 'fraud_pattern': 'amount_to_avg', 'thresholds': {'xgb_high': 0.9, 'xgb_feedback': 0.6, 'iso_threshold': 0.0}}


In [11]:
fraud_predictor = FraudPredictor('hybrid_model.pkl')
sample_transaction = {
    'account_age_days': 2,
    'payment_method': 'Credit Card',
    'device': 'Laptop',
    'category': 'Electronics',
    'amount': 35000.0,
    'quantity': 3,
    'total_value': 35000.0,
    'num_trans_24h': 1,
    'num_failed_24h': 1,
    'no_of_cards_from_ip': 1,
    'User_id': 123,
    'TimeStamp': '2024-06-24 14:00:00'
}
prediction = fraud_predictor.predict(sample_transaction)
print(prediction)

{'decision': 'NEED TO TAKE FEEDBACK', 'probability': 0.8628, 'anomaly_score': -0.0155, 'fraud_indicators': [{'feature': 'account_age_days', 'value': -1.3613577010453148, 'impact_percent': np.float32(40.81)}, {'feature': 'amount', 'value': 3.1743679757443357, 'impact_percent': np.float32(10.26)}, {'feature': 'num_trans_24h', 'value': -1.0290281810780284, 'impact_percent': np.float32(10.02)}, {'feature': 'total_value', 'value': 1.0461800555450884, 'impact_percent': np.float32(8.12)}, {'feature': 'amount_to_avg', 'value': 57.23007937010643, 'impact_percent': np.float32(7.87)}], 'fraud_pattern': 'account_age_days', 'thresholds': {'xgb_high': 0.9, 'xgb_feedback': 0.6, 'iso_threshold': 0.0}}


In [12]:
fraud_predictor = FraudPredictor('hybrid_model.pkl')
sample_transaction = {
    'account_age_days': 2,
    'payment_method': 'Credit Card',
    'device': 'Laptop',
    'category': 'Electronics',
    'amount': 35000.0,
    'quantity': 3,
    'total_value': 35000.0,
    'num_trans_24h': 3,
    'num_failed_24h': 5,
    'no_of_cards_from_ip': 1,
    'User_id': 123,
    'TimeStamp': '2024-06-24 14:00:00'
}
prediction = fraud_predictor.predict(sample_transaction)
print(prediction)

{'decision': 'FRAUD', 'probability': 0.9948, 'anomaly_score': -0.0186, 'fraud_indicators': [{'feature': 'num_failed_24h', 'value': 4.546754047725716, 'impact_percent': np.float32(39.57)}, {'feature': 'account_age_days', 'value': -1.3613577010453148, 'impact_percent': np.float32(23.33)}, {'feature': 'num_trans_24h', 'value': -0.036777078371046684, 'impact_percent': np.float32(7.03)}, {'feature': 'total_value', 'value': 1.0461800555450884, 'impact_percent': np.float32(6.64)}, {'feature': 'amount', 'value': 3.1743679757443357, 'impact_percent': np.float32(4.54)}], 'fraud_pattern': 'num_failed_24h', 'thresholds': {'xgb_high': 0.9, 'xgb_feedback': 0.6, 'iso_threshold': 0.0}}


In [13]:
import joblib
model = joblib.load("hybrid_model.pkl")
print(model.keys())

dict_keys(['xgb_model', 'iso_model', 'pipeline', 'iso_thresh', 'feature_names'])


In [14]:
import joblib
model = joblib.load("hybrid_model.pkl")
print(model.keys())
print(model.get("feature_names"))
# or, if that is None:
print(model["xgb_model"].get_booster().feature_names)

dict_keys(['xgb_model', 'iso_model', 'pipeline', 'iso_thresh', 'feature_names'])
['account_age_days', 'amount', 'quantity', 'total_value', 'num_trans_24h', 'num_failed_24h', 'no_of_cards_from_ip', 'transaction_hour', 'amount_log', 'amount_to_avg', 'new_device_flag', 'hour_sin', 'hour_cos', 'payment_method_Credit Card', 'payment_method_Debit Card', 'payment_method_Net Banking', 'payment_method_Wallet', 'device_Desktop', 'device_Mobile', 'device_Tablet', 'category_Beauty', 'category_Clothing', 'category_Electronics', 'category_Groceries', 'category_Home']
None
