In [2]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.4-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.4-py3-none-win_amd64.whl (56.8 MB)
   ---------------------------------------- 0.0/56.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/56.8 MB ? eta -:--:--
   ---------------------------------------- 0.5/56.8 MB 1.9 MB/s eta 0:00:31
    --------------------------------------- 1.0/56.8 MB 2.2 MB/s eta 0:00:26
   - -------------------------------------- 1.6/56.8 MB 2.4 MB/s eta 0:00:24
   - -------------------------------------- 2.4/56.8 MB 2.8 MB/s eta 0:00:20
   -- ------------------------------------- 3.4/56.8 MB 3.1 MB/s eta 0:00:17
   --- ------------------------------------ 4.5/56.8 MB 3.6 MB/s eta 0:00:15
   ---- ----------------------------------- 5.8/56.8 MB 3.9 MB/s eta 0:00:14
   ----- ---------------------------------- 7.3/56.8 MB 4.4 MB/s eta 0:00:12
   ------ --------------------------------- 9.2/56.8 MB 4.9 MB/s eta 0:00:10
   ------- ---------


[notice] A new release of pip is available: 24.3 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
pip install lightgbm

Collecting lightgbmNote: you may need to restart the kernel to use updated packages.

  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------- -------------------------------- 0.3/1.5 MB ? eta -:--:--
   ------------------------------------ --- 1.3/1.5 MB 4.5 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 4.0 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0



[notice] A new release of pip is available: 24.3 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
pip install networkx

Collecting networkxNote: you may need to restart the kernel to use updated packages.

  Downloading networkx-3.5-py3-none-any.whl.metadata (6.3 kB)
Downloading networkx-3.5-py3-none-any.whl (2.0 MB)
   ---------------------------------------- 0.0/2.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.0 MB ? eta -:--:--
   ----- ---------------------------------- 0.3/2.0 MB ? eta -:--:--
   --------------- ------------------------ 0.8/2.0 MB 1.5 MB/s eta 0:00:01
   ------------------------- -------------- 1.3/2.0 MB 2.1 MB/s eta 0:00:01
   ------------------------------------ --- 1.8/2.0 MB 2.3 MB/s eta 0:00:01
   ---------------------------------------- 2.0/2.0 MB 2.2 MB/s eta 0:00:00
Installing collected packages: networkx
Successfully installed networkx-3.5



[notice] A new release of pip is available: 24.3 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
import networkx as nx
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.metrics import precision_recall_curve, average_precision_score
import xgboost as xgb
import lightgbm as lgb

# Importing Data

In [2]:
df = pd.read_csv('red_flag_transactions.csv')

In [3]:
df.shape

(5000, 7)

In [4]:
df.head()

Unnamed: 0,userID,merchantID,transactionID,datetime,day_of_week,amount,is_fraud
0,USER_0368,MERCH_069,TXN_00004398,2024-02-13 01:33:00,Tuesday,16829.67,1
1,USER_0355,MERCH_030,TXN_00004341,2024-02-13 02:31:00,Tuesday,17606.69,1
2,USER_0299,MERCH_030,TXN_00002393,2024-02-13 04:27:00,Tuesday,7542.58,0
3,USER_0189,MERCH_046,TXN_00003813,2024-02-13 04:46:00,Tuesday,7982.51,0
4,USER_0083,MERCH_070,TXN_00000021,2024-02-13 08:38:00,Tuesday,1136.34,0


# Feature Engineering

### Convert Date & time

In [5]:
df['datetime'] = pd.to_datetime(df['datetime'])
df['hours'] = df['datetime'].dt.hour
df['day'] = df['datetime'].dt.day
df['month'] = df['datetime'].dt.month
df['is_weekend'] = df['datetime'].dt.dayofweek.isin([5, 6]).astype(int)

#### Red Flag 1, Odd hours (before 6AM or after 10 PM)

In [6]:
df['odd_hours'] = ((df['hours'] <= 5) | (df['hours'] >= 22)).astype(int)

#### Red Flag 2, High amount (Above 90th percentile)

In [7]:
amount_90th = df['amount'].quantile(0.9)
df['is_high_amount'] = (df['amount'] >= amount_90th).astype(int)

#### Red Flag 3, User-Merchant transaction frequency

In [8]:
user_merchant_counts = df.groupby(['userID', 'merchantID']).size().reset_index(name='transaction_count')
df = df.merge(user_merchant_counts, on=['userID', 'merchantID'], how='left')

#### Red Flag 4, Amount deviation from user's average

In [9]:
user_avg_amount = df.groupby('userID')['amount'].mean().reset_index()
user_avg_amount.columns = ['userID', 'user_avg_amount']
df = df.merge(user_avg_amount, on='userID', how='left')
df['amount_deviation_ratio'] = df['amount'] / df['user_avg_amount']

#### Red Flag 5, Merchant transaction frequency for user

In [10]:
merchant_user_counts = df.groupby(['merchantID','userID']).size().reset_index(name = 'merchant_user_freq')
df = df.merge(merchant_user_counts , on = ['merchantID','userID'], how = 'left')

#### Red Flag 6, Time since last transaction for user

In [11]:
df_sorted = df.sort_values(['userID', 'datetime'])
df_sorted['time_since_last'] = df_sorted.groupby('userID')['datetime'].diff().dt.total_seconds() / 3600  
df_sorted['time_since_last'] = df_sorted['time_since_last'].fillna(24)  

#### Red Flag 7, Weekend high amount

In [12]:
df_sorted['weekend_high_amount'] = ((df_sorted['is_weekend'] == 1) & (df_sorted['amount'] >= amount_90th)).astype(int)

## Encoding Variables

In [13]:
le_user = LabelEncoder()
le_merchant = LabelEncoder()
df_sorted['user_encoded'] = le_user.fit_transform(df_sorted['userID'])
df_sorted['merchant_encoded'] = le_merchant.fit_transform(df_sorted['merchantID'])

## Preparing features

In [14]:
feature_columns = [
    'user_encoded', 'merchant_encoded', 'amount', 'hours', 'day', 'month',
    'is_weekend', 'odd_hours', 'is_high_amount', 'transaction_count',
    'amount_deviation_ratio', 'merchant_user_freq', 'time_since_last',
    'weekend_high_amount'
]

In [15]:
X = df_sorted[feature_columns].copy()
Y = df_sorted['is_fraud'].copy()

In [16]:
X

Unnamed: 0,user_encoded,merchant_encoded,amount,hours,day,month,is_weekend,odd_hours,is_high_amount,transaction_count,amount_deviation_ratio,merchant_user_freq,time_since_last,weekend_high_amount
164,0,99,4741.82,15,19,2,0,0,0,2,0.492412,2,24.000000,0
370,0,26,4632.98,9,28,2,0,0,0,1,0.481109,1,209.550000,0
1852,0,21,6661.20,13,22,4,0,0,0,1,0.691729,1,1300.133333,0
2004,0,9,3414.86,14,27,4,1,0,0,1,0.354614,1,121.266667,0
2481,0,6,13774.05,7,15,5,0,0,0,13,1.430358,13,425.416667,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4855,498,59,4467.26,19,6,8,0,0,0,2,1.007131,2,1113.616667,0
2038,499,64,7787.24,20,28,4,1,0,0,1,0.900899,1,24.000000,0
2215,499,35,17273.47,0,5,5,1,1,1,1,1.998352,1,148.183333,1
3159,499,3,1762.64,12,6,6,0,0,0,1,0.203918,1,779.700000,0


In [17]:
Y

164     0
370     0
1852    0
2004    0
2481    1
       ..
4855    0
2038    0
2215    1
3159    0
3865    0
Name: is_fraud, Length: 5000, dtype: int64

In [18]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

## Training XGBoost Model 

In [20]:
fraud_count = len(Y_train[Y_train == 1])
legitimate_count = len(Y_train[Y_train == 0])
scale_pos_weight = legitimate_count / fraud_count

print(f"Class balance:")
print(f"   Legitimate transactions: {legitimate_count}")
print(f"   Fraud transactions: {fraud_count}")
print(f"   Scale weight: {scale_pos_weight:.2f}")

# Create and train XGBoost model
model = xgb.XGBClassifier(
    n_estimators=100,          # Number of trees
    max_depth=6,               # How deep each tree can be
    learning_rate=0.1,         # How fast the model learns
    scale_pos_weight=scale_pos_weight,  # Balance classes
    random_state=42,           # For reproducible results
    eval_metric='auc'          # Metric to optimize
)


Class balance:
   Legitimate transactions: 3330
   Fraud transactions: 670
   Scale weight: 4.97


In [21]:
model.fit(X_train, Y_train)

## Testing the model

In [24]:
y_pred = model.predict(X_test)                    # Binary predictions (0 or 1)
y_pred_proba = model.predict_proba(X_test)[:, 1]  # Probability scores (0 to 1)

# Calculate performance metrics
roc_auc = roc_auc_score(Y_test, y_pred_proba)
avg_precision = average_precision_score(Y_test, y_pred_proba)

print(f" MODEL PERFORMANCE:")
print(f"   ROC AUC Score: {roc_auc:.4f} (higher is better, max = 1.0)")
print(f"   Average Precision: {avg_precision:.4f} (higher is better, max = 1.0)")

# Detailed classification report
print(f"\n DETAILED RESULTS:")
print(classification_report(Y_test, y_pred, target_names=['Legitimate', 'Fraud']))


 MODEL PERFORMANCE:
   ROC AUC Score: 0.9994 (higher is better, max = 1.0)
   Average Precision: 0.9980 (higher is better, max = 1.0)

 DETAILED RESULTS:
              precision    recall  f1-score   support

  Legitimate       1.00      1.00      1.00       820
       Fraud       1.00      0.99      1.00       180

    accuracy                           1.00      1000
   macro avg       1.00      1.00      1.00      1000
weighted avg       1.00      1.00      1.00      1000



In [26]:
# Confusion Matrix
cm = confusion_matrix(Y_test, y_pred)
print(f"\n CONFUSION MATRIX:")
print(f"                 Predicted")
print(f"Actual    Legitimate  Fraud")
print(f"Legitimate    {cm[0,0]:4d}     {cm[0,1]:4d}")
print(f"Fraud         {cm[1,0]:4d}     {cm[1,1]:4d}")

# Calculate specific metrics
true_positives = cm[1,1]  # Correctly identified fraud
false_positives = cm[0,1] # Wrongly flagged legitimate as fraud
false_negatives = cm[1,0] # Missed fraud transactions
true_negatives = cm[0,0]  # Correctly identified legitimate

print(f"\n BUSINESS METRICS:")
print(f"   Fraud correctly caught: {true_positives}")
print(f"   Fraud missed: {false_negatives}")
print(f"   False alarms: {false_positives}")
print(f"   Legitimate correctly identified: {true_negatives}")

catch_rate = true_positives / (true_positives + false_negatives) * 100
false_alarm_rate = false_positives / (false_positives + true_negatives) * 100

print(f"   Fraud catch rate: {catch_rate:.1f}%")
print(f"   False alarm rate: {false_alarm_rate:.1f}%")


 CONFUSION MATRIX:
                 Predicted
Actual    Legitimate  Fraud
Legitimate     820        0
Fraud            1      179

 BUSINESS METRICS:
   Fraud correctly caught: 179
   Fraud missed: 1
   False alarms: 0
   Legitimate correctly identified: 820
   Fraud catch rate: 99.4%
   False alarm rate: 0.0%


## Fraud detection function

In [27]:
def detect_fraud(new_transaction):
    """
    Simple function to detect fraud in a new transaction
    
    Input: dictionary with transaction details
    Output: fraud probability and prediction
    """
    
    # Convert to the format our model expects
    transaction_df = pd.DataFrame([new_transaction])
    
    # Make prediction
    fraud_probability = model.predict_proba(transaction_df)[0, 1]
    is_fraud = model.predict(transaction_df)[0]
    
    return {
        'fraud_probability': fraud_probability,
        'is_fraud': bool(is_fraud),
        'confidence': 'High' if fraud_probability > 0.8 or fraud_probability < 0.2 else 'Medium'
    }


## Testing with real test set

In [29]:
# Test on some actual transactions from our test set
test_indices = [0, 1, 2, 3, 4]  # First 5 test transactions

print("EXAMPLE PREDICTIONS:")
for i, idx in enumerate(test_indices):
    # Get the actual transaction data
    transaction_data = X_test.iloc[idx].to_dict()
    actual_label = Y_test.iloc[idx]
    
    # Make prediction
    result = detect_fraud(transaction_data)
    
    print(f"\nTransaction {i+1}:")
    print(f"   Actual: {'FRAUD' if actual_label == 1 else 'LEGITIMATE'}")
    print(f"   Predicted: {'FRAUD' if result['is_fraud'] else 'LEGITIMATE'}")
    print(f"   Fraud Probability: {result['fraud_probability']:.3f}")
    print(f"   Confidence: {result['confidence']}")
    print(f"   Amount: ${transaction_data['amount']:.2f}")
    print(f"   Hour: {transaction_data['hours']}:00")
    print(f"   Weekend: {'Yes' if transaction_data['is_weekend'] else 'No'}")

EXAMPLE PREDICTIONS:

Transaction 1:
   Actual: LEGITIMATE
   Predicted: LEGITIMATE
   Fraud Probability: 0.000
   Confidence: High
   Amount: $4493.67
   Hour: 9.0:00
   Weekend: No

Transaction 2:
   Actual: LEGITIMATE
   Predicted: LEGITIMATE
   Fraud Probability: 0.000
   Confidence: High
   Amount: $4042.83
   Hour: 14.0:00
   Weekend: No

Transaction 3:
   Actual: LEGITIMATE
   Predicted: LEGITIMATE
   Fraud Probability: 0.000
   Confidence: High
   Amount: $4252.50
   Hour: 12.0:00
   Weekend: No

Transaction 4:
   Actual: LEGITIMATE
   Predicted: LEGITIMATE
   Fraud Probability: 0.001
   Confidence: High
   Amount: $12135.16
   Hour: 17.0:00
   Weekend: No

Transaction 5:
   Actual: FRAUD
   Predicted: FRAUD
   Fraud Probability: 1.000
   Confidence: High
   Amount: $17477.33
   Hour: 1.0:00
   Weekend: Yes
