## Step 1: Import Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
import time

## Step 2: Load and Explore the Dataset

In [6]:
# Load the dataset
data = pd.read_csv(r'C:\Users\prapu\OneDrive\Desktop\transactionsSmall.csv')


# Display the first few rows to understand the structure
print(data.head())

# Check for missing values
print(data.isnull().sum())

   step      type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1   PAYMENT   9839.64  C1231006815       170136.0       160296.36   
1     1   PAYMENT   1864.28  C1666544295        21249.0        19384.72   
2     1  TRANSFER    181.00  C1305486145          181.0            0.00   
3     1  CASH_OUT    181.00   C840083671          181.0            0.00   
4     1   PAYMENT  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  
0  M1979787155             0.0             0.0        0               0  
1  M2044282225             0.0             0.0        0               0  
2   C553264065             0.0             0.0        1               0  
3    C38997010         21182.0             0.0        1               0  
4  M1230701703             0.0             0.0        0               0  
step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newba

## Step 3: Preprocess the Data
1. Encoding categorical variables: Convert categorical variables (type) to numerical values.
2. Scaling: Use StandardScaler to scale numerical columns for better performance in machine learning algorithms.

In [7]:
# Print column names to confirm 'type' is present
print("Column names:", data.columns)

# Ensure the 'type' column exists before encoding
if 'type' in data.columns:
    # Encode categorical 'type' feature
    data['type'] = data['type'].astype('category').cat.codes
else:
    print("Error: 'type' column not found in the dataset.")
    # Optionally, you can stop the code here or handle the absence of this column as needed.
    exit()

# Define features and target variable
X = data.drop(['isFraud', 'nameOrig', 'nameDest', 'isFlaggedFraud'], axis=1, errors='ignore')  # Drop irrelevant columns
y = data['isFraud']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


Column names: Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud'],
      dtype='object')


## Step 4: Train the Model
Using a Random Forest classifier (effective for tabular data and often used in fraud detection):

In [8]:
# Define and train the Random Forest model

print("started")
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)

# Evaluate model performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("done")

started
Accuracy: 0.9937106918238994

Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00       409
           1       1.00      0.96      0.98        68

    accuracy                           0.99       477
   macro avg       1.00      0.98      0.99       477
weighted avg       0.99      0.99      0.99       477

done


## Step 5: Real-Time Fraud Detection Simulation

With the trained model, we can simulate real-time fraud detection by running new transactions (one by one or in small batches) through the model. Here’s how to do it:

In [9]:
# Ensure 'X_test' has been created and is a scaled NumPy array from the StandardScaler
# Convert `X_test` back to a DataFrame with original column names for compatibility
X_test_df = pd.DataFrame(X_test, columns=X.columns)

# Now proceed with random sampling and real-time fraud detection
sampled_transactions = X_test_df.sample(n=100, random_state=42)

def simulate_real_time_detection(new_data):
    """Simulate real-time fraud detection on new transactions."""
    for index, transaction in new_data.iterrows():
        # Preprocess the single transaction (already scaled here)
        transaction_processed = np.array([transaction.values])  # Reshape for model compatibility

        # Predict if it's fraud
        prediction = model.predict(transaction_processed)
        
        if prediction[0] == 1:
            print(f"🚨 Fraud Detected in Transaction ID {index}")
        else:
            print(f"✅ Transaction ID {index} is Safe.")
        
        # Pause to simulate real-time detection
        time.sleep(1)

# Simulate with 100 random transactions
simulate_real_time_detection(sampled_transactions)

✅ Transaction ID 468 is Safe.
🚨 Fraud Detected in Transaction ID 33
✅ Transaction ID 131 is Safe.
✅ Transaction ID 72 is Safe.
✅ Transaction ID 78 is Safe.
✅ Transaction ID 113 is Safe.
🚨 Fraud Detected in Transaction ID 274
✅ Transaction ID 185 is Safe.
✅ Transaction ID 261 is Safe.
✅ Transaction ID 9 is Safe.
✅ Transaction ID 321 is Safe.
✅ Transaction ID 318 is Safe.
🚨 Fraud Detected in Transaction ID 281
✅ Transaction ID 462 is Safe.
✅ Transaction ID 82 is Safe.
✅ Transaction ID 439 is Safe.
✅ Transaction ID 30 is Safe.
✅ Transaction ID 101 is Safe.
✅ Transaction ID 411 is Safe.
✅ Transaction ID 180 is Safe.
✅ Transaction ID 355 is Safe.
✅ Transaction ID 249 is Safe.
🚨 Fraud Detected in Transaction ID 172
✅ Transaction ID 375 is Safe.
🚨 Fraud Detected in Transaction ID 0
✅ Transaction ID 346 is Safe.
✅ Transaction ID 11 is Safe.
✅ Transaction ID 93 is Safe.
✅ Transaction ID 447 is Safe.
✅ Transaction ID 195 is Safe.
✅ Transaction ID 265 is Safe.
✅ Transaction ID 182 is Safe.
✅ Tran