In [2]:
!pip install faker

Collecting faker
  Downloading Faker-30.8.2-py3-none-any.whl.metadata (15 kB)
Downloading Faker-30.8.2-py3-none-any.whl (1.8 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.8 MB[0m [31m4.4 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━[0m [32m1.0/1.8 MB[0m [31m14.7 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.8/1.8 MB[0m [31m21.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-30.8.2


In [3]:
import pandas as pd
from faker import Faker
import random
import numpy as np

# Initialize Faker
fake = Faker()

# Constants
num_records = 50000  # Total number of transactions to generate
banks = ['Bank A', 'Bank B', 'Bank C', 'Bank D']
threshold_age = 30  # Threshold age for account legitimacy

# Function to generate a single transaction
def generate_transaction():
    transaction_id = fake.uuid4()
    sender_upi_id = fake.email()  # Using email as a proxy for UPI ID
    receiver_upi_id = fake.email()  # Using email as a proxy for UPI ID
    sender_bank = random.choice(banks)
    receiver_bank = random.choice(banks)
    transaction_amount = round(random.uniform(1, 10000), 2)  # Amount between 1 and 10000
    transaction_date = fake.date_time_this_year()

    # Randomly generate account age and previous fraud reports
    account_age = random.randint(1, 365)  # Account age in days
    previous_fraud_reports = random.randint(0, 2)  # Randomly assign some fraud reports

    # Randomly determine if the sender/receiver is marked as spam
    is_sender_spam = random.choice([0, 1])  # 0 = not spam, 1 = spam
    is_receiver_spam = random.choice([0, 1])  # 0 = not spam, 1 = spam

    # Complaints count could be between 0 and 5
    complaints_count = random.randint(0, 5)

    # Determine if the transaction is flagged as fraudulent or legitimate
    is_fraud = (account_age < threshold_age) or (previous_fraud_reports > 0)

    transaction_status = 'fraudulent' if is_fraud else 'legitimate'

    return {
        'transaction_id': transaction_id,
        'sender_upi_id': sender_upi_id,
        'receiver_upi_id': receiver_upi_id,
        'sender_bank': sender_bank,
        'receiver_bank': receiver_bank,
        'transaction_amount': transaction_amount,
        'transaction_date': transaction_date,
        'account_age': account_age,
        'previous_fraud_reports': previous_fraud_reports,
        'is_sender_spam': is_sender_spam,
        'is_receiver_spam': is_receiver_spam,
        'complaints_count': complaints_count,
        'transaction_status': transaction_status
    }

# Generate the dataset
data = [generate_transaction() for _ in range(num_records)]
df = pd.DataFrame(data)

# Ensure the dataset is balanced
fraudulent_count = df['transaction_status'].value_counts().get('fraudulent', 0)
legitimate_count = df['transaction_status'].value_counts().get('legitimate', 0)

# Balancing the dataset by downsampling the legitimate transactions if needed
if legitimate_count > fraudulent_count:
    df_legitimate = df[df['transaction_status'] == 'legitimate']
    df_fraudulent = df[df['transaction_status'] == 'fraudulent']
    df_legitimate_balanced = df_legitimate.sample(fraudulent_count, random_state=1)
    df_balanced = pd.concat([df_fraudulent, df_legitimate_balanced])
else:
    df_balanced = df

# Save the balanced dataset to a CSV file
df_balanced.to_csv('balanced_transaction_dataset.csv', index=False)

# Display the first few rows of the balanced dataset
print(df_balanced.head())
print("\nDataset saved as 'balanced_transaction_dataset.csv'")


                         transaction_id                sender_upi_id  \
0  592fed66-1ca3-43b4-bed5-2a65e21602c2         joshua52@example.net   
1  b49900ab-52ca-4f3b-bbb4-8bc2c4449668           gjones@example.com   
2  3e234310-4f46-4558-b68d-635193adfb13         joshua12@example.net   
3  33756e01-2775-468b-9e53-e0cb6193f337    rodgerskendra@example.org   
4  ed91435c-81b5-4e90-9e20-240e5fb02db2  garrettmeredith@example.com   

                receiver_upi_id sender_bank receiver_bank  transaction_amount  \
0  margaretespinoza@example.org      Bank A        Bank A             6731.10   
1   phillipsaunders@example.net      Bank A        Bank C             8731.45   
2           coliver@example.com      Bank B        Bank B             9578.72   
3            mmoody@example.com      Bank B        Bank B             8207.48   
4   williamsgregory@example.net      Bank D        Bank A             1496.36   

            transaction_date  account_age  previous_fraud_reports  \
0 2024-09-1

In [4]:
import pandas as pd

# Load the dataset
df = pd.read_csv('balanced_transaction_dataset.csv')

# Display the first few rows and the shape of the dataset
print(df.head())
print("Shape of the dataset:", df.shape)


                         transaction_id                sender_upi_id  \
0  592fed66-1ca3-43b4-bed5-2a65e21602c2         joshua52@example.net   
1  b49900ab-52ca-4f3b-bbb4-8bc2c4449668           gjones@example.com   
2  3e234310-4f46-4558-b68d-635193adfb13         joshua12@example.net   
3  33756e01-2775-468b-9e53-e0cb6193f337    rodgerskendra@example.org   
4  ed91435c-81b5-4e90-9e20-240e5fb02db2  garrettmeredith@example.com   

                receiver_upi_id sender_bank receiver_bank  transaction_amount  \
0  margaretespinoza@example.org      Bank A        Bank A             6731.10   
1   phillipsaunders@example.net      Bank A        Bank C             8731.45   
2           coliver@example.com      Bank B        Bank B             9578.72   
3            mmoody@example.com      Bank B        Bank B             8207.48   
4   williamsgregory@example.net      Bank D        Bank A             1496.36   

             transaction_date  account_age  previous_fraud_reports  \
0  2024-09

In [5]:
# Step 2: Drop 'transaction_id' if it exists
if 'transaction_id' in df.columns:
    df.drop('transaction_id', axis=1, inplace=True)

# Display the updated DataFrame and its shape
print("Updated DataFrame after dropping 'transaction_id':")
print(df.head())
print("Shape of the dataset after dropping 'transaction_id':", df.shape)

Updated DataFrame after dropping 'transaction_id':
                 sender_upi_id               receiver_upi_id sender_bank  \
0         joshua52@example.net  margaretespinoza@example.org      Bank A   
1           gjones@example.com   phillipsaunders@example.net      Bank A   
2         joshua12@example.net           coliver@example.com      Bank B   
3    rodgerskendra@example.org            mmoody@example.com      Bank B   
4  garrettmeredith@example.com   williamsgregory@example.net      Bank D   

  receiver_bank  transaction_amount            transaction_date  account_age  \
0        Bank A             6731.10  2024-09-17 15:03:22.173623          127   
1        Bank C             8731.45  2024-02-08 01:26:14.859978           69   
2        Bank B             9578.72  2024-01-22 15:43:53.647748          256   
3        Bank B             8207.48  2024-10-22 20:40:45.441714           27   
4        Bank A             1496.36  2024-07-25 13:28:41.291942          219   

   previous

In [6]:
# Step 3: Convert 'transaction_date' to datetime
df['transaction_date'] = pd.to_datetime(df['transaction_date'])

# Display the updated DataFrame and its shape
print("Updated DataFrame after converting 'transaction_date' to datetime:")
print(df[['transaction_date']].head())
print("Shape of the dataset:", df.shape)


Updated DataFrame after converting 'transaction_date' to datetime:
            transaction_date
0 2024-09-17 15:03:22.173623
1 2024-02-08 01:26:14.859978
2 2024-01-22 15:43:53.647748
3 2024-10-22 20:40:45.441714
4 2024-07-25 13:28:41.291942
Shape of the dataset: (50000, 12)


In [7]:
# Step 1: Limit unique UPI IDs and group rare values
def limit_unique_values(series, threshold=10):
    # Find counts of each unique value
    counts = series.value_counts()
    # Keep only those that meet the threshold, others will be grouped as 'Other'
    return series.where(series.isin(counts[counts >= threshold].index), other='Other')

# Apply this to both sender and receiver UPI IDs
df['sender_upi_id'] = limit_unique_values(df['sender_upi_id'])
df['receiver_upi_id'] = limit_unique_values(df['receiver_upi_id'])

# Step 2: One-hot encode categorical features again
df_encoded = pd.get_dummies(df, columns=['sender_upi_id', 'receiver_upi_id', 'sender_bank', 'receiver_bank'], drop_first=True)

# Display the shape of the DataFrame after one-hot encoding
print("Shape of the dataset after limiting unique values and one-hot encoding:", df_encoded.shape)

Shape of the dataset after limiting unique values and one-hot encoding: (50000, 15)


In [8]:
# Step 3: Prepare features (X) and target variable (y)
X = df_encoded.drop('transaction_status', axis=1)  # Features
y = df_encoded['transaction_status'].map({'legitimate': 0, 'fraudulent': 1})  # Target variable (1 for fraudulent, 0 for legitimate)

# Display the shapes of the features and target variable
print("X shape:", X.shape)
print("y shape:", y.shape)


X shape: (50000, 14)
y shape: (50000,)


In [11]:
from sklearn.model_selection import train_test_split

# Step 4: Prepare features (X) and target variable (y)
X = df_encoded.drop(['transaction_status', 'transaction_date'], axis=1)  # Features
y = df_encoded['transaction_status'].map({'legitimate': 0, 'fraudulent': 1})  # Target variable (1 for fraudulent, 0 for legitimate)

# Continue with the train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Print shapes to confirm
print("X shape:", X.shape)
print("y shape:", y.shape)
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X shape: (50000, 13)
y shape: (50000,)
X_train shape: (40000, 13)
X_test shape: (10000, 13)
y_train shape: (40000,)
y_test shape: (10000,)


In [12]:
from sklearn.preprocessing import StandardScaler

# Step 5: Standardize the numerical features
scaler = StandardScaler()
numerical_features = ['transaction_amount', 'account_age', 'previous_fraud_reports', 'complaints_count']

# Fit the scaler on the training data and transform both train and test data
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

# Display the first few rows of the scaled training data
print(X_train.head())

       transaction_amount  account_age  previous_fraud_reports  \
27324            0.598436     1.449159               -1.216858   
45992            1.572505    -0.938783               -1.216858   
42351           -1.644517     0.937457               -1.216858   
12309           -0.999042    -0.891403               -1.216858   
4076            -0.369285    -1.696860                1.229641   

       is_sender_spam  is_receiver_spam  complaints_count  \
27324               1                 1         -0.873259   
45992               1                 1         -0.873259   
42351               1                 0         -0.287523   
12309               1                 0         -0.287523   
4076                1                 1         -0.873259   

       sender_upi_id_ksmith@example.org  sender_bank_Bank B  \
27324                             False                True   
45992                             False               False   
42351                             False        

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Step 6: Initialize the Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Step 7: Fit the model on the training data
rf_classifier.fit(X_train, y_train)

# Step 8: Make predictions on the test set
y_pred = rf_classifier.predict(X_test)

# Step 9: Evaluate the model
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Display the results
print("Confusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)


Confusion Matrix:
 [[3090    0]
 [   0 6910]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      3090
           1       1.00      1.00      1.00      6910

    accuracy                           1.00     10000
   macro avg       1.00      1.00      1.00     10000
weighted avg       1.00      1.00      1.00     10000



In [14]:
from sklearn.model_selection import cross_val_score

# Initialize Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)

# Perform cross-validation
cv_scores = cross_val_score(rf_model, X, y, cv=5)  # 5-fold cross-validation

# Display cross-validation scores and mean
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())

Cross-Validation Scores: [1. 1. 1. 1. 1.]
Mean CV Score: 1.0


In [15]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                           param_grid=param_grid,
                           cv=5,
                           scoring='accuracy',
                           verbose=2,
                           n_jobs=-1)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Display best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}
Best CV Score: 1.0


In [16]:
# Train the Random Forest model with the best parameters
final_model = RandomForestClassifier(
    n_estimators=grid_search.best_params_['n_estimators'],
    max_depth=grid_search.best_params_['max_depth'],
    min_samples_split=grid_search.best_params_['min_samples_split'],
    random_state=42
)

# Fit the model on the training data
final_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = final_model.predict(X_test)

# Evaluate the model
from sklearn.metrics import confusion_matrix, classification_report

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Classification Report
class_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(class_report)

Confusion Matrix:
[[3090    0]
 [   0 6910]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3090
           1       1.00      1.00      1.00      6910

    accuracy                           1.00     10000
   macro avg       1.00      1.00      1.00     10000
weighted avg       1.00      1.00      1.00     10000



In [17]:
feature_importances = final_model.feature_importances_
feature_names = X_train.columns
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
feature_importance_df.sort_values(by='Importance', ascending=False, inplace=True)

print(feature_importance_df)

                             Feature  Importance
2             previous_fraud_reports    0.902155
1                        account_age    0.086690
0                 transaction_amount    0.007504
5                   complaints_count    0.001362
4                   is_receiver_spam    0.000369
3                     is_sender_spam    0.000356
8                 sender_bank_Bank C    0.000278
7                 sender_bank_Bank B    0.000276
12              receiver_bank_Bank D    0.000263
9                 sender_bank_Bank D    0.000259
10              receiver_bank_Bank B    0.000240
11              receiver_bank_Bank C    0.000226
6   sender_upi_id_ksmith@example.org    0.000023


In [18]:
import joblib
joblib.dump(final_model, 'fraud_detection_model.pkl')

['fraud_detection_model.pkl']