In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Step 1: Load your data (assuming you have a dataset)
data = pd.read_csv("fraud_data.csv")
print(data.head())

# Step 2: Feature Engineering

# Feature 1: Transaction Amount
transaction_amount = data["amt"]

# Feature 2: Transaction Location (Assuming it's a categorical feature)
transaction_location = pd.get_dummies(data["street"])

# Feature 3: User Behavior Patterns
# You can calculate various statistics based on historical user behavior data.
# For example, the number of previous transactions, average transaction amount, etc.
user_behavior = data.groupby("cc_num").agg({
    "trans_num": "count",
    "amt": "mean"
}).reset_index()
user_behavior.columns = ["cc_num", "trans_num", "amt"]

# Merge all extracted features into a single dataframe
features = pd.concat([transaction_amount, transaction_location, user_behavior], axis=1)

# Step 3: Split data into features (X) and target variable (y)
X = features
y = data["is_fraud"]



   Unnamed: 0 trans_date_trans_time            cc_num  \
0           0   2019-01-01 00:00:18  2703186189652095   
1           1   2019-01-01 00:00:44      630423337322   
2           2   2019-01-01 00:00:51    38859492057661   
3           3   2019-01-01 00:01:16  3534093764340240   
4           4   2019-01-01 00:03:06   375534208663984   

                             merchant       category     amt      first  \
0          fraud_Rippin, Kub and Mann       misc_net    4.97   Jennifer   
1     fraud_Heller, Gutmann and Zieme    grocery_pos  107.23  Stephanie   
2                fraud_Lind-Buckridge  entertainment  220.11     Edward   
3  fraud_Kutch, Hermiston and Farrell  gas_transport   45.00     Jeremy   
4                 fraud_Keeling-Crist       misc_pos   41.96      Tyler   

      last gender                        street  ...      lat      long  \
0    Banks      F                561 Perry Cove  ...  36.0788  -81.1781   
1     Gill      F  43039 Riley Greens Suite 393  ...  48

In [2]:
import numpy as np

# Check for missing values in X_train and X_test
missing_values_X = np.isnan(X).sum()
missing_values_y = np.isnan(y).sum()

print("Missing values in X:", missing_values_X)
print("Missing values in y:", missing_values_y)

Missing values in X: amt                               0
000 Jennifer Mills                0
0005 Morrison Land                0
00315 Ashley Valleys              0
0043 Henry Plaza                  0
                               ... 
997 Cameron Meadow Apt. 980       0
99736 Rose Shoals Apt. 504        0
cc_num                         6918
trans_num                      6918
amt                            6918
Length: 901, dtype: int64
Missing values in y: 1


In [3]:
X = pd.DataFrame(X)
y = pd.DataFrame(y)

X.fillna(X.mean(), inplace=True)
y.fillna(y.mean(), inplace=True)

In [4]:
import numpy as np

# Check for missing values in X_train and X_test
missing_values_X = np.isnan(X).sum()
missing_values_y = np.isnan(y).sum()

print("Missing values in X:", missing_values_X)
print("Missing values in y:", missing_values_y)

Missing values in X: amt                            0
000 Jennifer Mills             0
0005 Morrison Land             0
00315 Ashley Valleys           0
0043 Henry Plaza               0
                              ..
997 Cameron Meadow Apt. 980    0
99736 Rose Shoals Apt. 504     0
cc_num                         0
trans_num                      0
amt                            0
Length: 901, dtype: int64
Missing values in y: is_fraud    0
dtype: int64


In [5]:
# Step 4: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Normalize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [6]:
X_train

array([[-4.35365130e-01, -3.79686003e-02, -3.09937864e-02, ...,
        -3.16845050e-03,  5.55002444e-03,  7.26991679e-03],
       [-2.13874554e-01, -3.79686003e-02, -3.09937864e-02, ...,
        -3.16845050e-03,  5.55002444e-03,  7.26991679e-03],
       [-4.66706246e-01, -3.79686003e-02, -3.09937864e-02, ...,
        -3.16845050e-03,  5.55002444e-03,  7.26991679e-03],
       ...,
       [ 7.23008823e-02, -3.79686003e-02, -3.09937864e-02, ...,
         9.30619644e+00,  7.25879661e+00, -4.86490350e-01],
       [-6.07490943e-01, -3.79686003e-02, -3.09937864e-02, ...,
        -3.16845050e-03,  5.55002444e-03,  7.26991679e-03],
       [-2.01203660e-02, -3.79686003e-02, -3.09937864e-02, ...,
        -3.16845050e-03,  5.55002444e-03,  7.26991679e-03]])

In [7]:
X_test

array([[ 0.42055776, -0.0379686 , -0.03099379, ..., -0.00316845,
         0.00555002,  0.00726992],
       [-0.33953941, -0.0379686 , -0.03099379, ..., -0.00316845,
         0.00555002,  0.00726992],
       [-0.57054247, -0.0379686 , -0.03099379, ..., -0.00316845,
         0.00555002,  0.00726992],
       ...,
       [ 0.09292794, -0.0379686 , -0.03099379, ..., -0.00316845,
         0.00555002,  0.00726992],
       [-0.3675762 , -0.0379686 , -0.03099379, ..., -0.00316845,
         0.00555002,  0.00726992],
       [-0.62281104, -0.0379686 , -0.03099379, ..., -0.00316845,
         0.00555002,  0.00726992]])

In [8]:
threshold = 0.5  # Set your threshold here
y_train_binary = (y_train > threshold).astype(int)

# Step 6: Train a fraud detection model (Random Forest Classifier)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train_binary)  # Use y_train_binary instead of y_train

# Step 7: Evaluate the model
y_pred = model.predict(X_test)

  model.fit(X_train, y_train_binary)  # Use y_train_binary instead of y_train


In [9]:
# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"Confusion Matrix:\n{conf_matrix}")


Accuracy: 0.9968010236724248
Precision: 0.8333333333333334
Recall: 0.7692307692307693
F1 Score: 0.8
Confusion Matrix:
[[1548    2]
 [   3   10]]


In [26]:
# Step 6: Save the trained model
import joblib
joblib.dump(model, "fraud_detection_model.pkl")

# Step 7: Continuously update the model with new data (assuming you have new data)
# Load the saved model
loaded_model = joblib.load("fraud_detection_model.pkl")

# Load and preprocess new data (replace 'new_data.csv' with your new data file)
new_data = pd.read_csv("new data.csv")
# new_data = new_data.drop(['trans_date_trans_time'], axis=1)
X_new = new_data.drop("is_fraud", axis=1)
y_new = new_data["is_fraud"]


In [27]:
# Separate numeric and non-numeric columns
numeric_cols = new_data.select_dtypes(include=['float64', 'int64']).columns
non_numeric_cols = new_data.select_dtypes(exclude=['float64', 'int64']).columns

# # Create a StandardScaler instance
scaler = StandardScaler()

# Scale only the numeric columns
X_new_numeric = scaler.fit_transform(new_data[numeric_cols])

# # Combine the scaled numeric columns and non-numeric columns back together
X_new = pd.concat([pd.DataFrame(X_new_numeric, columns=numeric_cols), new_data[non_numeric_cols]], axis=1)
# X_new
X_new_numeric = pd.DataFrame(X_new_numeric)
y_new = pd.DataFrame(y_new)

X_new_numeric.fillna(X_new_numeric.mean(), inplace=True)
y_new.fillna(y_new.mean(), inplace=True)
# Convert y_new to binary labels based on a threshold
threshold = 0.5  # Adjust the threshold as needed
y_new_binary = (y_new > threshold).astype(int)

# # Update the model with new data
loaded_model.fit(X_new_numeric, y_new_binary)

# Save the updated model
joblib.dump(loaded_model, "fraud_detection_model.pkl")

  loaded_model.fit(X_new_numeric, y_new_binary)


['fraud_detection_model.pkl']

In [31]:
# Assuming you have a new transaction stored in a variable called 'new_transaction'
prediction = loaded_model.predict(X_new_numeric)
prediction

array([0, 0, 0, ..., 0, 0, 0])

In [41]:

import numpy as np

# Create a NumPy array with values for all 11 features
new_data_array = np.array([
    [100.0, 0, 1, 0, 0, 0, 0, 0, 0, 10, 75.5]
])

# 100.0: This  represent the "transaction_amount" feature.
# 0, 1, 0, 0, 0, 0, 0, 0, 0: These binary values may correspond to one-hot encoded location features (e.g., "transaction_location_A," "transaction_location_B," and so on).
# 10: This may represent the "user_behavior_trans_num" feature.
# 75.5: This might correspond to the "user_behavior_amt" feature.


# # Make predictions on the new data

predictions = loaded_model.predict(new_data_array)

predictions


array([1])