In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import warnings
# Silence every Python warning for the remainder of the session.
# NOTE(review): the blanket 'ignore' filter applies to all warning categories,
# so deprecation or data-quality warnings raised by later cells will not be
# shown; consider restricting it to a specific category or module.
warnings.filterwarnings('ignore')


In [2]:
# Read the head of the credit-card CSV and report the class balance.
# NOTE(review): nrows=100000 keeps the first 100,000 rows of the file, not a
# random sample; confirm the file is not ordered in a way that skews Class.
print("Loading dataset...")
df = pd.read_csv('creditcard_2023.csv', nrows=100000)

# Count once and reuse for both the absolute figure and the percentage.
n_total = len(df)
n_fraud = int((df['Class'] == 1).sum())

print(f"Total transactions: {n_total}")
print(f"Fraudulent transactions: {n_fraud}")
print(f"Fraud ratio: {n_fraud / n_total * 100:.2f}%")


Loading dataset...
Total transactions: 100000
Fraudulent transactions: 223
Fraud ratio: 0.22%


In [3]:
# Derive simple digit-based features from the transaction ID.
print("\nExtracting features from ID...")

# One string conversion shared by every feature below.
id_text = df['id'].astype(str)

df['id_length'] = id_text.map(len)
df['id_first_digit'] = id_text.str[0].astype(int)
df['id_last_digit'] = id_text.str[-1].astype(int)
# Sum of the characters of the ID interpreted as individual digits.
df['id_sum_digits'] = id_text.map(lambda text: sum(map(int, text)))
# Number of distinct characters appearing in the ID string.
df['id_num_unique_digits'] = id_text.map(lambda text: len(set(text)))



Extracting features from ID...


In [4]:
# Compare mean ID length and mean digit sum between the two classes.
print("\nID patterns analysis:")

is_fraud = df['Class'] == 1
is_legit = df['Class'] == 0
id_lengths = df['id_length']
digit_sums = df['id_sum_digits']

print("Average ID length (fraud):", id_lengths[is_fraud].mean())
print("Average ID length (legitimate):", id_lengths[is_legit].mean())
print("Average sum of digits (fraud):", digit_sums[is_fraud].mean())
print("Average sum of digits (legitimate):", digit_sums[is_legit].mean())



ID patterns analysis:
Average ID length (fraud): 4.820627802690583
Average ID length (legitimate): 4.889052587269611
Average sum of digits (fraud): 21.780269058295964
Average sum of digits (legitimate): 22.50160858714934


In [5]:
# Three most frequent leading ID digits, computed separately per class.
first_digit = df['id_first_digit']
fraud_first_digits = first_digit[df['Class'] == 1].value_counts().iloc[:3]
legit_first_digits = first_digit[df['Class'] == 0].value_counts().iloc[:3]

# .tolist() yields plain Python ints so the printed lists stay readable.
print("\nMost common first digits in fraud IDs:", fraud_first_digits.index.tolist())
print("Most common first digits in legitimate IDs:", legit_first_digits.index.tolist())



Most common first digits in fraud IDs: [1, 4, 6]
Most common first digits in legitimate IDs: [2, 3, 5]


In [6]:
# Three most frequent trailing ID digits, computed separately per class.
last_digit = df['id_last_digit']
fraud_last_digits = last_digit[df['Class'] == 1].value_counts().iloc[:3]
legit_last_digits = last_digit[df['Class'] == 0].value_counts().iloc[:3]

# .tolist() yields plain Python ints so the printed lists stay readable.
print("\nMost common last digits in fraud IDs:", fraud_last_digits.index.tolist())
print("Most common last digits in legitimate IDs:", legit_last_digits.index.tolist())



Most common last digits in fraud IDs: [1, 7, 9]
Most common last digits in legitimate IDs: [5, 2, 3]


In [7]:
# Seed the model's feature list with the ID-derived columns; a later cell
# extends this list with further columns before fitting.
features = [
    'id_length',
    'id_first_digit',
    'id_last_digit',
    'id_sum_digits',
    'id_num_unique_digits',
]
print("\nBuilding ID-based fraud detection model...")



Building ID-based fraud detection model...


In [8]:
# Extend the feature list with the V-prefixed columns (this excludes Amount),
# then split the data and fit the classifier.
#
# Only columns with more than one distinct value are kept, since a constant
# column carries no signal for the model.
v_columns = [
    col for col in df.columns
    if col.startswith('V') and df[col].nunique() > 1
]

# `features` is created in an earlier cell and mutated here. Adding only the
# names that are not already present keeps this cell idempotent: re-running it
# no longer appends every V column a second time (which would duplicate the
# columns in X and in the feature-importance table).
features += [col for col in v_columns if col not in features]

X = df[features]
y = df['Class']

# Stratify on the label so the train and test splits keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

print("Training Random Forest classifier...")
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)


Training Random Forest classifier...


In [9]:
# Score the fitted classifier on the held-out split and print both summaries.
y_pred = clf.predict(X_test)

test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("\nModel performance on test set:")
print(test_confusion)
print("\nClassification report:")
print(test_report)



Model performance on test set:
[[29929     4]
 [    7    60]]

Classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     29933
           1       0.94      0.90      0.92        67

    accuracy                           1.00     30000
   macro avg       0.97      0.95      0.96     30000
weighted avg       1.00      1.00      1.00     30000



In [10]:
# Rank the model inputs by the importances reported by the fitted classifier.
#
# Feature names are taken from the training frame's columns rather than from
# the mutable `features` list: that list is modified by other cells, so after
# an out-of-order re-run it may no longer match the columns `clf` was fitted
# on, which would raise a length mismatch or mislabel the rows.
feature_importance = pd.DataFrame({
    'Feature': X_train.columns.tolist(),
    'Importance': clf.feature_importances_
}).sort_values('Importance', ascending=False)

print("\nFeature importance:")
print(feature_importance.head(10))



Feature importance:
   Feature  Importance
18     V14    0.148038
21     V17    0.145795
16     V12    0.120628
15     V11    0.069875
20     V16    0.066143
14     V10    0.065691
13      V9    0.044680
30     V26    0.033918
7       V3    0.028118
8       V4    0.027271


In [11]:
# Persist the fitted classifier to disk; joblib.dump is the cell's last
# expression, so its return value is shown as the cell output.
print("\nSaving model...")
joblib.dump(value=clf, filename='id_fraud_detection_model.pkl')



Saving model...


['id_fraud_detection_model.pkl']

In [12]:
# Turn the fraud-class ID statistics into a short list of printed heuristics.
# Relies on fraud_first_digits / fraud_last_digits computed in earlier cells.
fraud_ids = df.loc[df['Class'] == 1]

# Digits that occurred most often among fraudulent IDs.
high_risk_first_digits = fraud_first_digits.index.tolist()
high_risk_last_digits = fraud_last_digits.index.tolist()

# Mean ID length and mean digit sum over the fraud rows only.
fraud_id_length_mean = fraud_ids['id_length'].mean()
fraud_sum_mean = fraud_ids['id_sum_digits'].mean()

print("\nDerived Fraud Detection Rules:")
print("Based on the analysis, these ID patterns could indicate fraud:")
print(f"- First digit is one of: {high_risk_first_digits}")
print(f"- Last digit is one of: {high_risk_last_digits}")
# The means are rounded to the nearest integer for display.
print(f"- ID length is approximately {int(round(fraud_id_length_mean))}")
print(f"- Sum of digits is approximately {int(round(fraud_sum_mean))}")



Derived Fraud Detection Rules:
Based on the analysis, these ID patterns could indicate fraud:
- First digit is one of: [1, 4, 6]
- Last digit is one of: [1, 7, 9]
- ID length is approximately 5
- Sum of digits is approximately 22
