In [50]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
import joblib

In [16]:
import os
print(os.getcwd())

/home/db7219e7-e34e-4775-bf62-39d6b450178c/C:\Temp


In [52]:
# Step 1: Load and Preprocess Data
df = pd.read_csv('fraud_dataset_example.csv')

In [58]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [28]:
# Handle missing values if any
df.fillna(0, inplace=True)

In [30]:
# Feature Engineering
df['balance_diff_orig'] = df['oldbalanceOrg'] - df['newbalanceOrig']
df['balance_diff_dest'] = df['oldbalanceDest'] - df['newbalanceDest']
df['amount_balance_ratio'] = df['amount'] / (df['oldbalanceOrg'] + 1)  # +1 to avoid division by zero

In [60]:
# Encoding Categorical Variables
df = pd.get_dummies(df, columns=['type'], drop_first=True)

In [62]:
# Define features and target
X = df.drop(['isFraud', 'isFlaggedFraud', 'nameOrig', 'nameDest'], axis=1)
y = df['isFraud']

In [64]:
# Step 2: Model Training and Evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [66]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [67]:
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

In [69]:
print(classification_report(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_proba))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     30452
           1       0.93      0.44      0.60        32

    accuracy                           1.00     30484
   macro avg       0.97      0.72      0.80     30484
weighted avg       1.00      1.00      1.00     30484

ROC-AUC Score: 0.99893223351504


In [72]:
# Save the model
joblib.dump(model, 'fraud_detection_model.pkl')

['fraud_detection_model.pkl']

In [74]:
# Step 3: Export Data for Power BI
df['prediction'] = model.predict(X)
df['prediction_proba'] = model.predict_proba(X)[:, 1]

df.to_csv('fraud_data_with_predictions.csv', index=False)

In [76]:
# Export model metrics
metrics = {
    'accuracy': accuracy_score(y_test, y_pred),
    'precision': precision_score(y_test, y_pred),
    'recall': recall_score(y_test, y_pred),
    'f1_score': f1_score(y_test, y_pred),
    'roc_auc': roc_auc_score(y_test, y_pred_proba)
}

# Convert metrics dictionary to a DataFrame
metrics_df = pd.DataFrame(list(metrics.items()), columns=['Metric', 'Value'])

# Save metrics to a CSV file
metrics_df.to_csv('model_metrics.csv', index=False)