In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

In [89]:
# Load the dataset
df = pd.read_csv("Synthetic Financial Datasets For Fraud Detection.csv")

print(f"Number of samples: {df.shape[0]}")
print(f"Number of features: {df.shape[1]}")
df.head()

Number of samples: 6362620
Number of features: 11


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [4]:
# selecting the columns of numerical type
numeric_columns = df.select_dtypes(include=['int', 'float']).columns

numeric_data = df[numeric_columns]

# pearson corrleation matrix of the numerical data
correlation = numeric_data.corr()

In [5]:
print(f"correlation between all features and Class \n{(correlation['isFraud'].sort_values(ascending=False))}")

correlation between all features and Class 
isFraud           1.000000
amount            0.076688
isFlaggedFraud    0.044109
step              0.031578
oldbalanceOrg     0.010154
newbalanceDest    0.000535
oldbalanceDest   -0.005885
newbalanceOrig   -0.008148
Name: isFraud, dtype: float64


In [6]:
target_correlation = correlation['isFraud'].drop('isFraud')

In [11]:
threshold= 0.02
features=target_correlation[abs(target_correlation) >= threshold].index

In [13]:
features=features[:2]

In [15]:
copy_df=df[features].join(df['type'])
copy_df=copy_df.join(df['isFraud'])
df=copy_df

In [17]:
# Encode the 'type' column
le = LabelEncoder()
df['type'] = le.fit_transform(df['type'])

In [19]:
df = df.dropna(subset=['isFraud'])

In [90]:
df['type'].value_counts()

type
CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
Name: count, dtype: int64

In [21]:
# Separate feature variables and target variable
X = df.drop('isFraud', axis=1)
y = df['isFraud']

In [23]:
# Standardize the data
scaler = MinMaxScaler()
df['amount'] = scaler.fit_transform(df[['amount']])

In [25]:
# Initialize SMOTE and apply it to the training data only
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

In [27]:
df2 = pd.DataFrame(X_smote)
df2['isFraud'] = y_smote
df2

Unnamed: 0,step,amount,type,isFraud
0,1,9.839640e+03,3,0
1,1,1.864280e+03,3,0
2,1,1.810000e+02,4,1
3,1,1.810000e+02,1,1
4,1,1.166814e+04,3,0
...,...,...,...,...
12708809,253,1.620903e+06,1,1
12708810,617,1.000000e+07,1,1
12708811,110,9.041246e+05,1,1
12708812,48,1.367124e+05,2,1


In [29]:
df2.drop_duplicates(inplace=True)
df2.shape

(11260389, 4)

In [30]:
# Separate feature variables and target variable
X = df2.drop('isFraud', axis=1)
y = df2['isFraud']

In [33]:
# Split the data into train (80%), validation (10%), and test sets (10%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

In [62]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, accuracy_score # Import accuracy_score here
from sklearn.metrics import classification_report # you will likely need classification_report too
DT=DecisionTreeClassifier()
DT.fit(X_train, y_train)

In [63]:
# Validate the model on the validation set
y_val_pred = DT.predict(X_val)
print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred)}")
print(f"Validation Classification Report:\n {classification_report(y_val, y_val_pred)}")

# Test the model on the test set
y_test_pred = DT.predict(X_test)
print(f"Test Accuracy: {accuracy_score(y_test, y_test_pred)}")
print(f"Test Classification Report:\n {classification_report(y_test, y_test_pred)}")


Validation Accuracy: 0.975628730443617
Validation Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.97      0.98    634692
           1       0.96      0.98      0.97    491347

    accuracy                           0.98   1126039
   macro avg       0.97      0.98      0.98   1126039
weighted avg       0.98      0.98      0.98   1126039

Test Accuracy: 0.975656260573568
Test Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.97      0.98    634692
           1       0.96      0.98      0.97    491347

    accuracy                           0.98   1126039
   macro avg       0.97      0.98      0.98   1126039
weighted avg       0.98      0.98      0.98   1126039



In [64]:
# Train the model on the test set
y_train_pred = DT.predict(X_train)
print(f"Traint Accuracy: {accuracy_score(y_train, y_train_pred)}")
print(f"Train Classification Report:\n {classification_report(y_train, y_train_pred)}")

Traint Accuracy: 0.9999924514151431
Train Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00   5077536
           1       1.00      1.00      1.00   3930775

    accuracy                           1.00   9008311
   macro avg       1.00      1.00      1.00   9008311
weighted avg       1.00      1.00      1.00   9008311



In [65]:
report_dict = classification_report(y_test, y_test_pred, output_dict=True)
report_dict

{'0': {'precision': 0.9863095314361859,
  'recall': 0.970278497286873,
  'f1-score': 0.978228340327925,
  'support': 634692.0},
 '1': {'precision': 0.9623970673539807,
  'recall': 0.982602926241536,
  'f1-score': 0.9723950413389594,
  'support': 491347.0},
 'accuracy': 0.975656260573568,
 'macro avg': {'precision': 0.9743532993950833,
  'recall': 0.9764407117642044,
  'f1-score': 0.9753116908334423,
  'support': 1126039.0},
 'weighted avg': {'precision': 0.9758753302323206,
  'recall': 0.975656260573568,
  'f1-score': 0.9756829809235604,
  'support': 1126039.0}}

In [56]:
import mlflow

In [68]:
mlflow.set_experiment("Fraud Detection")
mlflow.set_tracking_uri("http://127.0.0.1:5000/")

with mlflow.start_run():
    mlflow.log_metrics({
        'accuracy': report_dict['accuracy'],
        'recall_class_0': report_dict['0']['recall'],
        'recall_class_1': report_dict['1']['recall'],
        'f1_score_macro': report_dict['macro avg']['f1-score']
    })
    mlflow.sklearn.log_model(DT, "Decision Tree")  

2024/10/16 20:50:25 INFO mlflow.tracking._tracking_service.client: 🏃 View run likeable-swan-420 at: http://127.0.0.1:5000/#/experiments/551150578853535820/runs/07bf752539fb48ebb1c8178dc1a16fac.
2024/10/16 20:50:25 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/551150578853535820.


In [75]:
# Log metrics and model in an MLflow run
mlflow.set_experiment("Fraud Detection")
mlflow.set_tracking_uri("http://127.0.0.1:5000/")

with mlflow.start_run():
    # Calculate metrics
    accuracy_val = accuracy_score(y_val, y_val_pred)
    report_dict = classification_report(y_val, y_val_pred, output_dict=True)
    
    # Log metrics
    mlflow.log_metrics({
        'accuracy': accuracy_val,
        'recall_class_0': report_dict['0']['recall'],
        'recall_class_1': report_dict['1']['recall'],
        'f1_score_macro': report_dict['macro avg']['f1-score']
    })
    
    # Log the model
    mlflow.sklearn.log_model(DT, "Decision Tree")

    # Register the model
    model_uri = f"runs:/{mlflow.active_run().info.run_id}/Decision Tree"  # Get the model URI
    client = MlflowClient()
    client.create_registered_model("FraudDetectionModel")  # Give your model a name
    client.create_model_version("FraudDetectionModel", model_uri, "1")  #

2024/10/16 21:04:17 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: FraudDetectionModel, version 1
2024/10/16 21:04:17 INFO mlflow.tracking._tracking_service.client: 🏃 View run gregarious-cow-975 at: http://127.0.0.1:5000/#/experiments/551150578853535820/runs/8afe83e6b23b4627b29a917c299d75a3.
2024/10/16 21:04:17 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/551150578853535820.


In [85]:
import mlflow.pyfunc

# Load the model (use the correct model name and version)
loaded_model = mlflow.pyfunc.load_model("models:/FraudDetectionModel/1")  # or "Production" if applicable



# Make predictions
predictions = loaded_model.predict([[1, 9839.64, 3]])
print(predictions)


[0]


