## Imports

In [1]:
import pandas as pd
import numpy as np
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

## Reading CSV and making it into a dataframe 

In [16]:
data = pd.read_csv('DATATHON_EVENT_DATASET.csv')


## Checking the types of data in the frame

In [17]:
data.dtypes

Time                             int64
Transaction_Type                object
Amount                         float64
Origin_ID                       object
Initial_Origin_Balance         float64
Final_Origin_Balance           float64
Destination_ID                  object
Initial_Destination_Balance    float64
Final_Destination_Balance      float64
Fraud                           object
Expected_Fraud                  object
dtype: object

## Checking the Expected Fraud

In [18]:
data["Expected_Fraud"].describe()

count     53500
unique        2
top          No
freq      53484
Name: Expected_Fraud, dtype: object

## Checking the Time - Bound must be between 1 and 744

In [19]:
data["Time"].describe()

count    58150.000000
mean       260.806569
std        161.090714
min          1.000000
25%        157.000000
50%        253.000000
75%        354.000000
max        743.000000
Name: Time, dtype: float64

## Checking the Unique items in the Transaction Types

In [20]:
data["Transaction_Type"].unique() 

array(['CASH_OUT', 'PAYMENT', 'CASH_IN', 'TRANSFER', 'DEBIT',
       'CASH_OUUUT', 'Trans_fer'], dtype=object)

## Checking Minimum and Maximum Amount of Transaction

In [21]:
data["Amount"].describe()

count    5.815000e+04
mean     3.613641e+05
std      1.163212e+06
min      0.000000e+00
25%      1.625524e+04
50%      9.645477e+04
75%      2.558266e+05
max      6.988673e+07
Name: Amount, dtype: float64

## Converting the ID's of Origin and Destination
 1. If the id starts with "C" then it is marked as 0
 2. If the id starts with "D" then it is marked as 1

In [22]:
data['Origin_ID'] = data['Origin_ID'].astype(str)
data['Destination_ID'] = data['Destination_ID'].astype(str)
for index, i in enumerate(data["Origin_ID"]):
    if i[0].lower() == 'c':
        data.at[index, 'Origin_ID'] = 0
    elif i[0].lower() == 'm':
        data.at[index, 'Origin_ID'] = 1

for index, i in enumerate(data["Destination_ID"]):
    if i[0].lower() == 'c':
        data.at[index, 'Destination_ID'] = 0
    elif i[0].lower() == 'm':
        data.at[index, 'Destination_ID'] = 1
data

Unnamed: 0,Time,Transaction_Type,Amount,Origin_ID,Initial_Origin_Balance,Final_Origin_Balance,Destination_ID,Initial_Destination_Balance,Final_Destination_Balance,Fraud,Expected_Fraud
0,162,CASH_OUT,183806.32,0,19391.00,0.00,0,382572.19,566378.51,No,No
1,137,PAYMENT,521.37,0,0.00,0.00,1,0.00,0.00,No,No
2,179,PAYMENT,3478.18,0,19853.00,16374.82,1,0.00,0.00,No,No
3,355,PAYMENT,1716.05,0,5769.17,4053.13,1,0.00,0.00,No,No
4,354,CASH_IN,253129.93,0,1328499.49,1581629.42,0,2713220.48,2460090.55,No,No
...,...,...,...,...,...,...,...,...,...,...,...
58145,743,CASH_OUT,339682.13,0,339682.13,0.00,0,0.00,339682.13,Yes,No
58146,743,TRANSFER,6311409.28,0,6311409.28,0.00,0,0.00,0.00,Yes,No
58147,743,CASH_OUT,6311409.28,0,6311409.28,0.00,0,68488.84,6379898.11,Yes,No
58148,743,TRANSFER,850002.52,0,850002.52,0.00,0,0.00,0.00,Yes,No


## Checking the Intial and Final Balances of the Origin and Destination 
1. If the Id is Merchant then the final and initial need not be checked
2. If the Id is Customer then we remove the rows with the inital and final balance as ZERO since it doesn't make any sense

In [23]:
data = data[~((data['Origin_ID'] == 0) & (data['Initial_Origin_Balance'] == 0) & (data['Final_Origin_Balance'] == 0))]
data = data[~((data['Destination_ID'] == 0) & (data['Initial_Destination_Balance'] == 0) & (data['Final_Destination_Balance'] == 0))]
data

Unnamed: 0,Time,Transaction_Type,Amount,Origin_ID,Initial_Origin_Balance,Final_Origin_Balance,Destination_ID,Initial_Destination_Balance,Final_Destination_Balance,Fraud,Expected_Fraud
0,162,CASH_OUT,183806.32,0,19391.00,0.00,0,382572.19,566378.51,No,No
2,179,PAYMENT,3478.18,0,19853.00,16374.82,1,0.00,0.00,No,No
3,355,PAYMENT,1716.05,0,5769.17,4053.13,1,0.00,0.00,No,No
4,354,CASH_IN,253129.93,0,1328499.49,1581629.42,0,2713220.48,2460090.55,No,No
5,321,PAYMENT,24768.57,0,21573.00,0.00,1,0.00,0.00,No,No
...,...,...,...,...,...,...,...,...,...,...,...
58141,742,CASH_OUT,63416.99,0,63416.99,0.00,0,276433.18,339850.17,Yes,No
58143,743,CASH_OUT,1258818.82,0,1258818.82,0.00,0,503464.50,1762283.33,Yes,
58145,743,CASH_OUT,339682.13,0,339682.13,0.00,0,0.00,339682.13,Yes,No
58147,743,CASH_OUT,6311409.28,0,6311409.28,0.00,0,68488.84,6379898.11,Yes,No


## Checking the intial and final balances
1. If the CASH IN then I remove if the Final Origin Balance and Initial Destination Balance is ZERO
2. If the CASH OUT then I remove if the Inital Origin Balance and Final Destination Balance is ZERO

In [24]:
data = data[
    ~(
        # For CASH_IN transactions
        ((data["Transaction_Type"] == 'CASH_IN') & 
         (data['Final_Origin_Balance'] == 0) |
         (data['Initial_Destination_Balance'] == 0)) |
        # For CASH_OUT transactions
        ((data['Transaction_Type'] == 'CASH_OUT') & 
         (data['Final_Destination_Balance'] == 0) | 
         (data['Initial_Origin_Balance'] == 0))
    )
]
data

Unnamed: 0,Time,Transaction_Type,Amount,Origin_ID,Initial_Origin_Balance,Final_Origin_Balance,Destination_ID,Initial_Destination_Balance,Final_Destination_Balance,Fraud,Expected_Fraud
0,162,CASH_OUT,183806.32,0,19391.00,0.00,0,382572.19,566378.51,No,No
4,354,CASH_IN,253129.93,0,1328499.49,1581629.42,0,2713220.48,2460090.55,No,No
9,347,CASH_IN,266999.97,0,7411271.89,7678271.86,0,386812.88,119812.91,No,No
10,15,CASH_IN,53230.73,0,895081.62,948312.35,0,7015146.49,6961915.77,No,No
14,41,DEBIT,819.35,0,274494.00,273674.65,0,923509.57,924328.92,No,
...,...,...,...,...,...,...,...,...,...,...,...
58139,742,CASH_OUT,258355.42,0,258355.42,0.00,0,25176.67,283532.09,Yes,No
58141,742,CASH_OUT,63416.99,0,63416.99,0.00,0,276433.18,339850.17,Yes,No
58143,743,CASH_OUT,1258818.82,0,1258818.82,0.00,0,503464.50,1762283.33,Yes,
58147,743,CASH_OUT,6311409.28,0,6311409.28,0.00,0,68488.84,6379898.11,Yes,No


## Encoding data for the model
1. Variables used
   - Transaction_Type
   - Origin_ID
   - Destination_ID
   - Fraud
   
- Removing the Expecetd_Fraud column since it doesnt make any sense- all the values are NO 

In [25]:
# Drop the 'Expected_Fraud' column as it contains many missing values and isn't critical
data_cleaned = data.drop(columns=['Expected_Fraud'])

# Encode categorical variables ('Transaction_Type', 'Origin_ID', 'Destination_ID', 'Fraud')
label = LabelEncoder()

data_cleaned['Transaction_Type'] = label.fit_transform(data_cleaned['Transaction_Type'])
data_cleaned['Origin_ID'] = label.fit_transform(data_cleaned['Origin_ID'])
data_cleaned['Destination_ID'] = label.fit_transform(data_cleaned['Destination_ID'])
data_cleaned['Fraud'] = label.fit_transform(data_cleaned['Fraud'])

## Model Fitting and Scoring

In [11]:
# Define the features (X) and target (y)
x = data_cleaned.drop('Fraud',axis=1)
y = data_cleaned['Fraud']

# Split the data into training and testing sets (80% training, 20% testing)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
model = RandomForestClassifier(n_estimators = 100)
model.fit(x_train,y_train)
model.score(x_test,y_test)

0.9906765479321061

## Performance Metrics

In [2]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

# Load a sample dataset
# Define your model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Perform 5-Fold Cross-Validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Cross-Validation with Accuracy
accuracy_scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
print("Accuracy Scores:", accuracy_scores)
print("Mean Accuracy Score:", accuracy_scores.mean())

# Cross-Validation with Precision (weighted)
precision_scorer = make_scorer(precision_score, average='weighted')
precision_scores = cross_val_score(model, X, y, cv=cv, scoring=precision_scorer)
print("Precision Scores:", precision_scores)
print("Mean Precision Score:", precision_scores.mean())


# Cross-Validation with Recall (weighted)
recall_scorer = make_scorer(recall_score, average='weighted')
recall_scores = cross_val_score(model, X, y, cv=cv, scoring=recall_scorer)
print("Recall Scores:", recall_scores)
print("Mean Recall Score:", recall_scores.mean())

# Cross-Validation with F1 Score (weighted)
f1_scorer = make_scorer(f1_score, average='weighted')
f1_scores = cross_val_score(model, X, y, cv=cv, scoring=f1_scorer)
print("F1 Scores:", f1_scores)
print("Mean F1 Score:", f1_scores.mean())


Accuracy Scores: [0.96666667 0.96666667 0.93333333 0.96666667 0.9       ]
Mean Accuracy Score: 0.9466666666666667
Precision Scores: [0.96969697 0.96969697 0.94444444 0.96969697 0.9023569 ]
Mean Precision Score: 0.9511784511784512
Recall Scores: [0.96666667 0.96666667 0.93333333 0.96666667 0.9       ]
Mean Recall Score: 0.9466666666666667
F1 Scores: [0.96658312 0.96658312 0.93265993 0.96658312 0.89974937]
Mean F1 Score: 0.9464317359054201


## Predicting Fraud for test dataset

In [None]:
import pandas as pd

# Assuming you have already trained your model and split the data
# Generate predictions
model.fit(x_train,y_train)
y_pred = model.predict(x_test)

# Convert predictions to a pandas Series
y_pred_series = pd.Series(y_pred, name='Predictions')

# Display the Series
print(y_pred_series)


## ROC Curve

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# Get probabilities for positive class
y_proba = model.predict_proba(x_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_proba)
fpr, tpr, thresholds = roc_curve(y_test, y_proba)

print("AUC-ROC:", roc_auc)

# Plot ROC Curve
plt.plot(fpr, tpr, label=f'ROC curve (area = {roc_auc:.2f})')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()


## Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming you have already trained your model and split the data
# Get the predictions from your model
y_pred = model.predict(x_test)

# Generate the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Print the confusion matrix
print("Confusion Matrix:")
print(cm)

# Optional: Visualize the confusion matrix using a heatmap
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Not Fraud', 'Fraud'], yticklabels=['Not Fraud', 'Fraud'])
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion Matrix')
plt.show()


## Exporting the Model as .pkl(pickle)

In [37]:
with open('Model.pkl','wb') as file:
    pickle.dump(model,file)