In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

print("Libraries imported")
df = pd.read_csv("Fraud.csv")
print("Dataset loaded")


Libraries imported
Dataset loaded


In [34]:
df.head(5)
# DISPLAY THE FIRST 5 ROWS OF THE DATASET

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [37]:
df.isnull().sum()
# will give how many missing values each column has.
# Output =0 for all columns, so no missing values.

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [38]:
df[['amount', 'oldbalanceOrg', 'newbalanceOrig','oldbalanceDest', 'newbalanceDest']].describe()
# Outliers were analyzed using descriptive statistics.
# Outliers are data values that are too high or too low compared to normal data.
# some transactions have extremely high values,especially in transaction amount 
# and balance-related features.

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest
count,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0
mean,179861.9,833883.1,855113.7,1100702.0,1224996.0
std,603858.2,2888243.0,2924049.0,3399180.0,3674129.0
min,0.0,0.0,0.0,0.0,0.0
25%,13389.57,0.0,0.0,0.0,0.0
50%,74871.94,14208.0,0.0,132705.7,214661.4
75%,208721.5,107315.2,144258.4,943036.7,1111909.0
max,92445520.0,59585040.0,49585040.0,356015900.0,356179300.0


In [None]:
df[['amount', 'oldbalanceOrg', 'newbalanceOrig','oldbalanceDest', 'newbalanceDest']].corr()
# Multicollinearity was checked using Correlation analysis among numerical features.
# Strong correlations were observed between balance-related variables. 
# However, since these features provide important information for fraud detection,
# they were retained for model training.

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest
amount,1.0,-0.002762,-0.007861,0.294137,0.459304
oldbalanceOrg,-0.002762,1.0,0.998803,0.066243,0.042029
newbalanceOrig,-0.007861,0.998803,1.0,0.067812,0.041837
oldbalanceDest,0.294137,0.066243,0.067812,1.0,0.976569
newbalanceDest,0.459304,0.042029,0.041837,0.976569,1.0


<!-- Q2. Describe your fraud detection model in elaboration 
 
A supervised machine learning classification model was developed using Logistic Regression to detect fraudulent transactions. The model was trained on historical transaction data labeled as fraudulent or non-fraudulent. Logistic Regression was selected due to its simplicity, interpretability, and suitability for binary classification problems such as fraud detection.-->

<!-- Q3. How did you select variables to be included in the model?

Feature selection was performed using domain knowledge and data relevance. Identifier columns such as sender and receiver IDs were removed because they do not contribute to fraud prediction. The target variable isFraud was separated, and transaction-related features such as transaction type, amount, and balance information were used for model training. -->

In [None]:
# Drop identifier columns
df_model = df.drop(['nameOrig', 'nameDest'], axis=1)

# Encode categorical variable
df_model['type'] = df_model['type'].astype('category').cat.codes

# Separate features and target
X = df_model.drop('isFraud', axis=1)
y = df_model['isFraud']

print("Features selected and encoded")


Features selected and encoded


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Train-test split completed")



Train-test split completed


In [42]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

print("Model trained successfully")


Model trained successfully


In [43]:
from sklearn.metrics import confusion_matrix, classification_report
y_pred = model.predict(X_test)

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Confusion Matrix:
[[1270823      81]
 [    842     778]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1270904
           1       0.91      0.48      0.63      1620

    accuracy                           1.00   1272524
   macro avg       0.95      0.74      0.81   1272524
weighted avg       1.00      1.00      1.00   1272524



<!-- Q4. Demonstrate the performance of the model by using best set of tools.

The model performance was evaluated using 
a confusion matrix,precision, recall, and F1-score. 
Since the dataset is highly imbalanced, accuracy alone is not sufficient.
Recall was given higher importance to ensure fraudulent transactions are correctly identified.  -->

<!-- Q5. What are the key factors that predict fraudulent customers?

The key factors that predict fraudulent transactions include transaction amount,  transaction type, and sudden changes in account balances. Fraudulent transactions often involve unusually high amounts and abnormal balance reductions in the sender’s account followed by increases in the recipient’s balance. -->

<!-- Q6.  Do these factors make sense? If yes, how? If not, how not?

Yes, these factors make sense because:
- Transaction amount and frequency are key indicators of fraudulent behavior.
- Geographic location helps identify suspicious transactions that deviate from normal user behavior.
- Time of transaction can reveal patterns inconsistent with typical user activity.
- Device information and IP address can help detect unauthorized access or unusual login patterns.  -->

<!-- Q7. What kind of prevention should be adopted while company updates its infrastructure?

To prevent fraud,the company should: 
-implement real-time transaction monitoring systems,
-set transaction limits,
-multi-factor authentication.
-Behavioral analytics and continuous monitoring of suspicious activities can further strengthen fraud prevention.
-Machine learning models should be retrained periodically to adapt to evolving fraud patterns.-->

<!-- Q8. Assuming these actions have been implemented, how would you determine if they work?

The effectiveness of fraud prevention strategies can be measured by
-monitoring a reduction in fraudulent transactions.
-improvements in recall and precision scores, and a decrease in financial losses due to fraud 
-Customer complaints and false positive rates can also be tracked to ensure that genuine transactions are not adversely affected. -->