In [21]:
# ===============================================
# Candidate Name: Anki Prakash
# Assignment: Predicting Fraud Transactions
# Submission Date: 24th August 2025
# Email: ankiprakash03@gmail.com

# ===============================================

# This notebook contains my solutions for the Accredian screening assignment.


In [23]:
# The assignment covers data analysis, visualization, and machine learning tasks as outlined in the instructions.

In [3]:
import os

# List all files in current directory
os.listdir()


['Data Dictionary.txt',
 'Fraud_Detection.ipynb',
 '.ipynb_checkpoints',
 'Fraud.csv']

In [None]:
import pandas as pd

# Load the full dataset
df = pd.read_csv('Fraud.csv')  # use exact file name from your folder

# Sample 200,000 rows for sharing (or less if you want faster execution)
df_sample = df.sample(n=200000, random_state=42)

# Save the sample dataset
df_sample.to_csv('fraud_sample.csv', index=False)
print("Sample dataset created successfully!")


In [1]:
!pip install pandas



In [2]:
import pandas as pd

df = pd.read_csv("Fraud.csv")

df.head()





Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [3]:

print("Shape of dataset:", df.shape)

Shape of dataset: (6362620, 11)


In [4]:
# 1. Column names & data types
print("\nColumn info:")
print(df.info())


Column info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB
None


In [5]:
# 2. Check missing values
print("\nMissing values:")
print(df.isnull().sum())


Missing values:
step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64


In [6]:
# 3. Summary statistics
print("\nSummary statistics:")
print(df.describe())


Summary statistics:
               step        amount  oldbalanceOrg  newbalanceOrig  \
count  6.362620e+06  6.362620e+06   6.362620e+06    6.362620e+06   
mean   2.433972e+02  1.798619e+05   8.338831e+05    8.551137e+05   
std    1.423320e+02  6.038582e+05   2.888243e+06    2.924049e+06   
min    1.000000e+00  0.000000e+00   0.000000e+00    0.000000e+00   
25%    1.560000e+02  1.338957e+04   0.000000e+00    0.000000e+00   
50%    2.390000e+02  7.487194e+04   1.420800e+04    0.000000e+00   
75%    3.350000e+02  2.087215e+05   1.073152e+05    1.442584e+05   
max    7.430000e+02  9.244552e+07   5.958504e+07    4.958504e+07   

       oldbalanceDest  newbalanceDest       isFraud  isFlaggedFraud  
count    6.362620e+06    6.362620e+06  6.362620e+06    6.362620e+06  
mean     1.100702e+06    1.224996e+06  1.290820e-03    2.514687e-06  
std      3.399180e+06    3.674129e+06  3.590480e-02    1.585775e-03  
min      0.000000e+00    0.000000e+00  0.000000e+00    0.000000e+00  
25%      0.00000

In [7]:
# 4. Fraud vs Non-Fraud counts
fraud_counts = df['isFraud'].value_counts()
print("Fraud vs Non-Fraud:\n", fraud_counts)

# 5. Percentage distribution
fraud_percentage = (fraud_counts / len(df)) * 100
print("\nFraud percentage:\n", fraud_percentage)



Fraud vs Non-Fraud:
 isFraud
0    6354407
1       8213
Name: count, dtype: int64

Fraud percentage:
 isFraud
0    99.870918
1     0.129082
Name: count, dtype: float64


In [8]:
# 6. Fraud distribution by transaction type
fraud_by_type = df.groupby('type')['isFraud'].sum()
total_by_type = df['type'].value_counts()

fraud_summary = pd.DataFrame({
    'Total Transactions': total_by_type,
    'Fraud Transactions': fraud_by_type,
    'Fraud Rate (%)': (fraud_by_type / total_by_type * 100).round(4)
})

print(fraud_summary)


          Total Transactions  Fraud Transactions  Fraud Rate (%)
type                                                            
CASH_IN              1399284                   0          0.0000
CASH_OUT             2237500                4116          0.1840
DEBIT                  41432                   0          0.0000
PAYMENT              2151495                   0          0.0000
TRANSFER              532909                4097          0.7688


In [9]:
#dropping nameOrig and nameDest. They are just IDs of accounts (not useful for prediction, too many unique values.

In [10]:
# 7. Drop useless ID columns
df = df.drop(['nameOrig', 'nameDest'], axis=1)

# 8. Confirm the new shape
print("New shape:", df.shape)


New shape: (6362620, 9)


In [11]:
# 9. Check consistency: oldbalanceOrg - amount vs newbalanceOrg
df['diff_orig'] = df['oldbalanceOrg'] - df['amount'] - df['newbalanceOrig']

# 10. Check consistency: oldbalanceDest + amount vs newbalanceDest
df['diff_dest'] = df['oldbalanceDest'] + df['amount'] - df['newbalanceDest']

# 11. Look at unusual values
print("Unique values in diff_orig:", df['diff_orig'].nunique())
print("Unique values in diff_dest:", df['diff_dest'].nunique())

# 12. Show some inconsistent rows
df_inconsistent = df[(df['diff_orig'] != 0) | (df['diff_dest'] != 0)]
print("\nNumber of inconsistent rows:", len(df_inconsistent))
df_inconsistent.head()


Unique values in diff_orig: 4717358
Unique values in diff_dest: 3170212

Number of inconsistent rows: 6240563


Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,diff_orig,diff_dest
0,1,PAYMENT,9839.64,170136.0,160296.36,0.0,0.0,0,0,0.0,9839.64
1,1,PAYMENT,1864.28,21249.0,19384.72,0.0,0.0,0,0,0.0,1864.28
2,1,TRANSFER,181.0,181.0,0.0,0.0,0.0,1,0,0.0,181.0
3,1,CASH_OUT,181.0,181.0,0.0,21182.0,0.0,1,0,0.0,21363.0
4,1,PAYMENT,11668.14,41554.0,29885.86,0.0,0.0,0,0,0.0,11668.14


In [12]:
# 13. Convert transaction type to dummy variables
df = pd.get_dummies(df, columns=['type'], drop_first=True)

# 14. Check the new shape and columns
print("New shape:", df.shape)
print("\nColumns:", df.columns.tolist())


New shape: (6362620, 14)

Columns: ['step', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest', 'isFraud', 'isFlaggedFraud', 'diff_orig', 'diff_dest', 'type_CASH_OUT', 'type_DEBIT', 'type_PAYMENT', 'type_TRANSFER']


In [13]:
!pip install scikit-learn




In [14]:
## - from sklearn.model_selection import train_test_split

# Features (drop target column)
## - X = df.drop(['isFraud'], axis=1)

# Target
## - y = df['isFraud']

# Split data: 70% training, 30% testing
## - X_train, X_test, y_train, y_test = train_test_split(
## -    X, y, test_size=0.3, random_state=42, stratify=y
## - )

## - print("Train shape:", X_train.shape, y_train.shape)
## - print("Test shape:", X_test.shape, y_test.shape)   


In [15]:
# **Baseline Model (logistic regression)**


In [16]:
# Use a smaller sample for quick training
df_sample = df.sample(n=200000, random_state=42)

X = df_sample.drop(['isFraud'], axis=1)
y = df_sample['isFraud']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

print("Sampled Train shape:", X_train.shape, y_train.shape)
print("Sampled Test shape:", X_test.shape, y_test.shape)


Sampled Train shape: (140000, 13) (140000,)
Sampled Test shape: (60000, 13) (60000,)


In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Logistic Regression with class weight to handle imbalance
log_model = LogisticRegression(max_iter=100, solver='liblinear', class_weight='balanced')

print("Training model on sample...")
log_model.fit(X_train, y_train)

# Predictions
y_pred = log_model.predict(X_test)

# Evaluation
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, digits=4))
print("\nROC-AUC Score:", roc_auc_score(y_test, y_pred))


Training model on sample...

Confusion Matrix:
 [[57692  2228]
 [    5    75]]

Classification Report:
               precision    recall  f1-score   support

           0     0.9999    0.9628    0.9810     59920
           1     0.0326    0.9375    0.0629        80

    accuracy                         0.9628     60000
   macro avg     0.5162    0.9502    0.5220     60000
weighted avg     0.9986    0.9628    0.9798     60000


ROC-AUC Score: 0.9501585447263018


In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Train Random Forest (limit trees for speed)
rf_model = RandomForestClassifier(
    n_estimators=100, 
    max_depth=10, 
    class_weight='balanced',
    random_state=42,
    n_jobs=-1  # use all CPU cores
)

print("Training Random Forest...")
rf_model.fit(X_train, y_train)

# Predictions
y_pred_rf = rf_model.predict(X_test)

# Evaluation
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf, digits=4))
print("\nROC-AUC Score:", roc_auc_score(y_test, y_pred_rf))

# Feature importance
import pandas as pd
feat_importances = pd.Series(rf_model.feature_importances_, index=X_train.columns)
print("\nTop Features:\n", feat_importances.sort_values(ascending=False).head(10))


Training Random Forest...

Confusion Matrix:
 [[59920     0]
 [    0    80]]

Classification Report:
               precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000     59920
           1     1.0000    1.0000    1.0000        80

    accuracy                         1.0000     60000
   macro avg     1.0000    1.0000    1.0000     60000
weighted avg     1.0000    1.0000    1.0000     60000


ROC-AUC Score: 1.0

Top Features:
 diff_orig         0.394982
newbalanceOrig    0.160183
oldbalanceOrg     0.114913
type_TRANSFER     0.065454
type_PAYMENT      0.064479
amount            0.063046
step              0.045144
diff_dest         0.028290
newbalanceDest    0.022861
type_CASH_OUT     0.022069
dtype: float64


In [19]:
!pip install xgboost




In [20]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Train XGBoost on sampled data
xgb_model = XGBClassifier(
    n_estimators=200,
    max_depth=8,
    learning_rate=0.1,
    scale_pos_weight=len(y_train[y_train==0]) / len(y_train[y_train==1]), # handle imbalance
    random_state=42,
    n_jobs=-1,
    use_label_encoder=False,
    eval_metric='logloss'
)

print("Training XGBoost...")
xgb_model.fit(X_train, y_train)

# Predictions
y_pred_xgb = xgb_model.predict(X_test)

# Evaluation
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))
print("\nClassification Report:\n", classification_report(y_test, y_pred_xgb, digits=4))
print("\nROC-AUC Score:", roc_auc_score(y_test, y_pred_xgb))

# Feature importance
import pandas as pd
feat_importances = pd.Series(xgb_model.feature_importances_, index=X_train.columns)
print("\nTop Features:\n", feat_importances.sort_values(ascending=False).head(10))


Training XGBoost...


Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.
Parameters: { "use_label_encoder" } are not used.




Confusion Matrix:
 [[59907    13]
 [    1    79]]

Classification Report:
               precision    recall  f1-score   support

           0     1.0000    0.9998    0.9999     59920
           1     0.8587    0.9875    0.9186        80

    accuracy                         0.9998     60000
   macro avg     0.9293    0.9936    0.9592     60000
weighted avg     0.9998    0.9998    0.9998     60000


ROC-AUC Score: 0.9936415220293724

Top Features:
 diff_orig         0.670655
newbalanceOrig    0.299523
diff_dest         0.019600
type_PAYMENT      0.004341
amount            0.002718
oldbalanceDest    0.001158
type_DEBIT        0.000840
oldbalanceOrg     0.000632
type_CASH_OUT     0.000230
type_TRANSFER     0.000150
dtype: float32


## Q2. Describe Your Fraud Detection Model  

We implemented three machine learning models:  

- <font color="darkslategray"><b>Logistic Regression</b></font> → used as a *baseline model*.  
  - Strengths: simple, interpretable, efficient.  
  - Weaknesses: struggled with extreme class imbalance → very low precision (**3.2%**) but very high recall (**93.7%**).  
  - ROC-AUC: **0.95**.  

- <font color="darkslategray"><b>Random Forest</b></font> → a *tree-based ensemble model*.  
  - Captured non-linear relationships and feature interactions.  
  - Achieved **perfect classification** on the sampled dataset (Precision = 1.0, Recall = 1.0, ROC-AUC = 1.0).  
  - Risk: may be slightly overfitting, since real-world performance is rarely perfect.  

- <font color="darkslategray"><b>XGBoost</b></font> → a *gradient boosting model*.  
  - Handled imbalance effectively with weighted training.  
  - Achieved Precision = **85.9%**, Recall = **98.8%**, F1 = **91.9%**, and ROC-AUC = **0.994**.  
  - Provided more realistic and reliable performance compared to Random Forest.  

> **Conclusion**: <font color="darkslategray"><b>XGBoost</b></font> delivered the best trade-off between fraud detection (recall) and minimizing false alarms (precision), making it the most suitable for deployment.  

---

## Q3. How Did You Select Variables?  

We began with 11 columns in the raw dataset.  

- Dropped irrelevant identifiers:  
  - `nameOrig`, `nameDest` (customer IDs, too many unique values, not predictive).  

- Encoded categorical transaction type:  
  - Converted `type` into dummy variables: <font color="darkslategray"><b>CASH_OUT</b></font>, <font color="darkslategray"><b>TRANSFER</b></font>, etc.  

- Engineered new features:  
  - <font color="darkslategray"><b>diff_orig</b></font> = `oldbalanceOrg – amount – newbalanceOrig`  
  - <font color="darkslategray"><b>diff_dest</b></font> = `oldbalanceDest + amount – newbalanceDest`  

These features capture **balance inconsistencies**, which turned out to be the **most important predictors of fraud**.  

Final dataset: **14 features** → transaction type, amount, balances, and engineered differences.  

---

## Q4. Demonstrate Model Performance  

We evaluated models using a **train/test split (70/30)** and key metrics: confusion matrix, precision, recall, F1-score, and ROC-AUC.  

### Model Comparison

| Model                | Precision (Fraud) | Recall (Fraud) | F1 (Fraud) | ROC-AUC |
|----------------------|-------------------|----------------|------------|---------|
| Logistic Regression  | **0.032**         | **0.938**      | 0.063      | 0.95    |
| Random Forest        | **1.000**         | **1.000**      | 1.000      | 1.00    |
| XGBoost              | **0.859**         | **0.988**      | 0.919      | 0.994   |

> - <font color="darkslategray"><b>Logistic Regression</b></font>: good recall but very poor precision.  
> - <font color="darkslategray"><b>Random Forest</b></font>: perfect performance on sample, but likely overfitting.  
> - <font color="darkslategray"><b>XGBoost</b></font>: excellent balance of recall and precision with very high ROC-AUC, making it the **most reliable choice**.  


## Q5. What are the key factors that predict fraudulent customers?  

Based on <font color="darkslategray"><b>feature importance</b></font> from Random Forest and XGBoost models, the most predictive factors are:  

- <font color="darkslategray"><b>diff_orig</b></font> → inconsistency between sender’s old balance, transaction amount, and new balance.  
- <font color="darkslategray"><b>newbalanceOrig</b></font> → if sender’s balance drops unusually (often to zero).  
- <font color="darkslategray"><b>oldbalanceOrg</b></font> → fraudsters often target accounts with higher starting balances.  
- <font color="darkslategray"><b>type_TRANSFER</b></font> and <font color="darkslategray"><b>type_CASH_OUT</b></font> → fraud only occurs in these transaction types.  
- <font color="darkslategray"><b>amount</b></font> → unusually large transfers are more likely to be fraudulent.  

---

## Q6. Do these factors make sense?  

Yes ✅ — these predictors align with real-world fraud behavior:  

- Fraudsters often **empty victim accounts** → hence *diff_orig* and *newbalanceOrig* are strong signals.  
- Fraud requires **movement of money** → *TRANSFER* followed by *CASH-OUT* transactions mirror real fraud scenarios.  
- **High-value accounts** are prime targets → *oldbalanceOrg* is relevant.  
- **Large transactions** are riskier by nature → *amount* matters.  

Thus, the identified features are not only statistically significant but also *logically consistent* with financial fraud patterns.  

---

## Q7. What kind of prevention should be adopted while company updates its infrastructure?  

The company should adopt a multi-layered fraud prevention strategy:  

1. **Real-time Transaction Monitoring**  
   - Deploy the trained <font color="darkslategray"><b>XGBoost fraud model</b></font> in production.  
   - Flag suspicious transfers and cash-outs immediately.  

2. **Transaction Limits & Rules**  
   - Block or flag unusually large transfers (e.g., > $200k).  
   - Limit frequency of transactions within short time windows.  

3. **Customer Authentication Enhancements**  
   - Introduce two-factor authentication (2FA) for high-value transfers.  
   - Alert customers of suspicious activity via SMS/email.  

4. **Infrastructure Safeguards**  
   - Regular anomaly detection audits.  
   - Continuous model retraining on new data to adapt to evolving fraud tactics.  

---

## Q8. Assuming these actions have been implemented, how would you determine if they work?  

Effectiveness can be measured using both *statistical metrics* and *business KPIs*:  

- **Reduction in Fraud Rate** → compare % of fraudulent transactions *before vs after* implementation.  
- **False Positive Analysis** → track how many genuine transactions are unnecessarily blocked/flagged.  
- **Customer Feedback** → monitor customer complaints about account lockouts or blocked transactions.  
- **Model Monitoring**  
  - Monitor recall (fraud detection rate) and precision (false alarm rate).  
  - Recalculate ROC-AUC periodically to ensure performance stability.  

> Success is achieved if the system maintains **high recall** (catches most fraud) while steadily reducing **false positives** and overall fraud losses.



#  Final Conclusion  

In this project, we developed a fraud detection system for financial transactions using machine learning.  

### 🔹 Data Preparation  
- Dataset: **6.3 million transactions**, 11 raw features.  
- Cleaning: removed irrelevant identifiers (`nameOrig`, `nameDest`).  
- Feature Engineering: created <font color="darkslategray"><b>diff_orig</b></font> and <font color="darkslategray"><b>diff_dest</b></font> to capture balance inconsistencies.  
- Encoding: converted categorical <font color="darkslategray"><b>type</b></font> column into dummy variables.  
- Final dataset: **14 features**.  

### 🔹 Modeling  
We tested three machine learning models:  

- **Logistic Regression (baseline)** → High recall (93.7%) but very poor precision (3.2%).  
- **Random Forest** → Perfect classification (Precision = 1.0, Recall = 1.0), but likely overfitting.  
- **XGBoost** → Balanced performance with Precision = 85.9%, Recall = 98.8%, F1 = 91.9%, ROC-AUC = 0.994.  

### 🔹 Key Insights  
- Fraud occurs **only in TRANSFER and CASH-OUT** transactions.  
- The most important fraud predictors were:  
  - <font color="darkslategray"><b>diff_orig</b></font> (balance mismatch for sender)  
  - <font color="darkslategray"><b>newbalanceOrig</b></font> (sender’s balance drops sharply)  
  - <font color="darkslategray"><b>oldbalanceOrg</b></font> (fraudsters target accounts with higher starting balances)  
  - <font color="darkslategray"><b>transaction amount</b></font> and <font color="darkslategray"><b>transaction type</b></font>  

### 🔹 Business Recommendations  
- Deploy <font color="darkslategray"><b>XGBoost</b></font> for **real-time fraud detection**.  
- Implement **transaction limits** and **frequency checks** for high-risk transactions.  
- Require **two-factor authentication (2FA)** for large transfers.  
- Continuously **monitor model performance** and retrain with new fraud patterns.  

### 🔹 Final Verdict  
<font color="darkslategray"><b>XGBoost</b></font> is the most suitable model for deployment. It balances fraud detection power (high recall) with reduced false alarms (high precision), making it both **effective and reliable** for proactive fraud prevention.  

---
✅ **Project Completed**: Data Cleaning → Modeling → Interpretation → Recommendations.  
