In [1]:
import pandas as pd

In [2]:
df=pd.read_csv('credit_card_fraud_sample.csv')

In [3]:
df.head(10)

Unnamed: 0,TransactionID,TransactionDate,Amount,TransactionType,IsFraud
0,1,2025-08-10,2959.63,Withdrawal,0
1,2,2025-08-20,2676.74,Payment,0
2,3,2025-08-23,1114.33,Purchase,0
3,4,2025-08-09,2766.59,Payment,0
4,5,2025-08-11,3686.76,Purchase,0
5,6,2025-08-23,562.22,Purchase,0
6,7,2025-08-29,3641.83,Payment,0
7,8,2025-08-02,1470.69,Payment,0
8,9,2025-08-05,4366.49,Payment,0
9,10,2025-08-17,3089.72,Purchase,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   TransactionID    1000 non-null   int64  
 1   TransactionDate  1000 non-null   object 
 2   Amount           1000 non-null   float64
 3   TransactionType  1000 non-null   object 
 4   IsFraud          1000 non-null   int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 39.2+ KB


In [5]:
df['IsFraud'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
IsFraud,Unnamed: 1_level_1
0,0.944
1,0.056


In [6]:
df['TransactionDate']=pd.to_datetime(df['TransactionDate'])

In [9]:
display(df.head())

Unnamed: 0,TransactionID,TransactionDate,Amount,TransactionType,IsFraud
0,1,2025-08-10,2959.63,Withdrawal,0
1,2,2025-08-20,2676.74,Payment,0
2,3,2025-08-23,1114.33,Purchase,0
3,4,2025-08-09,2766.59,Payment,0
4,5,2025-08-11,3686.76,Purchase,0


In [10]:
df['DayOfWeek'] = df['TransactionDate'].dt.dayofweek
df['Day'] = df['TransactionDate'].dt.day

In [11]:
display(df.head())

Unnamed: 0,TransactionID,TransactionDate,Amount,TransactionType,IsFraud,DayOfWeek,Day
0,1,2025-08-10,2959.63,Withdrawal,0,6,10
1,2,2025-08-20,2676.74,Payment,0,2,20
2,3,2025-08-23,1114.33,Purchase,0,5,23
3,4,2025-08-09,2766.59,Payment,0,5,9
4,5,2025-08-11,3686.76,Purchase,0,0,11


In [12]:
df = pd.get_dummies(df, columns=['TransactionType'], drop_first=True)

In [13]:
display(df.head())

Unnamed: 0,TransactionID,TransactionDate,Amount,IsFraud,DayOfWeek,Day,TransactionType_Purchase,TransactionType_Withdrawal
0,1,2025-08-10,2959.63,0,6,10,False,True
1,2,2025-08-20,2676.74,0,2,20,False,False
2,3,2025-08-23,1114.33,0,5,23,True,False
3,4,2025-08-09,2766.59,0,5,9,False,False
4,5,2025-08-11,3686.76,0,0,11,True,False


In [14]:
df.isnull().sum()

Unnamed: 0,0
TransactionID,0
TransactionDate,0
Amount,0
IsFraud,0
DayOfWeek,0
Day,0
TransactionType_Purchase,0
TransactionType_Withdrawal,0


In [15]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df['Amount'] = scaler.fit_transform(df[['Amount']])

In [16]:
X = df.drop(['TransactionID', 'TransactionDate', 'IsFraud'], axis=1)
y = df['IsFraud']

In [18]:
print("Features shape:", X.shape)
print("Target shape:", y.shape)
print(X.head())

Features shape: (1000, 5)
Target shape: (1000,)
     Amount  DayOfWeek  Day  TransactionType_Purchase  \
0  0.355255          6   10                     False   
1  0.153829          2   20                     False   
2 -0.958649          5   23                      True   
3  0.217805          5    9                     False   
4  0.872991          0   11                      True   

   TransactionType_Withdrawal  
0                        True  
1                       False  
2                       False  
3                       False  
4                       False  


In [17]:
scaler = StandardScaler()
df['Amount'] = scaler.fit_transform(df[['Amount']])

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Split data into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Initialize the Random Forest Classifier
rfc = RandomForestClassifier(random_state=42)

# Train the model
rfc.fit(X_train, y_train)

# Predict on test data
y_pred = rfc.predict(X_test)

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")


Confusion Matrix:
[[187   2]
 [ 11   0]]

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.99      0.97       189
           1       0.00      0.00      0.00        11

    accuracy                           0.94       200
   macro avg       0.47      0.49      0.48       200
weighted avg       0.89      0.94      0.91       200

Accuracy: 0.9350
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000


The output shows that the model has high accuracy (93.5%) but fails to identify any fraud cases (Precision, Recall, and F1 Score for class 1 are all 0). This is a classic sign of class imbalance: most samples are non-fraud, so the model predicts almost everything as non-fraud.

To address the class imbalance and improve fraud detection in the model, i am using the following methods: SMOTE oversampling and class weighting with Random Forest.

In [20]:
from imblearn.over_sampling import SMOTE

In [21]:
# Apply SMOTE only to the training set
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [22]:
y_train.value_counts()
pd.Series(y_resampled).value_counts()


Unnamed: 0_level_0,count
IsFraud,Unnamed: 1_level_1
0,755
1,755


In [23]:
rfc_smote = RandomForestClassifier(random_state=42)
rfc_smote.fit(X_resampled, y_resampled)
y_pred_smote = rfc_smote.predict(X_test)

In [24]:
# Evaluate
print("=== SMOTE Results ===")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_smote))
print("\nClassification Report:\n", classification_report(y_test, y_pred_smote))

=== SMOTE Results ===
Confusion Matrix:
 [[155  34]
 [  9   2]]

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.82      0.88       189
           1       0.06      0.18      0.09        11

    accuracy                           0.79       200
   macro avg       0.50      0.50      0.48       200
weighted avg       0.90      0.79      0.83       200



Note on SMOTE Results

Applying SMOTE oversampling to the training data helps the model start recognizing some fraud cases.
The recall for the fraud class increased from 0 to 0.18, meaning the model now catches some frauds, but the precision is still low due to more false positives.
SMOTE improves minority class detection but requires careful tuning, and some trade-off is seen in non-fraud misclassifications.



In [25]:
# Using class_weight='balanced' in Random Forest:
rfc_weighted = RandomForestClassifier(random_state=42, class_weight='balanced')
rfc_weighted.fit(X_train, y_train)
y_pred_weighted = rfc_weighted.predict(X_test)

print("=== Class Weight Results ===")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_weighted))
print("\nClassification Report:\n", classification_report(y_test, y_pred_weighted))


=== Class Weight Results ===
Confusion Matrix:
 [[187   2]
 [ 11   0]]

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.99      0.97       189
           1       0.00      0.00      0.00        11

    accuracy                           0.94       200
   macro avg       0.47      0.49      0.48       200
weighted avg       0.89      0.94      0.91       200



Note on Class Weight Results

Using class_weight='balanced' in the Random Forest classifier does not improve fraud detection on this highly imbalanced dataset.
The model continues to predict almost all samples as non-fraud, resulting in high accuracy but 0 recall and precision for the fraud class.
This means the model is not effectively learning to identify fraudulent cases when only class weights are used.