In [1]:
!pip install imbalanced-learn




In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.tree import DecisionTreeClassifier


In [3]:
# Colab-only: mount Google Drive at /content/drive so the CSV stored there can be read.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [4]:
# Load Fraud.csv from the mounted Drive into a DataFrame.
data = pd.read_csv('/content/drive/MyDrive/Fraud.csv')

In [None]:
# Discard the columns that this analysis does not feed to the model.
columns_to_drop = [
    "step",
    "nameOrig",
    "nameDest",
    "oldbalanceDest",
    "newbalanceDest",
    "isFlaggedFraud",
]
data = data.drop(labels=columns_to_drop, axis="columns")

In [None]:
# Preview the frame after the column drop (pandas truncates the display of long frames).
data

Unnamed: 0,type,amount,oldbalanceOrg,newbalanceOrig
0,PAYMENT,9839.64,170136.00,160296.36
1,PAYMENT,1864.28,21249.00,19384.72
2,TRANSFER,181.00,181.00,0.00
3,CASH_OUT,181.00,181.00,0.00
4,PAYMENT,11668.14,41554.00,29885.86
...,...,...,...,...
6362615,CASH_OUT,339682.13,339682.13,0.00
6362616,TRANSFER,6311409.28,6311409.28,0.00
6362617,CASH_OUT,6311409.28,6311409.28,0.00
6362618,TRANSFER,850002.52,850002.52,0.00


In [None]:
# Handle categorical variables
categorical_columns = ['type']
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and the
# old spelling was later removed, so prefer the new name and fall back to the old
# one only on releases that reject it with a TypeError.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
# Dense array with one indicator column per distinct value of each categorical column.
encoded_categories = encoder.fit_transform(data[categorical_columns])



In [None]:
# Create a DataFrame with the encoded categories.
# Reuse data's index so the column-wise concat that follows lines rows up
# one-to-one even when `data` does not carry a default RangeIndex.
encoded_df = pd.DataFrame(
    encoded_categories,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=data.index,
)

In [None]:
# Swap the raw categorical columns for their one-hot encoded counterparts.
data = pd.concat(
    objs=[data.drop(columns=categorical_columns), encoded_df],
    axis="columns",
)

In [None]:
# Pull out the isFraud label vector, keep everything else as the feature matrix.
y = data['isFraud']
X = data.drop(labels='isFraud', axis='columns')
# Display the class distribution.
y.value_counts()

isFraud
0    6354407
1       8213
Name: count, dtype: int64

In [None]:
# Count the fraud rows with a vectorised reduction; the builtin sum() would
# iterate over every element of the Series in Python.
fraud_count = int((y == 1).sum())
print(f'Number of fraud cases: {fraud_count}')

Number of fraud cases: 8213


In [None]:
# Collect the positional row indices of each class; the non-fraud ones are
# undersampled further down to balance the dataset.
fraud_indices = np.flatnonzero(y == 1)
non_fraud_indices = np.flatnonzero(y == 0)

In [None]:
# Select a random sample of non-fraud cases to match the number of fraud cases.
# A dedicated seeded generator keeps the undersampled set, and therefore every
# metric computed from it, identical across kernel restarts.
undersample_rng = np.random.default_rng(42)
undersample_non_fraud_indices = undersample_rng.choice(non_fraud_indices, size=fraud_count, replace=False)

In [None]:
# Fraud positions first, then the sampled non-fraud positions; slice X and y by position.
balanced_indices = np.concatenate((fraud_indices, undersample_non_fraud_indices))
X_balanced = X.iloc[balanced_indices]
y_balanced = y.iloc[balanced_indices]
# Sanity check: how many fraud rows ended up in the balanced set.
y_balanced.loc[y_balanced == 1].count()

8213

In [None]:
# Hold out 20% of the balanced rows for evaluation; the fixed seed makes the
# split positions repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced,
    y_balanced,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Standardise the features: statistics are learned from the training split only
# and then applied unchanged to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Fit scikit-learn's random forest on the scaled training split.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    max_features='sqrt',
    random_state=42,
)
model.fit(X_train_scaled, y_train)

In [None]:
# Predict labels for the held-out, scaled test split.
y_pred = model.predict(X_test_scaled)

In [None]:
# Report the confusion matrix followed by the per-class precision/recall table.
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep="\n",
)


[[1612   25]
 [   9 1640]]
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      1637
           1       0.98      0.99      0.99      1649

    accuracy                           0.99      3286
   macro avg       0.99      0.99      0.99      3286
weighted avg       0.99      0.99      0.99      3286



In [None]:
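# Second experiment: random forest without sklearn's RandomForestClassifier (bagging and voting written by hand on top of DecisionTreeClassifier)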

In [None]:
# Colab-only: make sure Google Drive is mounted for the second experiment
# (Colab reports "already mounted" when an earlier cell has done this).
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Reload the raw CSV so the second experiment starts from the unmodified data.
data = pd.read_csv('/content/drive/MyDrive/Fraud.csv')

In [5]:
# Discard the columns that this analysis does not feed to the model.
columns_to_drop = [
    "step",
    "nameOrig",
    "nameDest",
    "oldbalanceDest",
    "newbalanceDest",
    "isFlaggedFraud",
]
data = data.drop(labels=columns_to_drop, axis="columns")

In [6]:
# Handle categorical variables
categorical_columns = ['type']
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and the
# old spelling was later removed, so prefer the new name and fall back to the old
# one only on releases that reject it with a TypeError.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
# Dense array with one indicator column per distinct value of each categorical column.
encoded_categories = encoder.fit_transform(data[categorical_columns])



In [7]:
# Create a DataFrame with the encoded categories.
# Reuse data's index so the column-wise concat that follows lines rows up
# one-to-one even when `data` does not carry a default RangeIndex.
encoded_df = pd.DataFrame(
    encoded_categories,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=data.index,
)

In [8]:
# Swap the raw categorical columns for their one-hot encoded counterparts.
data = pd.concat(
    objs=[data.drop(columns=categorical_columns), encoded_df],
    axis="columns",
)

In [9]:
# Pull out the isFraud label vector, keep everything else as the feature matrix.
y = data['isFraud']
X = data.drop(labels='isFraud', axis='columns')


In [10]:
# Identify the number of fraud cases with a vectorised reduction; the builtin
# sum() would iterate over every element of the Series in Python.
fraud_count = int((y == 1).sum())
print(f'Number of fraud cases: {fraud_count}')

Number of fraud cases: 8213


In [11]:
# Collect the positional row indices of each class; the non-fraud ones are
# undersampled further down to balance the dataset.
fraud_indices = np.flatnonzero(y == 1)
non_fraud_indices = np.flatnonzero(y == 0)

In [12]:
# Select a random sample of non-fraud cases to match the number of fraud cases.
# A dedicated seeded generator keeps the undersampled set, and therefore every
# metric computed from it, identical across kernel restarts.
undersample_rng = np.random.default_rng(42)
undersample_non_fraud_indices = undersample_rng.choice(non_fraud_indices, size=fraud_count, replace=False)

In [13]:
# Fraud positions first, then the sampled non-fraud positions; slice X and y by position.
balanced_indices = np.concatenate((fraud_indices, undersample_non_fraud_indices))
X_balanced = X.iloc[balanced_indices]
y_balanced = y.iloc[balanced_indices]

In [14]:
# Hold out 20% of the balanced rows for evaluation; the fixed seed makes the
# split positions repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced,
    y_balanced,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Standardise the features: statistics are learned from the training split only
# and then applied unchanged to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [16]:
# Random Forest Implementation
class RandomForestScratch:
    """Bagged ensemble of ``DecisionTreeClassifier`` trees with majority voting.

    Every tree is fitted on a bootstrap resample (rows drawn with replacement)
    of the training data; ``predict`` returns, for each sample, the label
    predicted by the largest number of trees.

    Parameters
    ----------
    n_estimators : int, default=100
        Number of trees to fit.
    max_depth : int or None, default=None
        Forwarded unchanged to every ``DecisionTreeClassifier``.
    max_features : default='sqrt'
        Forwarded unchanged to every ``DecisionTreeClassifier``.
    random_state : int or None, default=None
        Seed for the bootstrap draws and for the per-tree seeds. ``None`` keeps
        the earlier behaviour of drawing from NumPy's global random state and
        leaving the trees unseeded.
    verbose : bool, default=True
        When true, print one line before and one line after each tree is fitted.
    """

    def __init__(self, n_estimators=100, max_depth=None, max_features='sqrt',
                 random_state=None, verbose=True):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.max_features = max_features
        self.random_state = random_state
        self.verbose = verbose
        self.trees = []
        self._rng = self._new_rng()

    def _new_rng(self):
        """Return a seeded Generator, or None to mean 'use NumPy's global state'."""
        if self.random_state is None:
            return None
        return np.random.default_rng(self.random_state)

    def bootstrap_sample(self, X, y):
        """Draw ``len(X)`` rows with replacement and return the matching ``(X, y)`` arrays.

        ``X`` and ``y`` may be arrays, DataFrames/Series or nested lists; they are
        converted with ``np.asarray`` so that positional indexing always works.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not have the same number of rows.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = X.shape[0]
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X and y must have the same number of rows, got {n_samples} and {y.shape[0]}"
            )
        rng = getattr(self, "_rng", None)
        # Without a seed, keep using the module-level functions so that a caller's
        # np.random.seed(...) is still honoured.
        sampler = np.random if rng is None else rng
        indices = sampler.choice(n_samples, size=n_samples, replace=True)
        return X[indices], y[indices]

    def fit(self, X, y):
        """Fit ``n_estimators`` trees, each on its own bootstrap resample of ``(X, y)``."""
        X = np.asarray(X)
        y = np.asarray(y)
        # Restart the generator so repeated fit() calls with a seed give the same forest.
        self._rng = self._new_rng()
        self.trees = []
        for i in range(self.n_estimators):
            if self.verbose:
                print(f"Training tree {i + 1}/{self.n_estimators}")
            # max_features makes each tree's split search random, so seeded
            # forests also need seeded trees.
            tree_seed = None
            if self._rng is not None:
                tree_seed = int(self._rng.integers(np.iinfo(np.int32).max))
            tree = DecisionTreeClassifier(
                max_depth=self.max_depth,
                max_features=self.max_features,
                random_state=tree_seed,
            )
            X_sample, y_sample = self.bootstrap_sample(X, y)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)
            if self.verbose:
                print(f"Tree {i + 1} trained")

    def predict(self, X):
        """Return the majority-vote label for every row of ``X`` as a 1-D array.

        Ties go to the label that appears first in tree order. The result always
        has shape ``(n_samples,)``, including when ``X`` holds a single row.

        Raises
        ------
        ValueError
            If no trees have been fitted yet.
        """
        if not self.trees:
            raise ValueError("RandomForestScratch has no fitted trees; call fit() first.")
        # Shape (n_trees, n_samples): one row of predictions per tree.
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        # Majority vote down each column (one column per sample).
        return np.apply_along_axis(
            lambda votes: Counter(votes).most_common(1)[0][0], axis=0, arr=tree_preds
        )

In [18]:
# Fit the hand-rolled forest on the scaled training split; labels are handed
# over as a plain NumPy array.
model = RandomForestScratch(
    n_estimators=100,
    max_depth=10,
    max_features='sqrt',
)
model.fit(X_train_scaled, y_train.values)

Training tree 1/100
Tree 1 trained
Training tree 2/100
Tree 2 trained
Training tree 3/100
Tree 3 trained
Training tree 4/100
Tree 4 trained
Training tree 5/100
Tree 5 trained
Training tree 6/100
Tree 6 trained
Training tree 7/100
Tree 7 trained
Training tree 8/100
Tree 8 trained
Training tree 9/100
Tree 9 trained
Training tree 10/100
Tree 10 trained
Training tree 11/100
Tree 11 trained
Training tree 12/100
Tree 12 trained
Training tree 13/100
Tree 13 trained
Training tree 14/100
Tree 14 trained
Training tree 15/100
Tree 15 trained
Training tree 16/100
Tree 16 trained
Training tree 17/100
Tree 17 trained
Training tree 18/100
Tree 18 trained
Training tree 19/100
Tree 19 trained
Training tree 20/100
Tree 20 trained
Training tree 21/100
Tree 21 trained
Training tree 22/100
Tree 22 trained
Training tree 23/100
Tree 23 trained
Training tree 24/100
Tree 24 trained
Training tree 25/100
Tree 25 trained
Training tree 26/100
Tree 26 trained
Training tree 27/100
Tree 27 trained
Training tree 28/10

In [19]:
# Predict labels for the held-out, scaled test split with the hand-rolled forest.
y_pred = model.predict(X_test_scaled)

In [20]:
# Report the confusion matrix followed by the per-class precision/recall table.
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep="\n",
)


[[1610   27]
 [   9 1640]]
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      1637
           1       0.98      0.99      0.99      1649

    accuracy                           0.99      3286
   macro avg       0.99      0.99      0.99      3286
weighted avg       0.99      0.99      0.99      3286

