In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


In [8]:
def stratified_sample(X, y, sample_size=0.05, random_state=42):

    # Get a stratified sample of the data
    X_sampled, _, y_sampled, _ = train_test_split(X, y, 
                                                  test_size=1-sample_size, 
                                                  stratify=y, 
                                                  random_state=random_state)
    
    # Further split the sampled data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_sampled, y_sampled, 
                                                        test_size=0.3,  # or another ratio
                                                        stratify=y_sampled, 
                                                        random_state=random_state)
    
    return X_train, X_test, y_train, y_test

In [3]:
data = pd.read_csv('../data/imbalanced_90_10.csv')


In [27]:
augmented_data = pd.read_csv('../data/augmented_data.csv')

In [9]:
X = data.drop(columns=['id','Class'])
y = data['Class']

In [10]:
X_train, X_test, y_train, y_test = stratified_sample(X,y)

In [11]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print("Performance on Original Data:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Performance on Original Data:
Accuracy: 0.9921
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      4265
           1       0.99      0.92      0.96       427

    accuracy                           0.99      4692
   macro avg       0.99      0.96      0.98      4692
weighted avg       0.99      0.99      0.99      4692



In [28]:
X_augmented = augmented_data.drop(columns=['Class'])
y_augmented = augmented_data['Class']

In [29]:
X_train_augmented, X_test_augmented, y_train_augmented, y_test_augmented = stratified_sample(X_augmented,y_augmented)

In [30]:
model_augmented = RandomForestClassifier(random_state=42)
model_augmented.fit(X_train_augmented, y_train_augmented)

# Predict and evaluate
y_pred_augmented = model.predict(X_test_augmented)
print("Performance on Original Data:")
print(f"Accuracy: {accuracy_score(y_test_augmented, y_pred_augmented):.4f}")
print("Classification Report:")
print(classification_report(y_test_augmented, y_pred_augmented))

Performance on Original Data:
Accuracy: 0.8454
Classification Report:
              precision    recall  f1-score   support

           0       0.82      1.00      0.90      4265
           1       1.00      0.49      0.65      1828

    accuracy                           0.85      6093
   macro avg       0.91      0.74      0.78      6093
weighted avg       0.87      0.85      0.83      6093



In [14]:
augmented_data.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,1.000292,-0.385827,0.603715,-0.345033,0.127467,0.084401,0.398448,-0.142299,0.678887,0.48096,...,-0.395597,-0.22914,-0.795047,0.16301,0.016033,0.055825,0.305349,-0.256975,-0.043482,3140.253263
1,-0.240276,-0.729487,1.062137,-0.228653,1.703406,-0.140686,0.392668,-0.202936,0.240343,0.648434,...,0.469859,-0.027674,0.21265,-0.068556,-1.584222,-0.051797,-0.686191,-0.291417,-0.188311,20298.583182
2,0.011851,-0.353437,1.25677,-1.439101,0.272571,-0.133467,0.783832,-0.313326,-0.251035,1.214118,...,-0.019974,0.011123,0.813147,-0.306883,1.196577,0.243814,-0.576313,-0.953474,-1.072127,22234.532322
3,-1.965575,2.438907,-1.940155,2.108773,-2.338812,-0.012511,-2.810852,-1.162597,-2.477627,-2.296516,...,1.728229,-2.258812,1.723187,1.800895,1.024274,-2.568047,-1.147263,-2.878728,-1.617369,4012.687903
4,1.051824,-0.350502,0.410508,-0.447473,0.390159,0.462387,0.434,-0.159024,0.52227,0.45944,...,-0.221444,-0.221827,-0.808026,0.002763,-1.555959,0.382071,0.424082,-0.244213,-0.056951,11822.660177
