In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


In [27]:
data = pd.read_csv('../data/imbalanced_90_10.csv')
augmented_data = pd.read_csv('../data/augmented_data.csv')

In [28]:
X = data.drop(columns=['id','Class'])
y = data['Class']

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [31]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print("Performance on Original Data:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Performance on Original Data:
Accuracy: 0.9940
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     17059
           1       0.99      0.94      0.97      1706

    accuracy                           0.99     18765
   macro avg       0.99      0.97      0.98     18765
weighted avg       0.99      0.99      0.99     18765



In [22]:
def stratified_sample(X, y, sample_size=0.2, random_state=42):

    # Get a stratified sample of the data
    X_sampled, _, y_sampled, _ = train_test_split(X, y, 
                                                  test_size=1-sample_size, 
                                                  stratify=y, 
                                                  random_state=random_state)
    
    # Further split the sampled data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_sampled, y_sampled, 
                                                        test_size=0.3,  # or another ratio
                                                        stratify=y_sampled, 
                                                        random_state=random_state)
    
    return X_train, X_test, y_train, y_test


In [30]:
X_train, X_test, y_train, y_test = stratified_sample(X,y)

In [18]:
count = 0
for element in y_train:
    if element ==1:
        count +=1
print(count)

995


In [32]:
X_augmented = augmented_data.drop(columns=['Class'])
y_augmented = augmented_data['Class']

In [33]:
X_train_augmented, X_test_augmented, y_train_augmented, y_test_augmented = stratified_sample(X_augmented,y_augmented)

In [34]:
model_augmented = RandomForestClassifier(random_state=42)
model_augmented.fit(X_train_augmented, y_train_augmented)

# Predict and evaluate
y_pred_augmented = model.predict(X_test_augmented)
print("Performance on Original Data:")
print(f"Accuracy: {accuracy_score(y_test_augmented, y_pred_augmented):.4f}")
print("Classification Report:")
print(classification_report(y_test_augmented, y_pred_augmented))

Performance on Original Data:
Accuracy: 0.0935
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.09      0.17     26442

    accuracy                           0.09     26442
   macro avg       0.50      0.05      0.09     26442
weighted avg       1.00      0.09      0.17     26442



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
