## Random Forest Exploration
This notebook evaluates whether a non-linear ensemble (Random Forest) provides
meaningful improvements over logistic regression, given hardware and compute constraints.

In [1]:
# Load the creditcard.csv dataset downloaded from Kaggle and preview its first five rows
import pandas as pd
df = pd.read_csv(filepath_or_buffer="../data/creditcard.csv")
df.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Split the frame into the label column ('Class') and the remaining feature columns
y = df["Class"]
X = df.drop(columns="Class")

# Stratified 70/30 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=50
)

# Learn the scaling statistics from the training partition only,
# then apply that same fitted scaler to both partitions
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [3]:
# Address class imbalance by resampling the scaled training data with SMOTE
# (the test partition is not resampled)
from imblearn.over_sampling import SMOTE
smote = SMOTE(k_neighbors=5, random_state=50)
X_train_smote, y_train_smote = smote.fit_resample(
    X_train_scaled,
    y_train,
)

[WinError 2] The system cannot find the file specified
  File "C:\Users\Vaishnavi\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "C:\Users\Vaishnavi\anaconda3\Lib\subprocess.py", line 554, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Vaishnavi\anaconda3\Lib\subprocess.py", line 1039, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
                        gid, gids, uid, umask,
                        ^^^^^^^^^^^^^^^^^^^^^^
                        start_new_session, process_group)
                        ^^^^^^^^^^^^^^^

In [4]:
import pandas as pd

# Wrap the resampled outputs in labelled pandas objects for inspection
y_train_smote_sr = pd.Series(y_train_smote, name="target")
X_train_smote_df = pd.DataFrame(data=X_train_smote, columns=X_train.columns)

# Number of training rows before oversampling
n_original = X_train_scaled.shape[0]
print(n_original)

199364


In [5]:
# Show the column labels of the resampled frame (taken from X_train.columns when it was built)
X_train_smote_df.columns

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'],
      dtype='object')

In [6]:
import numpy as np

# Flag rows by position: the first n_original rows are marked True, the rest False.
# NOTE(review): this assumes fit_resample appends synthetic rows after the
# original ones — confirm against the imbalanced-learn documentation.
is_original = np.arange(X_train_smote_df.shape[0]) < n_original
X_train_smote_df["is_original"] = is_original

# Display only the rows flagged as not original
X_train_smote_df.loc[~X_train_smote_df["is_original"]]

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,is_original
199364,1.413858,0.513670,-0.191584,-0.692890,-0.430233,0.057651,-0.307472,0.091250,-0.034340,0.027063,...,0.179966,0.422524,0.491732,0.791796,-0.599166,-0.395737,0.050808,0.133889,0.050697,False
199365,-0.777657,-2.638496,-0.143050,-2.571326,3.740714,-5.433100,2.891171,1.319879,0.013499,-2.548130,...,0.219142,1.712661,-1.774093,-0.082667,-0.652637,0.890843,3.840468,-3.278482,5.269484,False
199366,1.547622,0.684211,0.810322,-3.310242,1.091240,0.304907,-0.992447,-1.147247,0.231530,-1.053749,...,0.506016,0.048077,-0.204995,-0.090242,0.900268,1.422340,0.945673,0.518235,-0.342176,False
199367,-0.371902,0.030242,0.838018,-0.652070,1.421308,1.258755,-0.493169,0.368652,0.030844,-0.359262,...,-0.443953,-0.956630,-0.371829,-1.521698,0.748432,-0.412022,0.275331,0.321380,-0.341504,False
199368,1.280015,-0.500190,3.091246,-5.385275,5.108693,0.125166,-1.553427,-2.454151,1.463718,-4.064389,...,0.702901,-0.675063,0.032601,-1.892505,0.112759,1.117347,1.293620,0.399456,-0.342403,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
398035,-0.752255,0.304369,0.558933,-0.008388,1.543894,0.334196,-0.201913,0.100440,0.184628,-0.702244,...,0.008703,-0.068580,-0.129247,-0.291959,0.617870,-0.027813,-0.023543,0.133395,-0.344205,False
398036,-0.558355,-1.302583,0.386965,-2.476653,2.749544,-1.787077,-0.932275,-2.537207,0.712280,-1.964642,...,0.974433,0.110299,1.531079,0.225032,-0.821167,-0.671386,1.890358,0.522632,1.045738,False
398037,-0.719288,-1.178003,-2.011858,-2.353662,2.244191,-2.008679,-0.164926,-0.698783,0.273642,-0.725882,...,1.665281,-0.963772,-2.357683,-0.116647,0.440763,1.101008,0.737277,1.597718,5.089672,False
398038,1.116084,0.617068,1.647715,-3.609284,3.839692,0.920516,-1.180513,-0.822104,0.128453,-2.501158,...,0.322107,-0.580361,-0.230338,-0.064846,1.169091,0.406801,1.213158,0.869423,-0.339422,False


In [7]:
# Random Forest trained on the SMOTE-resampled training data,
# then scored on the scaled hold-out test set
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

rf = RandomForestClassifier(
    n_estimators=300,
    max_features="sqrt",
    max_depth=None,
    min_samples_split=10,
    min_samples_leaf=5,
    class_weight=None,
    random_state=50,
    n_jobs=-1,
).fit(X_train_smote, y_train_smote)

# Probability of the positive class (second column of predict_proba)
y_proba_rf = rf.predict_proba(X_test_scaled)[:, 1]
# Hard labels at a 0.5 probability threshold
y_pred_rf = (y_proba_rf >= 0.5).astype(int)

In [8]:
# ROC-AUC of the predicted positive-class probabilities on the test set, to 4 decimals
print("AUC", round(roc_auc_score(y_true=y_test, y_score=y_proba_rf), 4))

AUC 0.9754


In [9]:
# Per-class precision / recall / F1 for the 0.5-threshold predictions
print(classification_report(y_true=y_test, y_pred=y_pred_rf, digits=4))

              precision    recall  f1-score   support

           0     0.9997    0.9997    0.9997     85295
           1     0.8333    0.8446    0.8389       148

    accuracy                         0.9994     85443
   macro avg     0.9165    0.9222    0.9193     85443
weighted avg     0.9994    0.9994    0.9994     85443



In [10]:
# Confusion matrix for the 0.5-threshold predictions on the test set
print(confusion_matrix(y_true=y_test, y_pred=y_pred_rf))

[[85270    25]
 [   23   125]]


In [11]:
import pandas as pd
import numpy as np

# Pair each fitted importance score with its feature name (from X_train.columns)
importances = rf.feature_importances_
feat_imp = pd.Series(data=importances, index=X_train.columns)

# Rank from most to least important and print the full ranking
feat_imp = feat_imp.sort_values(ascending=False)
print(feat_imp)

V14       0.151033
V10       0.123341
V4        0.105139
V12       0.102750
V17       0.098602
V11       0.081278
V16       0.047652
V3        0.046259
V7        0.032339
V2        0.024397
V9        0.018299
V18       0.018282
V21       0.015414
V8        0.014337
V19       0.012998
V5        0.011187
V1        0.011105
Amount    0.010426
V13       0.009036
V6        0.008949
V15       0.007266
V26       0.006679
Time      0.006615
V20       0.006378
V28       0.005951
V27       0.005604
V23       0.005421
V25       0.005197
V24       0.004110
V22       0.003956
dtype: float64


In [None]:
# Hyperparameter tuning
# Because SMOTE is computationally expensive, we use class weights instead
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold 
from sklearn.metrics import make_scorer, f1_score, roc_auc_score
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

In [None]:
# 3-fold stratified cross-validation splitter, shuffled with a fixed seed
cv = StratifiedKFold(
    n_splits=3,
    shuffle=True,
    random_state=50,
)

In [None]:
%%time
param_grid = {
    "n_estimators" : [200,300],
    "max_depth" : [6,14],
    "min_samples_leaf" :[10,50]
}
scorer = make_scorer(roc_auc_score, needs_proba=True)

rf = RandomForestClassifier(random_state =50,n_jobs =1,class_weight ="balanced_subsample")

random_search = RandomizedSearchCV(
    estimator =rf,
    param_distributions =param_grid,
    n_iter =15,
    scoring =scorer,
    cv=cv,
    n_jobs=1,
    verbose=2,
    random_state =50,
    return_train_score =True
)

random_search.fit(X_train_scaled,y_train)

print("Best params:", grid.best_params_)
print("Best CV ROC-AUC (fraud, CV):", grid.best_score_)
best_rf = grid.best_estimator_

## Conclusion
While Random Forests offer modeling flexibility, their computational cost on this dataset
(especially under cross-validation) outweighs the marginal performance gains on local hardware.
For this project, logistic regression remains the preferred model.