In [3]:
!pip install --upgrade pip
!pip install scikit-learn
!pip install pandas
!pip install imbalanced-learn

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.14.0-py3-none-any.whl.metadata (8.8 kB)
Downloading imbalanced_learn-0.14.0-py3-none-any.whl (239 kB)
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.14.0


In [25]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV

In [26]:
# Read the training table from the working directory.
df = pd.read_csv("final_train.csv")

# Column names selected as model inputs (X). Despite its name, `labels`
# holds the input columns; the target column is 'FraudLabel' below.
labels = [
    "Online",
    "Channel",
    "MCC_Group",
    "Amount",
    "Type",
    "Balance",
    "Limit",
    "Type_dst",
    "Balance_dst",
    "Limit_dst",
    "Age",
    "Tenure",
    "CreditScore",
    "AnnualSalary",
    "Device_Count",
    "Action_Count",
    "Login_Count",
    "Logout_Count",
    "Account_View_Count",
    "Payment_Count",
    "Transfer_Count",
    "Day_of_Week",
    "Timestamp",
    "in_home_city",
    "prev_trans_in_city",
    "times_device_used",
    "unique_devices_past",
    "days_since_open",
    "days_since_open_dst",
    "time_since_last_txn_hours",
    "xacts_earlier_today",
    "avg_amount_past",
    "amount_ratio",
    "amount_dev",
    "mcc_group_prev_count",
]

# Input matrix and target vector used by every experiment below.
X = df[labels]
y = df["FraudLabel"]

In [38]:
# Shared cross-validation splitter: 5 stratified folds, shuffled with a
# fixed seed so every experiment is scored on the same partitions.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [39]:
# Metric name -> scorer. Each metric function is wrapped with make_scorer
# using its default arguments; the names become the `test_<name>` columns
# in cross_validate results.
scoring = {
    metric_name: make_scorer(metric_fn)
    for metric_name, metric_fn in (
        ('recall', recall_score),
        ('precision', precision_score),
        ('f1', f1_score),
    )
}

In [21]:
# Baseline experiment: a random forest with class_weight='balanced' and no
# resampling step.
rf_weighted = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)

# Evaluate on the shared folds (`cv`) with every metric in `scoring`.
# NOTE(review): both the forest and cross_validate request n_jobs=-1;
# confirm that nesting does not oversubscribe the machine.
cv_results_weighted = cross_validate(
    estimator=rf_weighted,
    X=X,
    y=y,
    scoring=scoring,
    cv=cv,
    n_jobs=-1,
)

# Report the per-column mean across folds (timings and test metrics).
print("Weighted RF results:")
print(pd.DataFrame(cv_results_weighted).mean(axis=0))

Weighted RF results:
fit_time          36.058946
score_time         0.527663
test_recall        0.218909
test_precision     1.000000
test_f1            0.358082
dtype: float64


In [43]:
# Hyper-parameter grid for the search. Keys carry the "rf__" prefix so they
# address the pipeline step named 'rf'.
param_grid = {
    # Number of trees
    "rf__n_estimators": [100, 300, 500],
    # Maximum depth per tree
    "rf__max_depth": [5, 10, 20],
    # Min samples required in a leaf
    "rf__min_samples_leaf": [1, 2, 4],
    # Number of features to consider per split
    "rf__max_features": ["sqrt", "log2"],
    # Weighting to handle imbalance
    "rf__class_weight": ["balanced"],
}

In [44]:
# Base estimator for the search; the tunable settings come from param_grid.
rf = RandomForestClassifier(random_state=42, n_jobs=-1)

# Single-step pipeline: the step name 'rf' is what the "rf__" prefixes in
# param_grid refer to.
pipeline_rf = Pipeline(steps=[('rf', rf)])

# Exhaustive search over param_grid on the shared folds. All metrics in
# `scoring` are recorded; 'f1' selects the best candidate and is the metric
# used to refit.
# NOTE(review): n_jobs=-1 is set on both the forest and the search; confirm
# that nesting does not oversubscribe the machine.
grid_search = GridSearchCV(
    estimator=pipeline_rf,
    param_grid=param_grid,
    scoring=scoring,
    refit='f1',
    cv=cv,
    n_jobs=-1,
    verbose=2,
)

In [45]:
# Run the full search; results are stored on `grid_search` itself.
grid_search.fit(X, y)

# Report the winning configuration and its score (the search was built with
# refit='f1', so best_score_ refers to that metric).
for report_label, report_value in (
    ("Best Parameters:", grid_search.best_params_),
    ("Best F1 Score:", grid_search.best_score_),
):
    print(report_label, report_value)

Fitting 5 folds for each of 486 candidates, totalling 2430 fits
[CV] END rf__bootstrap=True, rf__class_weight=balanced, rf__max_depth=5, rf__max_features=sqrt, rf__min_samples_leaf=1, rf__min_samples_split=2, rf__n_estimators=100; total time=  22.5s
[CV] END rf__bootstrap=True, rf__class_weight=balanced, rf__max_depth=5, rf__max_features=sqrt, rf__min_samples_leaf=1, rf__min_samples_split=2, rf__n_estimators=100; total time=  22.5s
[CV] END rf__bootstrap=True, rf__class_weight=balanced, rf__max_depth=5, rf__max_features=sqrt, rf__min_samples_leaf=1, rf__min_samples_split=2, rf__n_estimators=300; total time= 1.1min
[CV] END rf__bootstrap=True, rf__class_weight=balanced, rf__max_depth=5, rf__max_features=sqrt, rf__min_samples_leaf=1, rf__min_samples_split=2, rf__n_estimators=300; total time= 1.2min
[CV] END rf__bootstrap=True, rf__class_weight=balanced, rf__max_depth=5, rf__max_features=sqrt, rf__min_samples_leaf=1, rf__min_samples_split=2, rf__n_estimators=300; total time= 1.1min
[CV] E

KeyboardInterrupt: 

In [None]:
# Tabulate every candidate from the search, then show only the mean test
# metrics next to the parameter set that produced them.
results = pd.DataFrame(grid_search.cv_results_)
results.loc[
    :,
    ['mean_test_precision', 'mean_test_recall', 'mean_test_f1', 'params'],
]

In [24]:
# Alternative experiment: oversample with SMOTE instead of class weights.

# Step 1: fill missing values with the per-column median.
imputer = SimpleImputer(strategy='median')

# Step 2: SMOTE resampler with a fixed seed.
smote = SMOTE(random_state=42)

# Step 3: random forest without class_weight set.
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

# Chain the steps with the Pipeline imported from imblearn, which accepts a
# sampler step alongside ordinary estimators.
pipeline_smote = Pipeline(
    steps=[
        ('imputer', imputer),
        ('smote', smote),
        ('rf', rf),
    ]
)

# Splitter and metrics for this experiment. These repeat the definitions
# used for the weighted baseline so the cell can run on its own.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

scoring = {
    metric_name: make_scorer(metric_fn)
    for metric_name, metric_fn in (
        ('recall', recall_score),
        ('precision', precision_score),
        ('f1', f1_score),
    )
}

# Cross-validate the whole pipeline with `cv` and `scoring`.
cv_results_smote = cross_validate(
    estimator=pipeline_smote,
    X=X,
    y=y,
    scoring=scoring,
    cv=cv,
    n_jobs=-1,
)

# Average over folds and print only the three test metrics (timings omitted).
print(
    pd.DataFrame(cv_results_smote)
    .mean()
    .loc[['test_recall', 'test_precision', 'test_f1']]
)

test_recall       0.226713
test_precision    0.990476
test_f1           0.368051
dtype: float64
