# Outlier detection using PSOD and Isolation Forest on IEEE-CIS fraud dataset
https://www.kaggle.com/competitions/ieee-fraud-detection/overview

In [1]:
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    matthews_corrcoef,
    mean_absolute_error,
    mean_squared_error,
    median_absolute_error,
    r2_score,
    recall_score,
    roc_auc_score,
)

In [2]:
import gc
from typing import Dict, List, Union, Literal

import numpy as np
import pandas as pd
from category_encoders import TargetEncoder
from sklearn.linear_model import LinearRegression
from tqdm import tqdm

class PSOD:
    def __init__(
        self,
        n_jobs=-1,
        cat_columns=None,
        min_cols_chosen: float = 0.5,
        max_cols_chosen: float = 1.0,
        stdevs_to_outlier: float = 1.96,
        log_transform: bool = True,
        random_seed: int = 1,
        flag_outlier_on: Literal["low end", "both ends", "high end"] = "both ends"
    ):
        self.cat_columns = cat_columns
        self.cat_encoders: Dict[Union[str, int, float], TargetEncoder] = {}
        self.regressors: Dict[Union[str, int, float], LinearRegression] = {}
        self.n_jobs = n_jobs
        self.scores = None
        self.outlier_classes = None
        self.min_cols_chosen = min_cols_chosen
        self.max_cols_chosen = max_cols_chosen
        self.chosen_columns: List[list] = []
        self.stdevs_to_outlier = stdevs_to_outlier
        self.log_transform = log_transform
        self.flag_outlier_on = flag_outlier_on
        self.random_seed = random_seed
        self.random_generator = np.random.default_rng(self.random_seed)
        
        if self.max_cols_chosen > 1.0:
            raise ValueError("Param max_cols_chosen cannot be higher than 1.")
        
        if self.min_cols_chosen <= 0:
            raise ValueError("Param min_cols_chosen must be higher than 0.")
        
        if self.flag_outlier_on not in ["low end", "both ends", "high end"]:
            raise ValueError('Param flag_outlier_on must be any of ["low end", "both ends", "high end"].')

    def get_range_cols(self, df):
        len_cols = len(df.columns) - 1  # taking out the "target" column
        self.min_cols_chosen: int = max(int(len_cols * self.min_cols_chosen), 1)
        self.max_cols_chosen: int = min(int(len_cols * self.max_cols_chosen), len_cols)

    def chose_random_columns(self, df) -> list:
        """
        Select random columns.

        Randomize number of columns to chose from as well as the columns chosen.
        :return: list object with chosen column names
        """
        nb_cols: int = self.random_generator.choice(
            np.arange(self.min_cols_chosen, self.max_cols_chosen) + 1, 1, replace=False
        )
        return self.random_generator.choice(df.columns, nb_cols, replace=False).tolist()

    def col_intersection(self, lst1, lst2) -> list:
        chosen_cat_cols = [value for value in lst1 if value in lst2]
        return chosen_cat_cols

    def make_outlier_classes(self, df_scores: pd.DataFrame):
        mean_score = df_scores["anomaly"].mean()
        std_score = df_scores["anomaly"].std()
        
        if self.flag_outlier_on == "both ends":
            conditions = [
                df_scores["anomaly"] < mean_score - self.stdevs_to_outlier * std_score,
                df_scores["anomaly"] > mean_score + self.stdevs_to_outlier * std_score
            ]
        elif self.flag_outlier_on == "low end":
            conditions = [
                df_scores["anomaly"] < mean_score - self.stdevs_to_outlier * std_score
            ]
        elif self.flag_outlier_on == "high end":
            conditions = [
                df_scores["anomaly"] > mean_score + self.stdevs_to_outlier * std_score
            ]
        else:
            raise ValueError('Param flag_outlier_on must be any of ["low end", "both ends", "high end"].')
        
        choices = [1 for i in conditions]
        df_scores["anomaly_class"] = np.select(conditions, choices, default=0)
        self.outlier_classes = df_scores["anomaly_class"]
        return df_scores["anomaly_class"]

    def drop_cat_columns(self, df_scores: pd.DataFrame) -> pd.DataFrame:
        if isinstance(self.cat_columns, list):
            df_scores["anomaly"] = df_scores.drop(self.cat_columns, axis=1).mean(axis=1)
        else:
            df_scores["anomaly"] = df_scores.mean(axis=1)

        self.scores = df_scores["anomaly"]
        return df_scores

    def fit_predict(self, df, return_class=False) -> pd.Series:
        df_scores = df.copy()
        self.get_range_cols(df)
        if isinstance(self.cat_columns, list):
            loop_cols = df.drop(self.cat_columns, axis=1).columns
        else:
            loop_cols = df.columns

        if self.log_transform and isinstance(self.cat_columns, list):
            df.drop(self.cat_columns, axis=1).loc[:, :] = np.log1p(
                df.drop(self.cat_columns, axis=1).loc[:, :]
            )

        for enum, col in tqdm(enumerate(loop_cols), total=len(loop_cols)):
            self.chosen_columns.append(self.chose_random_columns(df.drop(col, axis=1)))
            temp_df = df.copy()
            # encode categorical columns that are in chosen columns
            if isinstance(self.cat_columns, list):
                chosen_cat_cols = self.col_intersection(
                    self.cat_columns, self.chosen_columns[enum]
                )

            idx = df_scores.sample(frac=1.0, random_state=enum, replace=True).index

            if isinstance(self.cat_columns, list):
                enc = TargetEncoder(cols=chosen_cat_cols)
                temp_df.loc[:, chosen_cat_cols] = enc.fit_transform(
                    df.loc[:, chosen_cat_cols].iloc[idx].reset_index(drop=True),
                    df.loc[:, col].iloc[idx].reset_index(drop=True),
                )

            reg = LinearRegression(n_jobs=self.n_jobs).fit(
                temp_df.loc[:, self.chosen_columns[enum]].iloc[idx],
                temp_df[col].iloc[idx],
            )
            df_scores[col] = reg.predict(temp_df.loc[:, self.chosen_columns[enum]])
            df_scores[col] = abs(temp_df[col] - df_scores[col])
            self.regressors[col] = reg
            if isinstance(self.cat_columns, list):
                self.cat_encoders[col] = enc
            del temp_df
            _ = gc.collect()

        df_scores = self.drop_cat_columns(df_scores)

        if return_class:
            return self.make_outlier_classes(df_scores)
        else:
            return df_scores["anomaly"]

    def predict(self, df, return_class=False) -> pd.Series:
        df_scores = df.copy()

        if self.log_transform:
            df_scores.drop(self.cat_columns, axis=1).loc[:, :] = np.log1p(
                df_scores.drop(self.cat_columns, axis=1).loc[:, :]
            )

        if isinstance(self.cat_columns, list) and isinstance(self.cat_columns, list):
            loop_cols = df.drop(self.cat_columns, axis=1).columns
        else:
            loop_cols = df.columns

        for enum, col in tqdm(enumerate(loop_cols)):
            temp_df = df
            chosen_cat_cols = self.col_intersection(
                self.cat_columns, self.chosen_columns[enum]
            )
            if isinstance(self.cat_columns, list):
                enc = self.cat_encoders[col]
                temp_df[chosen_cat_cols] = enc.transform(df[chosen_cat_cols])

            reg = self.regressors[col]

            df_scores[col] = reg.predict(df[self.chosen_columns[enum]])
            df_scores[col] = abs(df[col] - df_scores[col])
            self.regressors[col] = reg

        df_scores = self.drop_cat_columns(df_scores)

        if return_class:
            return self.make_outlier_classes(df_scores)
        else:
            return df_scores["anomaly"]

  from pandas import Int64Index as NumericIndex


# Import all data

In [3]:
transaction_data = pd.read_csv("train_transaction.csv")
identity_data = pd.read_csv("train_identity.csv")

In [4]:
transaction_data

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.50,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.00,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.00,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.00,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.00,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,3577535,0,15811047,49.00,W,6550,,150.0,visa,226.0,...,,,,,,,,,,
590536,3577536,0,15811049,39.50,W,10444,225.0,150.0,mastercard,224.0,...,,,,,,,,,,
590537,3577537,0,15811079,30.95,W,12037,595.0,150.0,mastercard,224.0,...,,,,,,,,,,
590538,3577538,0,15811088,117.00,W,7826,481.0,150.0,mastercard,224.0,...,,,,,,,,,,


In [5]:
identity_data

Unnamed: 0,TransactionID,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987004,0.0,70787.0,,,,,,,,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M
1,2987008,-5.0,98945.0,,,0.0,-5.0,,,,...,mobile safari 11.0,32.0,1334x750,match_status:1,T,F,F,T,mobile,iOS Device
2,2987010,-5.0,191631.0,0.0,0.0,0.0,0.0,,,0.0,...,chrome 62.0,,,,F,F,T,T,desktop,Windows
3,2987011,-5.0,221832.0,,,0.0,-6.0,,,,...,chrome 62.0,,,,F,F,T,T,desktop,
4,2987016,0.0,7460.0,0.0,0.0,1.0,0.0,,,0.0,...,chrome 62.0,24.0,1280x800,match_status:2,T,F,T,T,desktop,MacOS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144228,3577521,-15.0,145955.0,0.0,0.0,0.0,0.0,,,0.0,...,chrome 66.0 for android,,,,F,F,T,F,mobile,F3111 Build/33.3.A.1.97
144229,3577526,-5.0,172059.0,,,1.0,-5.0,,,,...,chrome 55.0 for android,32.0,855x480,match_status:2,T,F,T,F,mobile,A574BL Build/NMF26F
144230,3577529,-20.0,632381.0,,,-1.0,-36.0,,,,...,chrome 65.0 for android,,,,F,F,T,F,mobile,Moto E (4) Plus Build/NMA26.42-152
144231,3577531,-5.0,55528.0,0.0,0.0,0.0,-7.0,,,0.0,...,chrome 66.0,24.0,2560x1600,match_status:2,T,F,T,F,desktop,MacOS


In [6]:
treatment_data = transaction_data.merge(identity_data, on=["TransactionID"], how="left")
#treatment_data = treatment_data.sample(1000)
treatment_data

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.50,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.00,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.00,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.00,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.00,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,3577535,0,15811047,49.00,W,6550,,150.0,visa,226.0,...,,,,,,,,,,
590536,3577536,0,15811049,39.50,W,10444,225.0,150.0,mastercard,224.0,...,,,,,,,,,,
590537,3577537,0,15811079,30.95,W,12037,595.0,150.0,mastercard,224.0,...,,,,,,,,,,
590538,3577538,0,15811088,117.00,W,7826,481.0,150.0,mastercard,224.0,...,,,,,,,,,,


In [7]:
treatment_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 590540 entries, 0 to 590539
Columns: 434 entries, TransactionID to DeviceInfo
dtypes: float64(399), int64(4), object(31)
memory usage: 1.9+ GB


In [8]:
target_col = "isFraud"
cat_columns = treatment_data.select_dtypes(include=["object"]).columns.to_list()
num_columns = treatment_data.drop(cat_columns+[target_col], axis=1).columns.to_list()

cols = cat_columns + num_columns

In [9]:
# simple fill missing avlues
treatment_data[num_columns] = treatment_data[num_columns].fillna(0)
treatment_data[cat_columns] = treatment_data[cat_columns].fillna("None")

In [10]:
for col in cat_columns:
    print(f"Cardinality of {col} is {treatment_data[col].nunique()}")

Cardinality of ProductCD is 5
Cardinality of card4 is 5
Cardinality of card6 is 5
Cardinality of P_emaildomain is 60
Cardinality of R_emaildomain is 61
Cardinality of M1 is 3
Cardinality of M2 is 3
Cardinality of M3 is 3
Cardinality of M4 is 4
Cardinality of M5 is 3
Cardinality of M6 is 3
Cardinality of M7 is 3
Cardinality of M8 is 3
Cardinality of M9 is 3
Cardinality of id_12 is 3
Cardinality of id_15 is 4
Cardinality of id_16 is 3
Cardinality of id_23 is 4
Cardinality of id_27 is 3
Cardinality of id_28 is 3
Cardinality of id_29 is 3
Cardinality of id_30 is 76
Cardinality of id_31 is 131
Cardinality of id_33 is 261
Cardinality of id_34 is 5
Cardinality of id_35 is 3
Cardinality of id_36 is 3
Cardinality of id_37 is 3
Cardinality of id_38 is 3
Cardinality of DeviceType is 3
Cardinality of DeviceInfo is 1787


For IF we drop DeviceInfo.

In [11]:
too_cardinal = ["DeviceInfo", "id_30", "id_31", "id_33", "P_emaildomain", "R_emaildomain"]
for col in too_cardinal:
    cat_columns.remove(col)

In [12]:
treatment_data[target_col].value_counts()

0    569877
1     20663
Name: isFraud, dtype: int64

In [13]:
treatment_data[target_col].value_counts(normalize=True)

0    0.96501
1    0.03499
Name: isFraud, dtype: float64

# Running a test against numeric and categorical data

## Statistical bagging isolation

In [14]:
import time

In [15]:
cols = treatment_data.drop("isFraud", axis=1).columns.to_list()

In [16]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(treatment_data[num_columns])
scaled = scaler.transform(treatment_data[num_columns])
scaled = pd.DataFrame(scaled, columns=num_columns)
scaled = scaled.merge(treatment_data[cat_columns], 
                      left_index=True, right_index=True, how="left")

In [17]:
scaled

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,...,id_23,id_27,id_28,id_29,id_34,id_35,id_36,id_37,id_38,DeviceType
0,0.000000,0.000000e+00,0.002137,0.743044,0.000000,0.649351,0.599156,0.583333,0.852941,0.001847,...,,,,,,,,,,
1,0.000002,6.359409e-08,0.000900,0.100885,0.673333,0.649351,0.430380,0.601852,0.852941,0.000000,...,,,,,,,,,,
2,0.000003,4.387992e-06,0.001840,0.210566,0.816667,0.649351,0.700422,0.611111,0.852941,0.027902,...,,,,,,,,,,
3,0.000005,6.295815e-06,0.001558,0.984824,0.945000,0.649351,0.493671,0.881481,0.852941,0.000000,...,,,,,,,,,,
4,0.000007,6.740974e-06,0.001558,0.201023,0.856667,0.649351,0.430380,0.777778,0.852941,0.000000,...,,,New,NotFound,match_status:2,T,F,T,T,mobile
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,0.999993,9.999947e-01,0.001526,0.319039,0.000000,0.649351,0.953586,0.503704,0.852941,0.004667,...,,,,,,,,,,
590536,0.999995,9.999948e-01,0.001229,0.542883,0.375000,0.649351,0.945148,0.377778,0.852941,0.000000,...,,,,,,,,,,
590537,0.999997,9.999967e-01,0.000961,0.634456,0.991667,0.649351,0.945148,0.427778,0.852941,0.000000,...,,,,,,,,,,
590538,0.999998,9.999973e-01,0.003656,0.392389,0.801667,0.649351,0.945148,0.716667,0.852941,0.000292,...,,,,,,,,,,


In [18]:
start = time.time()

iso_class = PSOD(cat_columns=cat_columns)

full_res = iso_class.fit_predict(scaled, return_class=True)

end = time.time()
print(f"Fully needed time is: {end - start}.")

100%|███████████████████████████████████████| 402/402 [1:31:19<00:00, 13.63s/it]


Fully needed time is: 5481.9351625442505.


In [19]:
full_res.value_counts()

0    563821
1     26719
Name: anomaly_class, dtype: int64

In [20]:
iso_class.scores.describe()

count    590540.000000
mean          0.006971
std           0.003071
min           0.001317
25%           0.004878
50%           0.006403
75%           0.008318
max           0.105606
Name: anomaly, dtype: float64

In [21]:
iso_class.scores

0         0.007490
1         0.006116
2         0.007257
3         0.009965
4         0.010164
            ...   
590535    0.006485
590536    0.003683
590537    0.004202
590538    0.006994
590539    0.006740
Name: anomaly, Length: 590540, dtype: float64

In [22]:
full_classification_report = classification_report(full_res, treatment_data[target_col])
print(full_classification_report)

              precision    recall  f1-score   support

           0       0.96      0.97      0.97    563821
           1       0.25      0.19      0.22     26719

    accuracy                           0.94    590540
   macro avg       0.60      0.58      0.59    590540
weighted avg       0.93      0.94      0.93    590540



In [23]:
matthews_corrcoef(full_res, treatment_data[target_col])

0.1848075357259221

# Performance of top 200 entries
The results here are not impacted by the param "flag_outlier_on="high end"".

In [24]:
top_df = treatment_data.copy()
top_df["linear_preds"] = iso_class.scores
top_df["linear_preds_class"] = full_res

top_df = top_df.sort_values(by=["linear_preds"], ascending=[False])
top_df = top_df.head(200)
top_df

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo,linear_preds,linear_preds_class
424925,3411925,0,10763212,800.000,S,11538,111.0,150.0,visa,141.0,...,,,,,,,,,0.105606,1
424838,3411838,0,10761804,800.000,S,11538,111.0,150.0,visa,141.0,...,,,,,,,,,0.104670,1
452627,3439627,0,11560420,250.000,S,10024,321.0,150.0,visa,144.0,...,,,F,F,F,F,,,0.074039,1
30735,3017735,0,767113,125.000,S,13687,478.0,150.0,visa,195.0,...,1600x900,match_status:2,T,F,T,T,desktop,Trident/7.0,0.070923,1
540892,3527892,0,14255812,110.000,S,14705,555.0,150.0,visa,226.0,...,1366x767,match_status:2,T,F,T,T,desktop,Windows,0.059466,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
445660,3432660,0,11329542,8.485,C,11803,512.0,150.0,visa,141.0,...,,,F,F,T,F,mobile,M4 SS4456 Build/LMY47V,0.028402,1
276527,3263527,0,6724037,50.000,R,6741,583.0,150.0,visa,195.0,...,,,,,,,,,0.028383,1
74616,3061616,0,1639510,100.000,R,15466,399.0,150.0,american express,150.0,...,1280x1024,match_status:2,T,F,T,T,desktop,Trident/7.0,0.028318,1
20856,3007856,1,531135,170.842,C,12616,490.0,150.0,visa,0.0,...,,,F,F,T,T,desktop,Windows,0.028310,1


In [25]:
top_df["linear_preds_class"].value_counts()

1    200
Name: linear_preds_class, dtype: int64

In [26]:
full_classification_report = classification_report(top_df["linear_preds_class"], top_df[target_col])
print(full_classification_report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.34      0.51       200

    accuracy                           0.34       200
   macro avg       0.50      0.17      0.26       200
weighted avg       1.00      0.34      0.51       200



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
matthews_corrcoef(top_df["linear_preds_class"], top_df[target_col])

0.0

In [28]:
top_df[target_col].value_counts()

0    131
1     69
Name: isFraud, dtype: int64

In [29]:
top_df[target_col].value_counts(normalize=True)*100

0    65.5
1    34.5
Name: isFraud, dtype: float64

## IsolationForest

In [30]:
pd.get_dummies(treatment_data[cols])

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,...,DeviceInfo_verykoolS5019,DeviceInfo_verykoolS5524,DeviceInfo_verykoolS5525,DeviceInfo_verykoolS5530 Build/LMY47I,DeviceInfo_verykools4009,DeviceInfo_verykools5004,DeviceInfo_verykools5034,DeviceInfo_verykools5035,DeviceInfo_vivo,DeviceInfo_xs-Z47b7VqTMxs
0,2987000,86400,68.50,13926,0.0,150.0,142.0,315.0,87.0,19.0,...,0,0,0,0,0,0,0,0,0,0
1,2987001,86401,29.00,2755,404.0,150.0,102.0,325.0,87.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,2987002,86469,59.00,4663,490.0,150.0,166.0,330.0,87.0,287.0,...,0,0,0,0,0,0,0,0,0,0
3,2987003,86499,50.00,18132,567.0,150.0,117.0,476.0,87.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,2987004,86506,50.00,4497,514.0,150.0,102.0,420.0,87.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,3577535,15811047,49.00,6550,0.0,150.0,226.0,272.0,87.0,48.0,...,0,0,0,0,0,0,0,0,0,0
590536,3577536,15811049,39.50,10444,225.0,150.0,224.0,204.0,87.0,0.0,...,0,0,0,0,0,0,0,0,0,0
590537,3577537,15811079,30.95,12037,595.0,150.0,224.0,231.0,87.0,0.0,...,0,0,0,0,0,0,0,0,0,0
590538,3577538,15811088,117.00,7826,481.0,150.0,224.0,387.0,87.0,3.0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
from sklearn.ensemble import IsolationForest

start = time.time()

clf = IsolationForest(random_state=0)
iso_outlier_classes = clf.fit_predict(pd.get_dummies(treatment_data[cols]))

end = time.time()
print(f"Fully needed time is: {end - start}.")

Fully needed time is: 525.0176777839661.


In [32]:
pd.Series(iso_outlier_classes).value_counts()

 1    572418
-1     18122
dtype: int64

In [33]:
treatment_data["isolation_forest_class"] = iso_outlier_classes

In [34]:
conditions  = [treatment_data["isolation_forest_class"] == 1, 
               treatment_data["isolation_forest_class"] == -1]
choices     = [ 0, 1 ]
    
treatment_data["isolation_forest_class"] = np.select(conditions, choices, default=np.nan)

In [35]:
full_classification_report = classification_report(treatment_data["isolation_forest_class"], treatment_data[target_col])
print(full_classification_report)

              precision    recall  f1-score   support

         0.0       0.98      0.97      0.97    572418
         1.0       0.20      0.23      0.22     18122

    accuracy                           0.95    590540
   macro avg       0.59      0.60      0.60    590540
weighted avg       0.95      0.95      0.95    590540



In [36]:
matthews_corrcoef(treatment_data["isolation_forest_class"], treatment_data[target_col])

0.1921905642135865

In [37]:
(full_res == treatment_data["isolation_forest_class"]).sum()

562501

IsolationForest has an edge over the statistical-empirical approach.

# Performance of top 200 entries

In [38]:
iso_outlier_probs = clf.score_samples(pd.get_dummies(treatment_data[cols]))
iso_outlier_probs

array([-0.3436286 , -0.32973564, -0.32921682, ..., -0.33387726,
       -0.38281361, -0.3893474 ])

In [39]:
top_df = treatment_data.copy()
top_df["if_preds"] = iso_outlier_probs

top_df["if_preds_class"] = iso_outlier_classes
conditions  = [top_df["if_preds_class"] == 1, 
               top_df["if_preds_class"] == -1]
choices     = [ 0, 1 ]
    
top_df["if_preds_class"] = np.select(conditions, choices, default=np.nan)

top_df = top_df.sort_values(by=["if_preds"], ascending=[True])
top_df = top_df.head(200)
top_df

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo,isolation_forest_class,if_preds,if_preds_class
79711,3066711,0,1721957,1000.0,R,8911,533.0,150.0,visa,226.0,...,match_status:2,T,F,T,T,desktop,Windows,1.0,-0.721463,1.0
74386,3061386,0,1637444,1000.0,R,8911,533.0,150.0,visa,226.0,...,match_status:2,T,F,T,T,desktop,Windows,1.0,-0.719390,1.0
78870,3065870,0,1713764,1000.0,R,1724,583.0,150.0,visa,226.0,...,match_status:2,F,F,T,F,desktop,Windows,1.0,-0.717212,1.0
74381,3061381,0,1637377,1000.0,R,8911,533.0,150.0,visa,226.0,...,match_status:2,T,F,T,T,desktop,Windows,1.0,-0.713849,1.0
542242,3529242,0,14311543,106.0,S,15775,481.0,150.0,mastercard,102.0,...,match_status:2,T,F,T,T,desktop,Windows,1.0,-0.713395,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
560493,3547493,0,14848487,110.0,S,15775,481.0,150.0,mastercard,102.0,...,match_status:2,T,F,T,F,desktop,Windows,1.0,-0.705319,1.0
560685,3547685,0,14852521,110.0,S,15775,481.0,150.0,mastercard,102.0,...,match_status:2,T,F,T,F,desktop,Windows,1.0,-0.705319,1.0
560744,3547744,0,14853549,125.0,S,15775,481.0,150.0,mastercard,102.0,...,match_status:2,T,F,T,T,desktop,Windows,1.0,-0.705319,1.0
560745,3547745,0,14853571,125.0,S,15775,481.0,150.0,mastercard,102.0,...,match_status:2,T,F,T,T,desktop,Windows,1.0,-0.705319,1.0


In [40]:
full_classification_report = classification_report(top_df["if_preds_class"], top_df[target_col])
print(full_classification_report)

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       0.0
         1.0       0.00      0.00      0.00     200.0

    accuracy                           0.00     200.0
   macro avg       0.00      0.00      0.00     200.0
weighted avg       0.00      0.00      0.00     200.0



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [41]:
matthews_corrcoef(top_df["if_preds_class"], top_df[target_col])

0.0

In [42]:
top_df[target_col].value_counts()

0    200
Name: isFraud, dtype: int64

In [43]:
top_df[target_col].value_counts(normalize=True)*100

0    100.0
Name: isFraud, dtype: float64