# Testing outlier detection performance on credit card fraud dataset (Kaggle)
https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud

This is a real world dataset:


The dataset contains transactions made by credit cards in September 2013 by European cardholders.
This dataset presents transactions that occurred in two days, where we have 492 frauds out of 284,807 transactions. The dataset is highly unbalanced, the positive class (frauds) account for 0.172% of all transactions.

It contains only numerical input variables which are the result of a PCA transformation. Unfortunately, due to confidentiality issues, we cannot provide the original features and more background information about the data. Features V1, V2, … V28 are the principal components obtained with PCA, the only features which have not been transformed with PCA are 'Time' and 'Amount'. Feature 'Time' contains the seconds elapsed between each transaction and the first transaction in the dataset. The feature 'Amount' is the transaction Amount, this feature can be used for example-dependant cost-sensitive learning. Feature 'Class' is the response variable and it takes value 1 in case of fraud and 0 otherwise.

In [1]:
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    matthews_corrcoef,
    mean_absolute_error,
    mean_squared_error,
    median_absolute_error,
    r2_score,
    recall_score,
    roc_auc_score,
)

In [2]:
import gc
from typing import Dict, List, Union, Literal

import numpy as np
import pandas as pd
from category_encoders import TargetEncoder
from sklearn.linear_model import LinearRegression
from tqdm import tqdm


class PSOD:
    """
    Get outlier predictions using a pseudo-supervised approach.

    :param n_jobs: Used to determine number of cores used for LinearRegression. Check sklearn documentation for details.
    :param cat_columns: None if no categorical features are present. Otherwise list specifying column names of
                          categorical features.
    :param min_cols_chosen: Float specifying the minimum percentage of columns to be used for each regressor.
    :param max_cols_chosen: Float specifying the maximum percentage of columns to be used for each regressor.
    :param stdevs_to_outlier: Float specifying after how many standard deviations the mean prediction error will be
                              flagged as an outlier.
    :param log_transform: Boolean to set if the numerical data will be log-transformed.
    :param random_seed: Int specifying the start random_seed. Each additional iteration will use a different seed.
    :param flag_outlier_on: String indicating if outliers shall we errors that are on the top end, bottom end or
                            both ends of the mean error distribution. Must be any of ["low end", "both ends", "high end"]
    """
    def __init__(
            self,
            n_jobs=-1,
            cat_columns: Union[List[str], List[int], List[float], None] = None,
            min_cols_chosen: float = 0.5,
            max_cols_chosen: float = 1.0,
            stdevs_to_outlier: float = 1.96,
            log_transform: bool = True,
            random_seed: int = 1,
            flag_outlier_on: Literal["low end", "both ends", "high end"] = "both ends"
    ):
        self.cat_columns = cat_columns
        self.cat_encoders: Dict[Union[str, int, float], TargetEncoder] = {}
        self.regressors: Dict[Union[str, int, float], LinearRegression] = {}
        self.n_jobs = n_jobs
        self.scores: Union[pd.Series, None] = None
        self.outlier_classes = Union[pd.Series, None]
        self.min_cols_chosen = min_cols_chosen
        self.max_cols_chosen = max_cols_chosen
        self.chosen_columns: List[list] = []
        self.stdevs_to_outlier = stdevs_to_outlier
        self.log_transform = log_transform
        self.flag_outlier_on = flag_outlier_on
        self.random_seed = random_seed
        self.random_generator = np.random.default_rng(self.random_seed)

        if self.max_cols_chosen > 1.0:
            raise ValueError("Param max_cols_chosen cannot be higher than 1.")

        if self.min_cols_chosen <= 0:
            raise ValueError("Param min_cols_chosen must be higher than 0.")

        if self.min_cols_chosen > self.max_cols_chosen:
            raise ValueError("Param min_cols_chosen cannot be higher than param max_cols_chosen.")

        if self.flag_outlier_on not in ["low end", "both ends", "high end"]:
            raise ValueError('Param flag_outlier_on must be any of ["low end", "both ends", "high end"].')

    def __str__(self):
        message = f"""
        Most important params specified are:
        - n_jobs: {self.n_jobs}
        - cat_columns: {self.cat_columns}
        - min_cols_chosen: {self.min_cols_chosen}
        - max_cols_chosen: {self.max_cols_chosen}
        - stdevs_to_outlier: {self.stdevs_to_outlier}
        - log_transform: {self.log_transform}
        - random_seed: {self.random_seed}
        - flag_outlier_on: {self.flag_outlier_on}
        """
        return message

    def get_range_cols(self, df):
        len_cols = len(df.columns) - 1  # taking out the "target" column
        self.min_cols_chosen: int = max(int(len_cols * self.min_cols_chosen), 1)
        self.max_cols_chosen: int = min(int(len_cols * self.max_cols_chosen), len_cols)

    def chose_random_columns(self, df) -> list:
        """
        Select random columns.

        Randomize number of columns to chose from as well as the columns chosen.
        :return: list object with chosen column names
        """
        nb_cols: int = self.random_generator.choice(
            np.arange(self.min_cols_chosen, self.max_cols_chosen) + 1, 1, replace=False
        )
        return self.random_generator.choice(df.columns, nb_cols, replace=False).tolist()

    def col_intersection(self, lst1, lst2) -> list:
        chosen_cat_cols = [value for value in lst1 if value in lst2]
        return chosen_cat_cols

    def make_outlier_classes(self, df_scores: pd.DataFrame):
        mean_score = df_scores["anomaly"].mean()
        std_score = df_scores["anomaly"].std()

        if self.flag_outlier_on == "both ends":
            conditions = [
                df_scores["anomaly"] < mean_score - self.stdevs_to_outlier * std_score,
                df_scores["anomaly"] > mean_score + self.stdevs_to_outlier * std_score
            ]
        elif self.flag_outlier_on == "low end":
            conditions = [
                df_scores["anomaly"] < mean_score - self.stdevs_to_outlier * std_score
            ]
        elif self.flag_outlier_on == "high end":
            conditions = [
                df_scores["anomaly"] > mean_score + self.stdevs_to_outlier * std_score
            ]
        else:
            raise ValueError('Param flag_outlier_on must be any of ["low end", "both ends", "high end"].')

        choices = [1 for i in conditions]
        df_scores["anomaly_class"] = np.select(conditions, choices, default=0)
        self.outlier_classes = df_scores["anomaly_class"]
        return df_scores["anomaly_class"]

    def drop_cat_columns(self, df_scores: pd.DataFrame) -> pd.DataFrame:
        if isinstance(self.cat_columns, list):
            df_scores["anomaly"] = df_scores.drop(self.cat_columns, axis=1).mean(axis=1)
        else:
            df_scores["anomaly"] = df_scores.mean(axis=1)

        self.scores = df_scores["anomaly"]
        return df_scores

    def fit_predict(self, df, return_class=False) -> pd.Series:
        df_scores = df.copy()
        self.get_range_cols(df)
        if isinstance(self.cat_columns, list):
            loop_cols = df.drop(self.cat_columns, axis=1).columns
        else:
            loop_cols = df.columns

        if self.log_transform and isinstance(self.cat_columns, list):
            df.drop(self.cat_columns, axis=1).loc[:, :] = np.log1p(
                df.drop(self.cat_columns, axis=1).loc[:, :]
            )

        for enum, col in tqdm(enumerate(loop_cols), total=len(loop_cols)):
            self.chosen_columns.append(self.chose_random_columns(df.drop(col, axis=1)))
            temp_df = df.copy()
            # encode categorical columns that are in chosen columns
            if isinstance(self.cat_columns, list):
                chosen_cat_cols = self.col_intersection(
                    self.cat_columns, self.chosen_columns[enum]
                )

            idx = df_scores.sample(frac=1.0, random_state=enum, replace=True).index

            if isinstance(self.cat_columns, list):
                enc = TargetEncoder(cols=chosen_cat_cols)
                temp_df.loc[:, chosen_cat_cols] = enc.fit_transform(
                    df.loc[:, chosen_cat_cols].iloc[idx].reset_index(drop=True),
                    df.loc[:, col].iloc[idx].reset_index(drop=True),
                )

            reg = LinearRegression(n_jobs=self.n_jobs).fit(
                temp_df.loc[:, self.chosen_columns[enum]].iloc[idx],
                temp_df[col].iloc[idx],
            )
            df_scores[col] = reg.predict(temp_df.loc[:, self.chosen_columns[enum]])
            df_scores[col] = abs(temp_df[col] - df_scores[col])
            self.regressors[col] = reg
            if isinstance(self.cat_columns, list):
                self.cat_encoders[col] = enc
            del temp_df
            _ = gc.collect()

        df_scores = self.drop_cat_columns(df_scores)

        if return_class:
            return self.make_outlier_classes(df_scores)
        else:
            return df_scores["anomaly"]

    def predict(self, df, return_class=False) -> pd.Series:
        df_scores = df.copy()

        if self.log_transform:
            df_scores.drop(self.cat_columns, axis=1).loc[:, :] = np.log1p(
                df_scores.drop(self.cat_columns, axis=1).loc[:, :]
            )

        if isinstance(self.cat_columns, list) and isinstance(self.cat_columns, list):
            loop_cols = df.drop(self.cat_columns, axis=1).columns
        else:
            loop_cols = df.columns

        for enum, col in tqdm(enumerate(loop_cols)):
            temp_df = df
            chosen_cat_cols = self.col_intersection(
                self.cat_columns, self.chosen_columns[enum]
            )
            if isinstance(self.cat_columns, list):
                enc = self.cat_encoders[col]
                temp_df[chosen_cat_cols] = enc.transform(df[chosen_cat_cols])

            reg = self.regressors[col]

            df_scores[col] = reg.predict(df[self.chosen_columns[enum]])
            df_scores[col] = abs(df[col] - df_scores[col])
            self.regressors[col] = reg

        df_scores = self.drop_cat_columns(df_scores)

        if return_class:
            return self.make_outlier_classes(df_scores)
        else:
            return df_scores["anomaly"]

  from pandas import Int64Index as NumericIndex


# Numeric features only

In [3]:
treatment_data = pd.read_csv("creditcard.csv")

In [4]:
treatment_data

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


In [5]:
treatment_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

# Running a test against numeric data only

## Statistical bagging isolation

In [6]:
cols = treatment_data.drop("Class", axis=1).columns.to_list()

In [7]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(treatment_data[cols])
scaled = scaler.transform(treatment_data[cols])
scaled = pd.DataFrame(scaled, columns=cols)

In [8]:
scaled

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,0.000000,0.935192,0.766490,0.881365,0.313023,0.763439,0.267669,0.266815,0.786444,0.475312,...,0.582942,0.561184,0.522992,0.663793,0.391253,0.585122,0.394557,0.418976,0.312697,0.005824
1,0.000000,0.978542,0.770067,0.840298,0.271796,0.766120,0.262192,0.264875,0.786298,0.453981,...,0.579530,0.557840,0.480237,0.666938,0.336440,0.587290,0.446013,0.416345,0.313423,0.000105
2,0.000006,0.935217,0.753118,0.868141,0.268766,0.762329,0.281122,0.270177,0.788042,0.410603,...,0.585855,0.565477,0.546030,0.678939,0.289354,0.559515,0.402727,0.415489,0.311911,0.014739
3,0.000006,0.941878,0.765304,0.868484,0.213661,0.765647,0.275559,0.266803,0.789434,0.414999,...,0.578050,0.559734,0.510277,0.662607,0.223826,0.614245,0.389197,0.417669,0.314371,0.004807
4,0.000012,0.938617,0.776520,0.864251,0.269796,0.762975,0.263984,0.268968,0.782484,0.490950,...,0.584615,0.561327,0.547271,0.663392,0.401270,0.566343,0.507497,0.420561,0.317490,0.002724
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,0.999965,0.756448,0.873531,0.666991,0.160317,0.729603,0.236810,0.235393,0.863749,0.528729,...,0.595979,0.564920,0.515249,0.680500,0.313600,0.658558,0.466291,0.433929,0.329840,0.000030
284803,0.999971,0.945845,0.766677,0.872678,0.219189,0.771561,0.273661,0.265504,0.788548,0.482925,...,0.580900,0.564933,0.553153,0.665619,0.245298,0.543855,0.360884,0.417775,0.312038,0.000965
284804,0.999977,0.990905,0.764080,0.781102,0.227202,0.783425,0.293496,0.263547,0.792985,0.477677,...,0.580280,0.565220,0.537005,0.664877,0.468492,0.592824,0.411177,0.416593,0.312585,0.002642
284805,0.999977,0.954209,0.772856,0.849587,0.282508,0.763172,0.269291,0.261175,0.792671,0.476287,...,0.581622,0.565755,0.547353,0.663008,0.398836,0.545958,0.514746,0.418520,0.315245,0.000389


In [9]:
iso_class = PSOD(flag_outlier_on="high end")

full_res = iso_class.fit_predict(scaled, return_class=True)

100%|███████████████████████████████████████████| 30/30 [00:07<00:00,  4.06it/s]


In [10]:
full_res.value_counts()

0    275943
1      8864
Name: anomaly_class, dtype: int64

In [11]:
iso_class.scores.describe()

count    284807.000000
mean          0.028099
std           0.009950
min           0.009463
25%           0.021738
50%           0.026432
75%           0.032553
max           0.439800
Name: anomaly, dtype: float64

In [12]:
iso_class.scores

0         0.025198
1         0.023218
2         0.038020
3         0.041756
4         0.038711
            ...   
284802    0.063036
284803    0.047897
284804    0.027447
284805    0.050189
284806    0.022823
Name: anomaly, Length: 284807, dtype: float64

In [13]:
full_classification_report = classification_report(treatment_data["Class"], full_res)
print(full_classification_report)

              precision    recall  f1-score   support

           0       1.00      0.97      0.98    284315
           1       0.05      0.81      0.09       492

    accuracy                           0.97    284807
   macro avg       0.52      0.89      0.54    284807
weighted avg       1.00      0.97      0.98    284807



In [14]:
matthews_corrcoef(treatment_data["Class"], full_res)

0.18730605094543587

In [15]:
# 0.187

# Performance of top 200 entries
The results here are not impacted by the param "flag_outlier_on="high end"".

In [16]:
top_df = treatment_data.copy()
top_df["linear_preds"] = iso_class.scores
top_df["linear_preds_class"] = full_res

top_df = top_df.sort_values(by=["linear_preds"], ascending=[False])
top_df = top_df.head(200)
top_df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V23,V24,V25,V26,V27,V28,Amount,Class,linear_preds,linear_preds_class
274771,166198.0,-35.548539,-31.850484,-48.325589,15.304184,-113.743307,73.301626,120.589494,-27.347360,-3.872425,...,-1.581098,4.584549,4.554683,3.415636,31.612198,-15.430084,25691.16,0,0.439800,1
224569,143863.0,-41.928738,-40.803981,-22.588759,11.844703,-8.729597,3.281296,14.203189,-5.245286,4.874533,...,-21.303666,1.321750,-4.084350,-0.601799,-2.787339,-0.816680,200.00,0,0.271540,1
220212,142071.0,-46.855047,-45.607973,-18.320903,13.129143,0.775717,-2.695834,1.087461,-4.241002,6.645063,...,-27.533643,2.023816,-3.856097,-0.478302,-4.139269,13.999688,303.80,0,0.264228,1
206255,136137.0,-40.042537,-38.430842,-21.277176,10.527243,-16.296090,8.799515,19.553200,-6.221785,6.121324,...,-20.034848,1.575385,-4.196468,-0.851794,0.375152,-1.178134,1676.60,0,0.250806,1
234519,148008.0,-40.470142,-37.520432,-17.474421,11.427809,-4.336937,0.456246,2.113722,-6.395106,4.252881,...,-20.757358,0.810500,-3.477981,-0.608199,2.479591,-1.133320,157.43,0,0.235900,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173054,121340.0,-22.132223,-19.815536,-11.183644,4.829787,-13.128465,10.689779,18.257057,-8.685409,5.250410,...,-18.416215,1.376122,-2.965101,0.149369,4.077221,-1.975801,4543.64,0,0.137785,1
199710,133099.0,-29.046615,-32.362448,-3.208013,11.332812,15.848810,-8.426268,-2.832795,-2.610682,3.260821,...,16.722816,0.695778,4.381129,0.654957,7.126343,-2.679944,0.00,0,0.137484,1
236429,148806.0,-33.669917,-47.429676,-7.198018,10.055906,29.016124,-20.054615,-18.381781,1.799471,1.663887,...,-3.806103,-0.072090,-1.885646,-0.448436,0.765828,-1.794908,152.00,0,0.135971,1
284249,172273.0,-9.030538,-11.112584,-16.233798,3.592021,-40.427726,23.917837,44.054461,-7.277778,-4.210637,...,7.040028,0.347693,2.520869,2.342495,3.478175,-2.713136,10199.44,0,0.134592,1


In [17]:
full_classification_report = classification_report(top_df["Class"], top_df["linear_preds_class"])
print(full_classification_report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        87
           1       0.56      1.00      0.72       113

    accuracy                           0.56       200
   macro avg       0.28      0.50      0.36       200
weighted avg       0.32      0.56      0.41       200



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
matthews_corrcoef(top_df["Class"], top_df["linear_preds_class"])

0.0

In [19]:
top_df["Class"].value_counts()

1    113
0     87
Name: Class, dtype: int64

In [20]:
top_df["Class"].value_counts(normalize=True)*100

1    56.5
0    43.5
Name: Class, dtype: float64

In [21]:
# 56.5

## IsolationForest

In [22]:
from sklearn.ensemble import IsolationForest
clf = IsolationForest(random_state=0)
iso_outlier_classes = clf.fit_predict(treatment_data[cols])

In [23]:
pd.Series(iso_outlier_classes).value_counts()

 1    273155
-1     11652
dtype: int64

In [24]:
treatment_data["isolation_forest_class"] = iso_outlier_classes

In [25]:
conditions  = [treatment_data["isolation_forest_class"] == 1, 
               treatment_data["isolation_forest_class"] == -1]
choices     = [ 0, 1 ]
    
treatment_data["isolation_forest_class"] = np.select(conditions, choices, default=np.nan)

In [26]:
full_classification_report = classification_report(treatment_data["Class"], treatment_data["isolation_forest_class"])
print(full_classification_report)

              precision    recall  f1-score   support

           0       1.00      0.96      0.98    284315
           1       0.04      0.83      0.07       492

    accuracy                           0.96    284807
   macro avg       0.52      0.90      0.52    284807
weighted avg       1.00      0.96      0.98    284807



In [27]:
matthews_corrcoef(treatment_data["Class"], treatment_data["isolation_forest_class"])

0.16641202829857646

In [28]:
(full_res == treatment_data["isolation_forest_class"]).sum()

273843

IsolationForest has an edge over the statistical-empirical approach.

# Performance of top 200 entries

In [29]:
iso_outlier_probs = clf.score_samples(treatment_data[cols])
iso_outlier_probs

array([-0.40783781, -0.38567443, -0.47974649, ..., -0.40197462,
       -0.4438817 , -0.3927196 ])

In [30]:
top_df = treatment_data.copy()
top_df["if_preds"] = iso_outlier_probs

top_df["if_preds_class"] = iso_outlier_classes
conditions  = [top_df["if_preds_class"] == 1, 
               top_df["if_preds_class"] == -1]
choices     = [ 0, 1 ]
    
top_df["if_preds_class"] = np.select(conditions, choices, default=np.nan)

top_df = top_df.sort_values(by=["if_preds"], ascending=[True])
top_df = top_df.head(200)
top_df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V24,V25,V26,V27,V28,Amount,Class,isolation_forest_class,if_preds,if_preds_class
274771,166198.0,-35.548539,-31.850484,-48.325589,15.304184,-113.743307,73.301626,120.589494,-27.347360,-3.872425,...,4.584549,4.554683,3.415636,31.612198,-15.430084,25691.16,0,1.0,-0.748017,1.0
231454,146772.0,-35.905105,-31.041362,-19.472908,9.216960,-18.863553,10.713326,16.687265,-9.609234,3.296559,...,0.987774,-2.180899,-0.172480,6.228140,-2.830091,3552.96,0,1.0,-0.734688,1.0
74699,55709.0,-16.950064,-16.417395,-12.523381,6.555638,-27.752964,18.072031,28.504065,-10.152220,2.124673,...,1.550407,-0.502172,0.821714,12.152401,-4.009839,8790.26,0,1.0,-0.732599,1.0
173353,121450.0,-28.262775,-26.551515,-15.930586,6.945207,-19.203497,13.461737,23.718783,-9.419314,5.264773,...,1.771003,-3.684737,-0.106886,4.071877,-2.383081,4861.64,0,1.0,-0.731445,1.0
220090,142019.0,-37.558067,-29.335992,-24.962322,8.413616,-35.182120,21.550496,36.877368,-8.915982,5.046566,...,1.272716,1.965365,0.767817,3.248189,13.013546,5964.95,0,1.0,-0.727235,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14920,26217.0,-17.950631,11.067069,-20.742660,6.075531,-13.389765,-4.532888,-15.188146,12.101062,-4.026880,...,0.102040,1.177477,-0.238730,1.554463,0.547948,1.00,0,1.0,-0.676816,1.0
42590,41164.0,-5.932778,4.571743,-9.427247,6.577056,-6.115218,-3.661798,-10.894079,3.709210,-5.859524,...,0.384430,-0.077884,0.565493,1.792012,0.371007,5.30,1,1.0,-0.676668,1.0
43204,41413.0,-15.140450,7.378042,-16.356367,9.194935,-13.466163,-2.958431,-16.165539,10.075254,-7.901821,...,0.795190,-0.194542,0.145964,-2.458680,-1.189888,106.55,1,1.0,-0.676423,1.0
231166,146640.0,-16.292995,-10.123192,-4.934792,1.360328,-3.172779,2.569251,7.520645,-4.684583,5.300822,...,0.909250,0.169238,-0.375383,-6.360944,2.165846,912.00,0,1.0,-0.676387,1.0


In [31]:
full_classification_report = classification_report(top_df["Class"], top_df["if_preds_class"])
print(full_classification_report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       161
           1       0.20      1.00      0.33        39

    accuracy                           0.20       200
   macro avg       0.10      0.50      0.16       200
weighted avg       0.04      0.20      0.06       200



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [32]:
matthews_corrcoef(top_df["Class"], top_df["if_preds_class"])

0.0

In [33]:
top_df["Class"].value_counts()

0    161
1     39
Name: Class, dtype: int64

In [34]:
top_df["Class"].value_counts(normalize=True)*100

0    80.5
1    19.5
Name: Class, dtype: float64