#### import the essentail libraries

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 

#### import the df

In [2]:
df = pd.read_csv("databases/csvs/fraud_dataset.csv") 
column_to_drop = 'transaction_id'
df = df.drop(column_to_drop, axis=1) # losing a column 
df['gender'] = df['gender'].replace({'M': 0, 'F': 1}) # converting male female to 0 and 1s
df['transaction_amount'] = df['transaction_amount'].round().astype(int) # convert float to int
X = df.iloc[:, :-1].values 
y = df.iloc[:, -1].values
df

Unnamed: 0,transaction_amount,location,merchant,age,gender,fraud_label
0,1000,New York,ABC Corp,35,0,0
1,500,Chicago,XYZ Inc,45,1,0
2,2000,Los Angeles,ABC Corp,28,0,1
3,1500,San Francisco,XYZ Inc,30,1,0
4,800,Chicago,ABC Corp,50,1,0
...,...,...,...,...,...,...
81,1500,Los Angeles,XYZ Inc,31,0,0
82,2800,San Francisco,ABC Corp,50,1,1
83,1350,Chicago,XYZ Inc,28,0,0
84,920,New York,ABC Corp,47,1,0


#### data preprocess

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86 entries, 0 to 85
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   transaction_amount  86 non-null     int32 
 1   location            86 non-null     object
 2   merchant            86 non-null     object
 3   age                 86 non-null     int64 
 4   gender              86 non-null     int64 
 5   fraud_label         86 non-null     int64 
dtypes: int32(1), int64(3), object(2)
memory usage: 3.8+ KB


In [4]:
missing_data = df.isna()
missing_count = missing_data.sum()
total_missing_count = missing_data.sum().sum()
print("Missing data in each column:")
print(missing_count)
print("\nTotal missing data in the DataFrame:")
print(total_missing_count)

Missing data in each column:
transaction_amount    0
location              0
merchant              0
age                   0
gender                0
fraud_label           0
dtype: int64

Total missing data in the DataFrame:
0


In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

columns_to_encode = [1, 2]
ct = ColumnTransformer(transformers=[("encoder", OneHotEncoder(), columns_to_encode)],remainder="passthrough")
X = np.array(ct.fit_transform(X))
X

array([[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1000, 35, 0],
       [1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 500, 45, 1],
       [0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 2000, 28, 0],
       [0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1500, 30, 1],
       [1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 800, 50, 1],
       [0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 3000, 42, 0],
       [0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1200, 55, 1],
       [0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 900, 37, 0],
       [1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 2500, 33, 1],
       [0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1800, 48, 0],
       [0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 750, 29, 1],
       [1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2200, 51, 0],
       [0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 900, 40, 1],
       [0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1600, 26, 0],
       [0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 3000, 45, 1],
       [1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1200, 34, 0],
       [0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 800, 47, 1],
       [0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1900, 32, 0],
       [0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1100, 52, 1],
       [1.0, 0.0, 

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 1)

In [8]:
from sklearn.preprocessing import StandardScaler 
sc = StandardScaler() # remember this. the goal of standardization is to have all the values of the features in the same range. -3 & +3
X_train = sc.fit_transform(X_train) 
X_test = sc.transform(X_test) 

#### testing all classifier models at once using LazyPredict

In [20]:
pip install lazypredict

Collecting lazypredict
  Downloading lazypredict-0.2.12-py2.py3-none-any.whl (12 kB)
Collecting lightgbm
  Downloading lightgbm-4.1.0-py3-none-win_amd64.whl (1.3 MB)
     ---------------------------------------- 1.3/1.3 MB 1.3 MB/s eta 0:00:00
Installing collected packages: lightgbm, lazypredict
Successfully installed lazypredict-0.2.12 lightgbm-4.1.0
Note: you may need to restart the kernel to use updated packages.


In [21]:
from lazypredict.Supervised import LazyClassifier

In [22]:
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)
models

100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [00:03<00:00,  8.63it/s]

[LightGBM] [Info] Number of positive: 13, number of negative: 55
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000096 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 50
[LightGBM] [Info] Number of data points in the train set: 68, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.191176 -> initscore=-1.442384
[LightGBM] [Info] Start training from score -1.442384





Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
DecisionTreeClassifier,1.0,1.0,1.0,1.0,0.01
ExtraTreeClassifier,1.0,1.0,1.0,1.0,0.01
AdaBoostClassifier,0.94,0.83,0.83,0.94,0.22
BaggingClassifier,0.94,0.83,0.83,0.94,0.03
XGBClassifier,0.94,0.83,0.83,0.94,0.38
SGDClassifier,0.94,0.83,0.83,0.94,0.01
RidgeClassifierCV,0.94,0.83,0.83,0.94,0.01
RidgeClassifier,0.94,0.83,0.83,0.94,0.14
RandomForestClassifier,0.94,0.83,0.83,0.94,0.15
Perceptron,0.94,0.83,0.83,0.94,0.01


#### double checking if LazyPredict is correct using GaussianNB

In [23]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

In [24]:
y_preds = classifier.predict(X_test)
print(np.concatenate((y_preds.reshape(len(y_preds),1),y_test.reshape(len(y_test),1)),1))

[[1 0]
 [0 0]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [1 0]]


In [26]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
cm =confusion_matrix(y_test, y_preds)
print(cm)

acc=accuracy_score(y_test, y_preds)
print(acc)
print(f" Model Accuracy: {acc * 100:.2f}%")

[[9 6]
 [1 2]]
0.6111111111111112
 Model Accuracy: 61.11%


#### Cross Validation

In [27]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 64.52 %
Standard Deviation: 11.97 %
