### Importing Data and libs

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/bank-account-fraud-dataset-neurips-2022/Base.csv
/kaggle/input/bank-account-fraud-dataset-neurips-2022/Variant IV.csv
/kaggle/input/bank-account-fraud-dataset-neurips-2022/Variant V.csv
/kaggle/input/bank-account-fraud-dataset-neurips-2022/Variant I.csv
/kaggle/input/bank-account-fraud-dataset-neurips-2022/Variant III.csv
/kaggle/input/bank-account-fraud-dataset-neurips-2022/Variant II.csv


In [2]:
print(os.listdir('/kaggle/input/'))

['bank-account-fraud-dataset-neurips-2022']


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
dataset = pd.read_csv('/kaggle/input/bank-account-fraud-dataset-neurips-2022/Base.csv')

In [5]:
print(dataset.shape)

(1000000, 32)


In [6]:
print(dataset.columns)

Index(['fraud_bool', 'income', 'name_email_similarity',
       'prev_address_months_count', 'current_address_months_count',
       'customer_age', 'days_since_request', 'intended_balcon_amount',
       'payment_type', 'zip_count_4w', 'velocity_6h', 'velocity_24h',
       'velocity_4w', 'bank_branch_count_8w',
       'date_of_birth_distinct_emails_4w', 'employment_status',
       'credit_risk_score', 'email_is_free', 'housing_status',
       'phone_home_valid', 'phone_mobile_valid', 'bank_months_count',
       'has_other_cards', 'proposed_credit_limit', 'foreign_request', 'source',
       'session_length_in_minutes', 'device_os', 'keep_alive_session',
       'device_distinct_emails_8w', 'device_fraud_count', 'month'],
      dtype='object')


In [7]:
print(dataset.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 32 columns):
 #   Column                            Non-Null Count    Dtype  
---  ------                            --------------    -----  
 0   fraud_bool                        1000000 non-null  int64  
 1   income                            1000000 non-null  float64
 2   name_email_similarity             1000000 non-null  float64
 3   prev_address_months_count         1000000 non-null  int64  
 4   current_address_months_count      1000000 non-null  int64  
 5   customer_age                      1000000 non-null  int64  
 6   days_since_request                1000000 non-null  float64
 7   intended_balcon_amount            1000000 non-null  float64
 8   payment_type                      1000000 non-null  object 
 9   zip_count_4w                      1000000 non-null  int64  
 10  velocity_6h                       1000000 non-null  float64
 11  velocity_24h                      1000

In [8]:
dataset.head(5)

Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,payment_type,zip_count_4w,...,has_other_cards,proposed_credit_limit,foreign_request,source,session_length_in_minutes,device_os,keep_alive_session,device_distinct_emails_8w,device_fraud_count,month
0,0,0.3,0.986506,-1,25,40,0.006735,102.453711,AA,1059,...,0,1500.0,0,INTERNET,16.224843,linux,1,1,0,0
1,0,0.8,0.617426,-1,89,20,0.010095,-0.849551,AD,1658,...,0,1500.0,0,INTERNET,3.363854,other,1,1,0,0
2,0,0.8,0.996707,9,14,40,0.012316,-1.490386,AB,1095,...,0,200.0,0,INTERNET,22.730559,windows,0,1,0,0
3,0,0.6,0.4751,11,14,30,0.006991,-1.863101,AB,3483,...,0,200.0,0,INTERNET,15.215816,linux,1,1,0,0
4,0,0.9,0.842307,-1,29,40,5.742626,47.152498,AA,2339,...,0,200.0,0,INTERNET,3.743048,other,0,1,0,0


### Handling Missing Numerical Data  

In [9]:
print(f'null-count = {dataset.isnull().sum().sum()} and na-count = {dataset.isna().sum().sum()} ')

null-count = 0 and na-count = 0 


In [10]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan , strategy='mean')

In [11]:
numerical_cols = dataset.select_dtypes(include = ['float64' , 'int64']).columns
print(numerical_cols)

Index(['fraud_bool', 'income', 'name_email_similarity',
       'prev_address_months_count', 'current_address_months_count',
       'customer_age', 'days_since_request', 'intended_balcon_amount',
       'zip_count_4w', 'velocity_6h', 'velocity_24h', 'velocity_4w',
       'bank_branch_count_8w', 'date_of_birth_distinct_emails_4w',
       'credit_risk_score', 'email_is_free', 'phone_home_valid',
       'phone_mobile_valid', 'bank_months_count', 'has_other_cards',
       'proposed_credit_limit', 'foreign_request', 'session_length_in_minutes',
       'keep_alive_session', 'device_distinct_emails_8w', 'device_fraud_count',
       'month'],
      dtype='object')


In [12]:
imputer.fit(dataset[numerical_cols])
dataset[numerical_cols] = imputer.transform(dataset[numerical_cols])

In [13]:
print(f'null-count = {dataset.isnull().sum().sum()} and na-count = {dataset.isna().sum().sum()} ')

null-count = 0 and na-count = 0 


### Encoding Categorical Data


In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [15]:
cat_vals = dataset.select_dtypes(include='object').columns
cat_indices = [dataset.columns.get_loc(col) for col in cat_vals]

print(cat_indices)

[8, 15, 18, 25, 27]


In [16]:
X = dataset.iloc[: , 1:].values
y = dataset.iloc[: , 0].values

In [17]:
# ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), cat_indices)], remainder='passthrough')
# X = np.array(ct.fit_transform(X))

onehotencoder = OneHotEncoder(categories = 'auto')
X = onehotencoder.fit_transform(X)

### Train and Test Dataset

In [18]:
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size = 0.2 , random_state = 1)

In [19]:
print(X_train)

  (0, 5)	1.0
  (0, 472711)	1.0
  (0, 998870)	1.0
  (0, 999287)	1.0
  (0, 999669)	1.0
  (0, 1151107)	1.0
  (0, 1992872)	1.0
  (0, 2983977)	1.0
  (0, 2984993)	1.0
  (0, 3066100)	1.0
  (0, 4385742)	1.0
  (0, 5175056)	1.0
  (0, 5986234)	1.0
  (0, 5988575)	1.0
  (0, 5988600)	1.0
  (0, 5988793)	1.0
  (0, 5989158)	1.0
  (0, 5989161)	1.0
  (0, 5989167)	1.0
  (0, 5989169)	1.0
  (0, 5989173)	1.0
  (0, 5989203)	1.0
  (0, 5989206)	1.0
  (0, 5989217)	1.0
  (0, 5989219)	1.0
  :	:
  (799999, 2125925)	1.0
  (799999, 2983980)	1.0
  (799999, 2985606)	1.0
  (799999, 3691723)	1.0
  (799999, 4298512)	1.0
  (799999, 5926073)	1.0
  (799999, 5986245)	1.0
  (799999, 5988569)	1.0
  (799999, 5988602)	1.0
  (799999, 5988833)	1.0
  (799999, 5989158)	1.0
  (799999, 5989161)	1.0
  (799999, 5989167)	1.0
  (799999, 5989168)	1.0
  (799999, 5989185)	1.0
  (799999, 5989203)	1.0
  (799999, 5989206)	1.0
  (799999, 5989217)	1.0
  (799999, 5989219)	1.0
  (799999, 6677352)	1.0
  (799999, 6984110)	1.0
  (799999, 6984114)	1.0
 

In [20]:
print(X_test)

  (0, 0)	1.0
  (0, 765805)	1.0
  (0, 998870)	1.0
  (0, 999437)	1.0
  (0, 999668)	1.0
  (0, 1290142)	1.0
  (0, 2328999)	1.0
  (0, 2983978)	1.0
  (0, 2986551)	1.0
  (0, 3581756)	1.0
  (0, 4094268)	1.0
  (0, 5684896)	1.0
  (0, 5986244)	1.0
  (0, 5988564)	1.0
  (0, 5988599)	1.0
  (0, 5988984)	1.0
  (0, 5989157)	1.0
  (0, 5989163)	1.0
  (0, 5989166)	1.0
  (0, 5989169)	1.0
  (0, 5989181)	1.0
  (0, 5989203)	1.0
  (0, 5989213)	1.0
  (0, 5989217)	1.0
  (0, 5989219)	1.0
  :	:
  (199999, 2026183)	1.0
  (199999, 2983978)	1.0
  (199999, 2987528)	1.0
  (199999, 3028851)	1.0
  (199999, 4742344)	1.0
  (199999, 5954576)	1.0
  (199999, 5986244)	1.0
  (199999, 5988568)	1.0
  (199999, 5988600)	1.0
  (199999, 5988891)	1.0
  (199999, 5989158)	1.0
  (199999, 5989161)	1.0
  (199999, 5989167)	1.0
  (199999, 5989168)	1.0
  (199999, 5989196)	1.0
  (199999, 5989203)	1.0
  (199999, 5989209)	1.0
  (199999, 5989217)	1.0
  (199999, 5989219)	1.0
  (199999, 6379104)	1.0
  (199999, 6984111)	1.0
  (199999, 6984114)	1.0
 

In [21]:
print(y_train)

[0. 0. 0. ... 0. 0. 0.]


In [22]:
print(y_test)

[0. 0. 0. ... 0. 0. 0.]


### Scaling

In [23]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler(with_mean = False)

In [24]:
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [25]:
print(X_train)

  (0, 5)	3.1719199908964955
  (0, 472711)	894.4277500174342
  (0, 998870)	2.209876959644321
  (0, 999287)	10.788832420565411
  (0, 999669)	2.1602926533451443
  (0, 1151107)	894.4277500174342
  (0, 1992872)	894.4277500174342
  (0, 2983977)	2.2847394008788293
  (0, 2984993)	38.01373454791739
  (0, 3066100)	894.4277500174342
  (0, 4385742)	894.4277500174342
  (0, 5175056)	894.4277500174342
  (0, 5986234)	2.8061272717784975
  (0, 5988575)	6.328561381030241
  (0, 5988600)	2.8984543703068324
  (0, 5988793)	23.93405060287426
  (0, 5989158)	2.003672458312352
  (0, 5989161)	2.0687555572065204
  (0, 5989167)	2.028043778274169
  (0, 5989169)	3.1867213589194483
  (0, 5989173)	10.86996178049368
  (0, 5989203)	2.402479141921887
  (0, 5989206)	2.0541818158382323
  (0, 5989217)	6.375847258697159
  (0, 5989219)	11.906011481436837
  :	:
  (799999, 2125925)	632.4563226045733
  (799999, 2983980)	3.0901350494581297
  (799999, 2985606)	53.46160498213094
  (799999, 3691723)	894.4277500174342
  (799999, 42985

In [26]:
print(X_test)

  (0, 0)	2.7459227229123466
  (0, 765805)	1.0
  (0, 998870)	2.209876959644321
  (0, 999437)	19.862267746689074
  (0, 999668)	2.3219969381024836
  (0, 1290142)	1.0
  (0, 2328999)	1.0
  (0, 2983978)	2.0708965810662487
  (0, 2986551)	80.00625073251734
  (0, 3581756)	1.0
  (0, 4094268)	1.0
  (0, 5684896)	1.0
  (0, 5986244)	5.7667157665321955
  (0, 5988564)	3.6741514726214204
  (0, 5988599)	2.25313872767876
  (0, 5988984)	19.585573308288392
  (0, 5989157)	2.0036724583051617
  (0, 5989163)	2.666744036331253
  (0, 5989166)	2.028043778271613
  (0, 5989169)	3.1867213589194483
  (0, 5989181)	6.585091743909768
  (0, 5989203)	2.402479141921887
  (0, 5989213)	2.834153090218451
  (0, 5989217)	6.375847258697159
  (0, 5989219)	11.906011481436837
  :	:
  (199999, 2026183)	1.0
  (199999, 2983978)	2.0708965810662487
  (199999, 2987528)	131.87988628039307
  (199999, 3028851)	1.0
  (199999, 4742344)	1.0
  (199999, 5954576)	1.0
  (199999, 5986244)	5.7667157665321955
  (199999, 5988568)	3.992363112625713
  (

In [27]:
from sklearn.linear_model import LogisticRegression 
logistic_regressor  = LogisticRegression(max_iter=400)

In [28]:
logistic_regressor.fit(X_train , y_train)
y_pred = logistic_regressor.predict(X_test)

In [29]:
from sklearn.metrics import accuracy_score

In [30]:
log_accuracy = accuracy_score(y_test, y_pred)

print("Accuracy for Logistic Regression is :", log_accuracy)

Accuracy for Logistic Regression is : 0.98884
