## Importing libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import pickle
from joblib import dump,load
import os
import warnings
warnings.filterwarnings('ignore')

### Reading the Dataset

In [2]:
df=pd.read_csv("Dataset/onlinefraud.csv")

In [3]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


### checking for null values

In [4]:
df.isnull().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

## Encoding

In [5]:
type_new = pd.get_dummies(df['type'], drop_first=True)
df_new = pd.concat([df, type_new], axis=1)

In [6]:
output_directory = 'encoder/'
os.makedirs(output_directory, exist_ok=True)
encoder_filename = os.path.join(output_directory, 'df_new.pkl')
dump(df_new,encoder_filename)


['encoder/df_new.pkl']

## Dropping the unnecessary columns

In [7]:
X = df_new.drop(['isFraud', 'type', 'nameOrig', 'nameDest'], axis=1)
y = df_new['isFraud']

In [8]:
X.head()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFlaggedFraud,CASH_OUT,DEBIT,PAYMENT,TRANSFER
0,1,9839.64,170136.0,160296.36,0.0,0.0,0,False,False,True,False
1,1,1864.28,21249.0,19384.72,0.0,0.0,0,False,False,True,False
2,1,181.0,181.0,0.0,0.0,0.0,0,False,False,False,True
3,1,181.0,181.0,0.0,21182.0,0.0,0,True,False,False,False
4,1,11668.14,41554.0,29885.86,0.0,0.0,0,False,False,True,False


## Spliting the data

In [9]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_validation, X_test, y_validation, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [10]:
train_data = pd.concat([X_train, y_train], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

X_train.to_csv("Data/X_train.csv",index=False)
X_test.to_csv("Data/X_test.csv",index=False)
X_validation.to_csv("Data/X_val.csv",index=False)
y_train.to_csv("Data/y_train.csv",index=False)
X_test.to_csv("Data/X_train.csv",index=False)
y_validation.to_csv("Data/y_val.csv",index=False)

## Scaling

In [11]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [12]:
output_directory = 'scaling/'
os.makedirs(output_directory, exist_ok=True)
scaler_filename = os.path.join(output_directory, 'scaler.pkl')
dump(scaler, scaler_filename)

['scaling/scaler.pkl']

### XGBOOST

In [13]:
XGB = XGBClassifier()
XGB.fit(X_train, y_train)
XGB.score(X_train,y_train)

0.9998605695677028

In [14]:
with open('models/xgb_model.pkl', 'wb') as file:
    pickle.dump(XGB, file)

## Logistic Regression

In [15]:
LR = LogisticRegression()
LR.fit(X_train,y_train)
LR.score(X_train,y_train)

0.9985080270167231

In [16]:
LR.score(X_test,y_test)

0.998553006989783

In [17]:
with open('models/logistic_regression_model.pkl', 'wb') as file:
    pickle.dump(LR, file)

## SVC

In [18]:
sample_size = 300000
num_rows, num_cols = X_train.shape
if sample_size > num_rows:
    sample_size = num_rows

sample_indices = np.random.choice(num_rows, sample_size, replace=False)
X_train_sampled = X_train.iloc[sample_indices]  # Select rows by indices
y_train_sampled = y_train.iloc[sample_indices]

svc_model = SVC(kernel='rbf', probability=True)
svc_model.fit(X_train_sampled, y_train_sampled)
svc_model.score(X_train,y_train)

0.99897010081651

In [19]:
with open('models/svc_model.pkl', 'wb') as file:
    pickle.dump(svc_model, file)

## Evaluation

In [1]:
from evaluation import evaluation_pipeline

In [3]:
_, score = evaluation_pipeline('Data/X_val.csv', 'Data/y_val.csv','models/logistic_regression_model.pkl')
score

99.16135177018273

In [4]:
_, score = evaluation_pipeline('Data/X_val.csv', 'Data/y_val.csv','models/xgb_model.pkl')
score

56.68849205725523

In [5]:
_, score = evaluation_pipeline('Data/X_val.csv', 'Data/y_val.csv','models/svc_model.pkl')
score

99.87143660944704