$\huge \text{Credit card fraud detection data set:}$

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from timeit import default_timer as timer

In [None]:
def compute_Q(A):
  """Return the uncentred second-moment matrix ``A.T A / n``.

  ``A`` holds one data point per row, so ``n = len(A)`` is the number of
  points and the result has one row and one column per feature.
  """
  num_points = len(A)
  gram = np.matmul(A.T, A)
  return gram / num_points

In [None]:
# Consecutive integer codes 0..3.  No other cell visible in this notebook
# reads them; presumably they label result fields / solver choices
# (TODO confirm before relying on them).
iteration_num, tolerance, matrix_decomposition, power_method = range(4)

In [None]:
def compute_principal_direc_power(A, tol=1e-6, max_iter=1000):
  """Estimate the dominant eigenvector of a square matrix by power iteration.

  Parameters
  ----------
  A : square 2-D array
      Matrix whose principal direction is wanted.
  tol : float
      Iteration stops once two consecutive unit iterates differ by less
      than ``tol`` (the comparison ignores an overall sign flip).
  max_iter : int
      Upper bound on the number of multiplications by ``A``.

  Returns
  -------
  numpy.ndarray
      Unit-norm vector of length ``A.shape[0]``.  The iteration starts from
      the normalised all-ones vector.
  """
  n = A.shape[0]
  x = np.ones(n) / np.sqrt(n)

  for _ in range(max_iter):
    x_new = A @ x
    norm = np.linalg.norm(x_new)
    if norm == 0:
      # A maps the current iterate to zero; dividing would fill the result
      # with NaNs, so keep the last well-defined unit vector instead.
      break
    x_new = x_new / norm

    # A negative dominant eigenvalue flips the sign of the iterate on every
    # step, so compare against both x and -x to still detect convergence.
    delta = min(np.linalg.norm(x_new - x), np.linalg.norm(x_new + x))
    x = x_new

    if delta < tol:
      break

  return x

In [None]:

def compute_cosine(a,b):
  """Return the cosine of the angle between the vectors ``a`` and ``b``.

  Both arguments are coerced with ``np.asarray``, so ndarrays, ndarray
  subclasses and plain sequences are all accepted.  (The previous
  ``assert type(...) is np.ndarray`` check vanished under ``python -O`` and
  rejected everything except exact ndarrays.)

  The cosine is undefined when either vector has zero norm; no special
  handling is done for that case.
  """
  a = np.asarray(a)
  b = np.asarray(b)
  return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

In [None]:
def compute_thres(score,k=2):
  """Return the band ``(mean - k*std, mean + k*std)`` of the given scores.

  ``score`` is any sequence of numbers; ``k`` is the half-width of the band
  in standard deviations.  The result is a ``(lower, upper)`` tuple.
  """
  values = np.array(score)
  centre = values.mean()
  half_width = k * values.std()
  return centre - half_width, centre + half_width

In [None]:
# Seed handed to np.random.seed() inside guass_2d, so the synthetic samples
# that function generates are the same on every run.
seed=17

In [None]:
def guass_2d(mu,std,size):
  """Draw ``size`` two-dimensional points with normally distributed coordinates.

  Each coordinate is an independent ``np.random.normal(mu, std)`` draw, taken
  in the order x0, y0, x1, y1, ...  The global NumPy generator is re-seeded
  with the module-level ``seed`` on every call, so identical arguments give
  an identical frame.

  Returns a DataFrame with ``size`` rows and two columns.
  """
  np.random.seed(seed)
  draws=[np.random.normal(mu,std) for _ in range(2*size)]
  return pd.DataFrame(np.array(draws).reshape(size,2))

In [None]:
def dchange(x):
  """Materialise the iterable ``x`` and return its items as a NumPy array."""
  items = [*x]
  return np.array(items)

$\large \text{PCAOD: Principal Component Analysis Outlier Detection without oversampling, using the leave-one-out (LOO) strategy}$

In [None]:
def PCAOD(data):
  """Score every point by leave-one-out PCA.

  The principal direction of the full data set is compared with the
  principal direction obtained after removing one point; the score of that
  point is ``1 - |cos|`` of the angle between the two directions.

  Parameters
  ----------
  data : DataFrame or 2-D array
      One data point per row.

  Returns
  -------
  (list, float)
      The outlier score of each row, in row order, and the elapsed
      wall-clock time in seconds.
  """
  start=timer()
  points=np.array(data)
  init_dir=compute_principal_direc_power(compute_Q(points))

  outlier_score=[]
  for i in range(len(points)):
    # Remove row i by *position*.  The earlier ``data.drop(i)`` removed by
    # index label, which fails or drops the wrong row whenever the frame
    # does not carry the default 0..n-1 index (e.g. after filtering).
    remaining=np.delete(points,i,axis=0)

    # The second-moment matrix is deliberately rebuilt from scratch for
    # every point: this is the naive baseline whose run time is reported.
    loo_dir=compute_principal_direc_power(compute_Q(remaining))

    outlier_score.append(1-abs(compute_cosine(init_dir,loo_dir)))

  end=timer()

  return outlier_score,end-start






$\large \text{OPCAOD: Over-sampling Principal Component Analysis Outlier Detection (uses the oversampling technique)}$

In [None]:
def OPCAOD(dat,r=0.1):
  """Score every point with over-sampling PCA.

  For each point the covariance that would result from duplicating that
  point with weight ``r`` is formed in closed form, its principal direction
  is found by power iteration, and the score is ``1 - |cos|`` of the angle
  to the reference principal direction.

  Parameters
  ----------
  dat : DataFrame
      One data point per row.
  r : float
      Over-sampling ratio (weight of the duplicated point relative to the
      whole data set).

  Returns
  -------
  (Q_mat_init, mu_init, prin_dir_init, outlier_score_list, elapsed)
      The uncentred second-moment matrix, the column means, the reference
      principal direction, the per-row scores in row order, and the elapsed
      wall-clock time in seconds.
  """
  start=timer()
  points=np.array(dat)
  Q_mat_init=compute_Q(points)
  mu_init=np.array(dat.mean())

  # The reference direction must come from the same kind of matrix as the
  # per-point directions below, i.e. the *centred* covariance Q - mu mu^T.
  # Using the uncentred Q here made the score non-zero even for r -> 0
  # whenever the data mean is not zero.
  prin_dir_init=compute_principal_direc_power(Q_mat_init-np.outer(mu_init,mu_init))

  m=1+r
  outlier_score_list=[]
  for data_point in points:
    # Mean and covariance after over-sampling this point with weight r.
    mu=(mu_init+r*data_point)/m
    cov_mat=(Q_mat_init/m)+((r/m)*np.outer(data_point,data_point))-np.outer(mu,mu)

    prin_dir=compute_principal_direc_power(cov_mat)
    cosine_sim=compute_cosine(prin_dir_init,prin_dir)
    outlier_score_list.append(1-abs(cosine_sim))

  end=timer()

  return Q_mat_init,mu_init,prin_dir_init,outlier_score_list,end-start

In [None]:
def online_anomaly(dat,Q_mat_init,mu_init,prin_dir_init,thres,r=0.1):
  """Label each row of ``dat`` as anomalous (1) or not (0).

  Every row is over-sampled with weight ``r`` against the supplied
  statistics (``Q_mat_init``, ``mu_init``), the principal direction of the
  resulting covariance is compared with ``prin_dir_init``, and the score
  ``1 - |cos|`` is tested against the band ``thres``: a score at or below
  ``thres[0]`` or at or above ``thres[1]`` yields 1, anything else 0.

  Returns the list of labels in row order.
  """
  weight=1+r
  flags=[]
  for row in range(len(dat)):
    sample=np.array(dat.iloc[row])

    # Mean and covariance after over-sampling this row with weight r.
    shifted_mean=(mu_init+r*sample)/weight
    updated_cov=(Q_mat_init/weight)+((r/weight)*np.outer(sample,sample))-np.outer(shifted_mean,shifted_mean)

    direction=compute_principal_direc_power(updated_cov)
    score=1-abs(compute_cosine(prin_dir_init,direction))

    outside_band=score<=thres[0] or score>=thres[1]
    flags.append(1 if outside_band else 0)
  return flags



$\huge \text{Credit Card Fraud Detection Dataset:}$

In [None]:
# Load the transaction table; expects creditcard.csv in the working directory.
# NOTE(review): the output recorded for this cell is a ParserError, so the
# file on disk may have been incomplete — confirm the CSV before trusting
# any results derived from it.
data=pd.read_csv('creditcard.csv')

ParserError: ignored

In [None]:
# (rows, columns) of the loaded table.
data.shape

In [None]:
# Column dtypes and non-null counts of the raw table.
data.info()

$\text{The Time column only records the time of each transaction measured from the first transaction, so we drop this column.}$

$\large \text{We treat the remaining columns as features and the Class column as the label of each data point.}$

In [None]:
# Discard the Time column, then display the resulting table.
data = data.drop(columns=['Time'])
data

$\large \text{The last row has some missing values, so we drop this row as well.}$

In [None]:
# Remove the row labelled len(data)-1 (the final row under the default
# integer index), then display the table.
data = data.drop(index=[len(data) - 1])
data

In [None]:
# Number of rows per value of the Class label.
data['Class'].value_counts()

In [None]:
# Re-check dtypes and non-null counts after the clean-up steps.
data.info()

In [None]:
# Every column except the last is a feature; the last column is the label.
data_X = data.loc[:, data.columns[:-1]]
data_y = data.loc[:, data.columns[-1]]

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(data_X, data_y, test_size=0.2, random_state=34)

In [None]:
# Re-attach the labels to the training features, then keep only the feature
# columns of the training rows whose Class is 0.
train_data = pd.concat([X_train, y_train], axis=1)
data_norm = train_data.loc[train_data['Class'] == 0, train_data.columns[:-1]]



In [None]:
# Run OPCAOD on the Class == 0 training rows with over-sampling ratio 0.1.
# The last value returned is the elapsed wall-clock time of the run.
Q_mat_init,mu_init,prin_dir_init,outlier_score_list,time=OPCAOD(data_norm,r=0.1)

In [None]:
# Lower and upper score thresholds derived from the training scores
# (compute_thres with its default k).
thr = list(compute_thres(outlier_score_list))
thr1, thr2 = thr

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Number of held-out rows that will be scored.
print(len(X_test))

In [None]:
# Label every test row 1 (score outside the threshold band) or 0 (inside),
# reusing the statistics returned by OPCAOD.
pred=online_anomaly(X_test,Q_mat_init,mu_init,prin_dir_init,thr,r=0.1)

In [None]:
# Accuracy: fraction of test rows whose predicted label equals the true one.
y_pred = np.array(pred)
y_actual = np.array(y_test)
acc = np.mean(y_pred == y_actual)
print('accuracy is :', acc)

In [None]:
# scikit-learn convention: rows are the true classes, columns the predicted ones.
c_matrix=confusion_matrix(y_actual,y_pred)

In [None]:
# scikit-learn's confusion_matrix puts the true classes on the rows and the
# predicted classes on the columns, so for the labels {0, 1} the layout is
# [[TN, FP], [FN, TP]] with class 1 as the positive class.
A=c_matrix
tn,fp,fn,tp=A.ravel()
TPR=tp/(tp+fn)  # share of actual positives that were predicted positive
FPR=fp/(fp+tn)  # share of actual negatives that were predicted positive
print('True positive rate (TPR):',TPR,'And False Positive rate :',FPR)

In [None]:
# Display the raw confusion-matrix counts.
A