
Credit Card Fraud Detection using SMOTE, a confusion matrix, a correlation matrix, density plots and the ROC-AUC curve. Models: Logistic Regression, KNN, Isolation Forest.


credit-card

Credit Card Fraud Detection using SMOTE, a confusion matrix, a correlation matrix, density plots and the ROC-AUC curve. The full analysis script is reproduced below.
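The script expects the Kaggle dataset file creditcard.csv in the working directory and relies on numpy, pandas, matplotlib, seaborn, scikit-learn and imbalanced-learn (see the imports below). Assuming the code is saved as, say, credit_card_fraud.py (an illustrative name, not one fixed by this repository), it can be run directly with `python credit_card_fraud.py`.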

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Dec 10 17:02:55 2020

@author: sudevpradhan

CREDIT CARD FRAUD DETECTION
Dataset: https://www.kaggle.com/mlg-ulb/creditcardfraud
"""

import numpy as np               # linear algebra
import pandas as pd              # data processing, CSV file I/O
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import (precision_score, recall_score, f1_score, roc_auc_score,
                             roc_curve, auc, accuracy_score, precision_recall_curve,
                             classification_report, confusion_matrix)
from imblearn.over_sampling import SMOTE
from mpl_toolkits.mplot3d import Axes3D

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, IsolationForest

#from xgboost import XGBClassifier

# Other libraries

from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from imblearn.under_sampling import TomekLinks, NearMiss
from imblearn.metrics import classification_report_imbalanced
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.utils import resample
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.model_selection import (cross_val_score, cross_val_predict,
                                     KFold, StratifiedKFold)
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.model_selection import StratifiedShuffleSplit as sss

import warnings
warnings.filterwarnings("ignore")

# Load the dataset
data = pd.read_csv('creditcard.csv')

df = pd.read_csv('creditcard.csv')
print(df.shape)
df.head()

df.info()

df.describe()

class_names = {0: 'Not Fraud', 1: 'Fraud'}
print(df.Class.value_counts().rename(index=class_names))

fig = plt.figure(figsize = (15, 12))

def imbalance():
    # Print the value counts of frauds and non-frauds in the data
    print(data['Class'].value_counts())

    # Calculate the percentage of fraud and non-fraud transactions
    print('Valid Transactions: ',
          round(data['Class'].value_counts()[0] / len(data) * 100, 2), '% of the dataset')
    print('Fraudulent Transactions: ',
          round(data['Class'].value_counts()[1] / len(data) * 100, 2), '% of the dataset')

    # Visualise the class imbalance
    colors = ['blue', 'red']
    sns.countplot(x='Class', data=data, palette=colors)

def data_visualisation():
    # Distribution of the amount and time in the data set
    fig, ax = plt.subplots(1, 2, figsize=(16, 4))
    fig.suptitle('Distribution', fontsize=16)
    amount_val = df['Amount'].values
    time_val = df['Time'].values
    colors = ["#0101DF", "#DF0101"]

    sns.distplot(amount_val, ax=ax[0], color=colors[0])
    ax[0].set_title('Distribution of Transaction Amount')
    ax[0].set_xlim([min(amount_val), max(amount_val)])

    sns.distplot(time_val, ax=ax[1], color=colors[1])
    ax[1].set_title('Distribution of Transaction Time')
    ax[1].set_xlim([min(time_val), max(time_val)])
    plt.show()

    # Fraud amount and non-fraud amount
    fig, ax = plt.subplots(1, 2, figsize=(16, 4), sharex=True)
    fig.suptitle('Amount/transaction', fontsize=16)
    colors = ["#0101DF", "#DF0101"]

    sns.distplot(df[df['Class'] == 1].Amount, ax=ax[0], color=colors[0])
    plt.xlabel('Amount')
    plt.ylabel('Number of Transactions')
    ax[0].set_title('Distribution of Transaction Amount (Fraud)')
    sns.distplot(df[df['Class'] == 0].Amount, ax=ax[1], color=colors[1])
    ax[1].set_title('Distribution of Transaction Amount (Valid)')
    plt.xlabel('Amount')
    plt.ylabel('Number of Transactions')
    plt.xlim((0, 20000))
    plt.yscale('log')
    plt.show()

    # Scatter plot of the fraudulent and non-fraudulent data against time
    fig, ax = plt.subplots(1, 2, figsize=(16, 4), sharex=True)
    fig.suptitle('Time of transaction vs Amount', fontsize=16)
    colors = ["#0101DF", "#DF0101"]
    ax[0].scatter(df[df['Class'] == 1].Time, df[df['Class'] == 1].Amount)
    ax[0].set_title('Fraud')
    plt.xlabel('Time')
    plt.ylabel('Amount')
    ax[1].scatter(df[df['Class'] == 0].Time, df[df['Class'] == 0].Amount)
    ax[1].set_title('Valid')
    plt.xlabel('Time')
    plt.ylabel('Amount')
    plt.show()

def plotCorrelationMatrix():
    features = df.columns.values
    correlation_matrix = df.corr()
    fig = plt.figure(figsize=(12, 8))
    fig.suptitle('Correlation Plot', fontsize=16)
    sns.heatmap(correlation_matrix, vmax=0.8, square=True)
    plt.show()

    # Pairwise absolute correlations, sorted ascending
    correlations = df[features].corr().abs().unstack().sort_values(kind="quicksort").reset_index()
    correlations = correlations[correlations['level_0'] != correlations['level_1']]
    print("Least correlated feature pairs")
    print(correlations.head(5))
    print("\nMost correlated feature pairs")
    print(correlations.tail(5))

def preprocessing(X, y):
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

    print("Transactions in X_train dataset: ", X_train.shape)
    print("Transaction classes in y_train dataset: ", y_train.shape)
    print("Transactions in X_test dataset: ", X_test.shape)
    print("Transaction classes in y_test dataset: ", y_test.shape)

    # Feature scaling: normalise Amount and Time
    scaler_amount = StandardScaler()
    scaler_time = StandardScaler()

    X_train['normAmount'] = scaler_amount.fit_transform(X_train['Amount'].values.reshape(-1, 1))
    X_test['normAmount'] = scaler_amount.transform(X_test['Amount'].values.reshape(-1, 1))
    X_train['normTime'] = scaler_time.fit_transform(X_train['Time'].values.reshape(-1, 1))
    X_test['normTime'] = scaler_time.transform(X_test['Time'].values.reshape(-1, 1))

    X_train = X_train.drop(['Time', 'Amount'], axis=1)
    X_test = X_test.drop(['Time', 'Amount'], axis=1)

    X_train.head()
    return X_train, X_test, y_train, y_test

def distribution():
    raw_df = data
    cleaned_df = raw_df.copy()

    # The `Time` column is not needed here.
    cleaned_df.pop('Time')

    # The `Amount` column covers a huge range; convert it to log-space.
    eps = 0.001  # 0 => 0.1 cent
    cleaned_df['Log Amount'] = np.log(cleaned_df.pop('Amount') + eps)

    # Use a utility from sklearn to split and shuffle the dataset.
    train_df, test_df = train_test_split(cleaned_df, test_size=0.2)
    train_df, val_df = train_test_split(train_df, test_size=0.2)

    # Form np arrays of labels and features.
    train_labels = np.array(train_df.pop('Class'))
    bool_train_labels = train_labels != 0
    val_labels = np.array(val_df.pop('Class'))
    test_labels = np.array(test_df.pop('Class'))

    train_features = np.array(train_df)
    val_features = np.array(val_df)
    test_features = np.array(test_df)

    scaler = StandardScaler()
    train_features = scaler.fit_transform(train_features)
    val_features = scaler.transform(val_features)
    test_features = scaler.transform(test_features)

    train_features = np.clip(train_features, -5, 5)
    val_features = np.clip(val_features, -5, 5)
    test_features = np.clip(test_features, -5, 5)

    print('Training labels shape:', train_labels.shape)
    print('Validation labels shape:', val_labels.shape)
    print('Test labels shape:', test_labels.shape)

    print('Training features shape:', train_features.shape)
    print('Validation features shape:', val_features.shape)
    print('Test features shape:', test_features.shape)

    pos_df = pd.DataFrame(train_features[bool_train_labels], columns=train_df.columns)
    neg_df = pd.DataFrame(train_features[~bool_train_labels], columns=train_df.columns)

    sns.jointplot(x=pos_df['V5'], y=pos_df['V6'], kind='hex', xlim=(-5, 5), ylim=(-5, 5))
    plt.suptitle("Positive distribution")

    sns.jointplot(x=neg_df['V5'], y=neg_df['V6'], kind='hex', xlim=(-5, 5), ylim=(-5, 5))
    _ = plt.suptitle("Negative distribution")

def using_smote(X_train, X_test, y_train, y_test):
    print("Before over-sampling:\n", y_train['Class'].value_counts())

    sm = SMOTE()
    # fit_resample replaces the older fit_sample API in recent imbalanced-learn releases
    X_train_res, y_train_res = sm.fit_resample(X_train, y_train['Class'])

    print("After over-sampling:\n", y_train_res.value_counts())

    # Build the model
    # liblinear supports both the 'l1' and 'l2' penalties used in the grid
    parameters = {"penalty": ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
    lr = LogisticRegression(solver='liblinear')
    clf = GridSearchCV(lr, parameters, cv=5, verbose=5, n_jobs=3)
    k = clf.fit(X_train_res, y_train_res)

    print(k.best_params_)

    # Evaluate the model
    lr_gridcv_best = clf.best_estimator_

    y_test_pre = lr_gridcv_best.predict(X_test)
    cnf_matrix_test = confusion_matrix(y_test, y_test_pre)
    # Recall = TP / (TP + FN)
    print("Recall metric in the test dataset:",
          cnf_matrix_test[1, 1] / (cnf_matrix_test[1, 0] + cnf_matrix_test[1, 1]))

    y_train_pre = lr_gridcv_best.predict(X_train_res)
    cnf_matrix_train = confusion_matrix(y_train_res, y_train_pre)
    print("Recall metric in the train dataset:",
          cnf_matrix_train[1, 1] / (cnf_matrix_train[1, 0] + cnf_matrix_train[1, 1]))

    print(classification_report(y_test, y_test_pre))

    return k, X_test, y_test, X_train_res, y_train_res
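Note that imbalanced_make_pipeline is imported above but never used. A minimal sketch, not part of the original script, of how it could wrap SMOTE and the classifier so that oversampling is applied only to the training folds inside GridSearchCV:

# Sketch only: oversample inside each CV fold instead of before the grid search.
pipeline = imbalanced_make_pipeline(SMOTE(), LogisticRegression(solver='liblinear'))
param_grid = {'logisticregression__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
search = GridSearchCV(pipeline, param_grid, cv=5, scoring='recall')
# search.fit(X_train, y_train['Class'])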

def confusion(k, X_test, y_test, X_train_res, y_train_res):
    # Visualise the confusion matrix
    plt.style.use('seaborn')
    class_names = ['Not Fraud', 'Fraud']

    plot_confusion_matrix(k, X_test, y_test, values_format='.5g', display_labels=class_names)
    plt.title("Test data Confusion Matrix")
    plt.show()

    plot_confusion_matrix(k, X_train_res, y_train_res, values_format='.5g', display_labels=class_names)
    plt.title("Oversampled Train data Confusion Matrix")
    plt.show()
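plot_confusion_matrix was removed in scikit-learn 1.2, so the function above only works with older releases. A rough equivalent on current versions, using the same fitted estimator k, would be:

# Sketch for newer scikit-learn releases (>= 1.0), where ConfusionMatrixDisplay is available
from sklearn.metrics import ConfusionMatrixDisplay

ConfusionMatrixDisplay.from_estimator(k, X_test, y_test, values_format='.5g',
                                      display_labels=['Not Fraud', 'Fraud'])
plt.title("Test data Confusion Matrix")
plt.show()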

def ROC(k, X_test, y_test):
    y_k = k.decision_function(X_test)

    fpr, tpr, thresholds = roc_curve(y_test, y_k)
    roc_auc = auc(fpr, tpr)
    print("ROC-AUC:", roc_auc)

    # Visualise the ROC curve
    plt.style.use('seaborn')
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label='AUC = %0.3f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([-0.1, 1.0])
    plt.ylim([-0.1, 1.01])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
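precision_recall_curve is imported above but never called; on a dataset this imbalanced a precision-recall curve is often more informative than ROC. A minimal sketch, again assuming the fitted GridSearchCV object k:

# Sketch only: precision-recall curve for the test set
precision, recall, _ = precision_recall_curve(y_test, k.decision_function(X_test))
plt.plot(recall, precision, 'b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall curve')
plt.show()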

def plotScatterMatrix(df, plotSize, textSize):
    df = df.select_dtypes(include=[np.number])  # keep only numerical columns
    # Remove columns that would lead to df being singular
    df = df.dropna(axis='columns')
    df = df[[col for col in df if df[col].nunique() > 1]]  # keep columns with more than 1 unique value
    columnNames = list(df)
    if len(columnNames) > 10:  # reduce the number of columns for matrix inversion of kernel density plots
        columnNames = columnNames[:10]
    df = df[columnNames]
    ax = pd.plotting.scatter_matrix(df, alpha=0.75, figsize=[plotSize, plotSize], diagonal='kde')
    corrs = df.corr().values
    for i, j in zip(*np.triu_indices_from(ax, k=1)):
        ax[i, j].annotate('Corr. coef = %.3f' % corrs[i, j], (0.8, 0.2),
                          xycoords='axes fraction', ha='center', va='center', size=textSize)
    plt.suptitle('Scatter and Density Plot')
    plt.show()

def regression():
    log_reg_params = {"penalty": ['l1', 'l2'], 'C': [0.01, 0.1, 1]}
    # liblinear supports both penalties in the grid
    grid_log_reg = GridSearchCV(LogisticRegression(solver='liblinear'), log_reg_params)
    grid_log_reg.fit(X_train, y_train)
    log_reg = grid_log_reg.best_estimator_
    log_reg_score = cross_val_score(log_reg, X_train, y_train, cv=5)
    print('Logistic Regression Cross Validation Score: ',
          str(round(log_reg_score.mean() * 100, 2)) + '%')

    from sklearn.model_selection import ShuffleSplit
    from sklearn.model_selection import learning_curve

    '''def plot_learning_curve(estimator1, X, y, ylim=None, cv=None,
                            n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
        f, ((ax1)) = plt.subplots(1, 1, figsize=(16, 8), sharey=True)
        if ylim is not None:
            plt.ylim(*ylim)
        train_sizes, train_scores, test_scores = learning_curve(
            estimator1, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)
        ax1.fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1,
                         color="#ff9124")
        ax1.fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1, color="#2492ff")
        ax1.plot(train_sizes, train_scores_mean, 'o-', color="#ff9124",
                 label="Training score")
        ax1.plot(train_sizes, test_scores_mean, 'o-', color="#2492ff",
                 label="Cross-validation score")
        ax1.set_title("Logistic Regression Learning Curve", fontsize=14)
        ax1.set_xlabel('Training size (m)')
        ax1.set_ylabel('Score')
        ax1.grid(True)
        ax1.legend(loc="best")
        return plt'''

    cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=21)
    # plot_learning_curve(log_reg, X_train, y_train, (0.87, 1.01), cv=cv, n_jobs=4)
    log_reg_pred = cross_val_predict(log_reg, X_train, y_train, cv=5,
                                     method="decision_function")

    print('Logistic Regression: ', roc_auc_score(y_train, log_reg_pred))

def Isolation_forest_algorithm():
    classifiers = {
        "Isolation Forest": IsolationForest(n_estimators=100, max_samples=len(X), verbose=0)}

    for i, (clf_name, clf) in enumerate(classifiers.items()):
        clf.fit(X_train)
        scores_prediction = clf.decision_function(X_train)
        y_pred = clf.predict(X_train)
        # IsolationForest labels inliers as 1 and outliers as -1;
        # remap to the dataset's convention (0 = valid, 1 = fraud)
        y_pred[y_pred == 1] = 0
        y_pred[y_pred == -1] = 1
        n_errors = (y_pred != y_train).sum()

        print("{}: {}".format(clf_name, n_errors))
        print("Accuracy Score :")
        print(accuracy_score(y_train, y_pred))
        print("Classification Report :")
        print(classification_report(y_train, y_pred))
        print('\n')
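The Isolation Forest above uses the default contamination setting. A hedged variant, not in the original script, that passes the observed fraud rate as the expected outlier fraction:

# Sketch only: set contamination to the fraction of fraud cases seen in the data
fraud_fraction = data['Class'].value_counts()[1] / len(data)
iso = IsolationForest(n_estimators=100, max_samples=len(X),
                      contamination=fraud_fraction, verbose=0)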

def knn():
    knn = KNeighborsClassifier(n_neighbors=5, n_jobs=16)
    knn.fit(X_train, y_train)
    print("")
    print("knn classifier created")

    score = knn.score(X_test, y_test)
    print("knn model score-")
    print(score)

    pred = knn.predict(X_test)
    print(classification_report(y_test, pred))
    matrix = confusion_matrix(y_test, pred)
    print(matrix)

    plt.figure(figsize=(10, 7))
    sns.heatmap(matrix, annot=True)

    prob = knn.predict_proba(X_test)
    fpr, tpr, thresholds = roc_curve(y_test, prob[:, 1])
    plt.figure(figsize=(10, 6))
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.plot(fpr, tpr, marker='.')
    plt.annotate('Minimum ROC Score of 50% \n (This is the minimum score to get)',
                 xy=(0.5, 0.5), xytext=(0.6, 0.3),
                 arrowprops=dict(facecolor='#6E726D', shrink=0.05))
    plt.title("ROC curve")
    plt.xlabel('false positive rate')
    plt.ylabel('true positive rate')
    plt.show()

#MAIN FUNCTION

# Exploring the Class column
X = data.loc[:, data.columns != 'Class']
y = data.loc[:, data.columns == 'Class']

nRowsRead = 1000
df1 = pd.read_csv('creditcard.csv', delimiter=',', nrows=nRowsRead)
df1.dataframeName = 'creditcard.csv'
nRow, nCol = df1.shape
print(f'There are {nRow} rows and {nCol} columns')

#FUNCTION CALL

imbalance()
X_train, X_test, y_train, y_test = preprocessing(X, y)
k, X_test, y_test, X_train_res, y_train_res = using_smote(X_train, X_test, y_train, y_test)
data_visualisation()
distribution()
plotCorrelationMatrix()
plotScatterMatrix(df1, 20, 10)

confusion(k, X_test, y_test, X_train_res, y_train_res)
ROC(k, X_test, y_test)
knn()

# Re-split and convert to plain numpy arrays for the Isolation Forest
X_train, X_test, y_train, y_test = preprocessing(X, y)
X_train = X_train.replace([np.inf, -np.inf], np.nan).fillna(0)
X_train = X_train.values
X_test = X_test.replace([np.inf, -np.inf], np.nan).fillna(0)
X_test = X_test.values
y_train = y_train.replace([np.inf, -np.inf], np.nan).fillna(0)
y_train = y_train.values.ravel()
y_test = y_test.replace([np.inf, -np.inf], np.nan).fillna(0)
y_test = y_test.values.ravel()
Isolation_forest_algorithm()

# Activate this call separately for the regression plot
# regression()
