# Credit Card Fraud Detection Project

# Import Data

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import svm
# --- Run configuration -------------------------------------------------------

# Random forest settings
RFC_METRIC = 'gini'  # split criterion for RandomForestClassifier
NUM_ESTIMATORS = 100  # number of estimators for RandomForestClassifier
NO_JOBS = 4  # number of parallel jobs for RandomForestClassifier

# Train/validation/test split
VALID_SIZE = 0.20  # simple validation using train_test_split
TEST_SIZE = 0.20  # test size using train_test_split

# Cross-validation
NUMBER_KFOLDS = 5  # number of KFolds for cross-validation

RANDOM_STATE = 2018

# Boosting settings. The original notes label these "lgb"; no LightGBM usage
# is visible in this part of the notebook -- confirm before relying on them.
MAX_ROUNDS = 1000  # maximum boosting iterations
EARLY_STOP = 50  # early-stopping rounds
OPT_ROUNDS = 1000  # to be adjusted based on best validation rounds
VERBOSE_EVAL = 50  # print out the metric result every N rounds

IS_LOCAL = False

# --- Load the data -----------------------------------------------------------
# NOTE(review): absolute, machine-specific path; adjust it when running elsewhere.
filename = '/Users/Mac/Desktop/creditcard.csv'
cc_data = pd.read_csv(filename)
print("CreditCard data provided has rows:",cc_data.shape[0], "columns:", cc_data.shape[1])

XGBoostError: 
XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed
    - vcomp140.dll or libgomp-1.dll for Windows
    - libomp.dylib for Mac OSX
    - libgomp.so for Linux and other UNIX-like OSes
    Mac OSX users: Run `brew install libomp` to install OpenMP runtime.

  * You are running 32-bit Python on a 64-bit OS

Error message(s): ['dlopen(/Users/Mac/opt/anaconda3/lib/python3.7/site-packages/xgboost/lib/libxgboost.dylib, 6): Library not loaded: /usr/local/opt/libomp/lib/libomp.dylib\n  Referenced from: /Users/Mac/opt/anaconda3/lib/python3.7/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: image not found']


In [None]:
# Drop exact duplicate rows in place, then report the remaining shape.
cc_data.drop_duplicates(inplace=True)
n_rows, n_cols = cc_data.shape
print("After removing Duplicates, CreditCard data now has rows:", n_rows, "columns:", n_cols)

## A glimpse of the data by first 5 rows
cc_data.head(n=5)  # preview the first five rows (5 is also pandas' default)

### Distinguishing Fraud and Non-fraud

In [None]:
# Partition the transactions by label: rows with Class == 1 go to `fraud`,
# rows with Class == 0 go to `normal`.
class_label = cc_data['Class']
fraud = cc_data[class_label == 1]
normal = cc_data[class_label == 0]

### To know the count of Fraud and Non-fraud in dataset

In [None]:
# Show (rows, columns) for each class subset.
fraud_shape, normal_shape = fraud.shape, normal.shape
print("Fraud:", fraud_shape, "Normal:", normal_shape)

### Statistical Exploration of the dataset

In [None]:
# Summary statistics of the transaction amount for the fraud subset.
fraud['Amount'].describe()

In [None]:
# Summary statistics of the transaction amount for the normal subset.
normal['Amount'].describe()

### Visual exploration using matplotlib (Histogram Chart)

In [None]:
# Two stacked histograms of transaction amount (fraud on top, normal below)
# that share a single x axis.
f, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True)
f.suptitle('Amount per transaction by class')
bins = 50
for panel, amounts, title in ((ax1, fraud.Amount, 'Fraud'),
                              (ax2, normal.Amount, 'Normal')):
    panel.hist(amounts, bins=bins)
    panel.set_title(title)

# The remaining calls use pyplot's implicit "current axes" rather than an
# explicit axes object.
plt.xlabel('Amount ($)')
plt.ylabel('Number of Transactions')
plt.xlim((0, 20000))
plt.yscale('log')
plt.show()

### Visual exploration using matplotlib (Using Scatter plot)

In [None]:
# Two stacked scatter plots of amount against time (fraud on top, normal
# below) that share a single x axis.
f, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True)
f.suptitle('Time of transaction vs amount by class')
for panel, subset, title in ((ax1, fraud, 'Fraud'), (ax2, normal, 'Normal')):
    panel.scatter(subset.Time, subset.Amount)
    panel.set_title(title)

# Axis labels are set through pyplot's implicit "current axes".
plt.xlabel('Time (in Seconds)')
plt.ylabel('Amount')
plt.show()

### Sampling 90% of the entire data for modelling (it is split into train and test sets further below)

In [None]:
# Draw a reproducible random sample of 90% of the rows to work with.
sample_fraction = 0.9
dataset = cc_data.sample(frac=sample_fraction, random_state=1)
dataset.shape

### Determine the number of fraud and valid transactions in the new dataset

In [None]:
# Split the sampled data by label and compute the ratio of fraud rows to
# valid rows (note: fraud / valid, not fraud / total).
sampled_label = dataset['Class']
Fraud = dataset[sampled_label == 1]
Valid = dataset[sampled_label == 0]
outlier_fraction = len(Fraud) / len(Valid)

In [None]:
# Report the fraud-to-valid ratio followed by the raw counts per class.
n_fraud, n_valid = len(Fraud), len(Valid)
print(outlier_fraction)
print("Fraud Cases : {}".format(n_fraud))
print("Valid Cases : {}".format(n_valid))

### Exploration using Heatmap to determine correlation

In [None]:
# Correlation heatmap of the sampled dataset.
# The pairwise correlation matrix is computed once and reused for the plot;
# recomputing it on the same set of columns yields the same matrix and only
# costs time on a dataset of this size.
corrmat = dataset.corr()
top_corr_features = corrmat.index  # labels of every column present in corrmat
plt.figure(figsize=(20, 20))
# Plot the heat map with the coefficient printed in each cell.
g = sns.heatmap(corrmat, annot=True, cmap="RdYlGn")

### Define X and Y; Data and Target.

In [None]:
# Build the model inputs from all of the data features.
target = dataset['Class']
predictors = dataset[['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
                      'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19',
                      'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28',
                      'Amount']]

# X is a pandas DataFrame and y a pandas Series (not NumPy arrays);
# scikit-learn estimators accept them directly, so `.values` is not needed.
X = predictors
y = target

# Split into train and test data.
# Fraud cases are a tiny share of the rows, so the split is stratified on the
# label to keep the fraud proportion the same in the train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1, stratify=y
)
# Two datasets have now been created: the train data is used to fit the models,
# and the unseen test data is used to evaluate them.


### Building Model

In [None]:
# Different Machine Learning Models will be tried one by one.

# Logistic Regression: fit on the training split, predict on the test split.
lr = LogisticRegression()
lr_pred = lr.fit(X_train, y_train).predict(X_test)

# Score the predictions against the held-out labels.
lr_accuracy = accuracy_score(y_test, lr_pred)
lr_f1 = f1_score(y_test, lr_pred)

print('Accuracy score of the Logistic Regression model is {}'.format(lr_accuracy))
print('F1 score of the Logistic Regression model is {}'.format(lr_f1))

In [None]:
# K-Nearest Neighbors with 6 neighbours: fit on train, predict on test.
KNN = KNeighborsClassifier(n_neighbors=6)
knn_pred = KNN.fit(X_train, y_train).predict(X_test)

# Score the predictions against the held-out labels.
knn_accuracy = accuracy_score(y_test, knn_pred)
knn_f1 = f1_score(y_test, knn_pred)

print('Accuracy score of the K-Nearest Neighbors model is {}'.format(knn_accuracy))
print('F1 score of the K-Nearest Neighbors model is {}'.format(knn_f1))

In [None]:
# Random Forest
# Uses the notebook-level settings declared for this model (criterion, number
# of trees, parallel jobs) and a fixed seed so repeated runs give the same
# forest and therefore the same scores.
RF = RandomForestClassifier(
    max_depth=4,
    criterion=RFC_METRIC,
    n_estimators=NUM_ESTIMATORS,
    n_jobs=NO_JOBS,
    random_state=RANDOM_STATE,
)
RF.fit(X_train, y_train)
RF_pred = RF.predict(X_test)

# Accuracy of Random Forest model.
print('Accuracy score of the Random Forest model is {}'.format(accuracy_score(y_test, RF_pred)))

# Check F1-Score for the Random Forest model.
print('F1 score of the Random Forest model is {}'.format(f1_score(y_test, RF_pred)))

In [None]:
# XGBoost
XGB = XGBClassifier(max_depth = 4)
XGB.fit(X_train, y_train)
# Predict with the fitted estimator bound to `XGB`; the lowercase name `xgb`
# is never defined in this notebook and would raise NameError.
XGB_pred = XGB.predict(X_test)

# Accuracy of XGBoost model.
print('Accuracy score of the XGBoost model is {}'.format(accuracy_score(y_test, XGB_pred)))

# Check F1-Score for the XGBoost model.
print('F1 score of the XGBoost model is {}'.format(f1_score(y_test, XGB_pred)))

In [None]:
# Decision Tree (entropy criterion, depth capped at 4): fit on train,
# predict on test.
DT = DecisionTreeClassifier(criterion='entropy', max_depth=4)
DT_pred = DT.fit(X_train, y_train).predict(X_test)

# Score the predictions against the held-out labels.
dt_accuracy = accuracy_score(y_test, DT_pred)
dt_f1 = f1_score(y_test, DT_pred)

print('Accuracy score of the Decision Tree model is {}'.format(dt_accuracy))
print('F1 score of the Decision Tree model is {}'.format(dt_f1))

# Display the confusion matrix with the class order fixed to [0, 1].
confusion_matrix(y_test, DT_pred, labels=[0, 1])

#With labels = [0, 1], each row is an actual class and each column a predicted class:
#the first row holds the actual normal transactions (class 0) and the second row the actual fraudulent ones (class 1).
#So, in the first row, 68782 are true negatives and 18 are false positives. That says, out of 68782+18=68800 normal transactions,
#68782 were correctly classified as normal and 18 were falsely flagged as fraudulent, although they were normal.