In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the credit-card transactions CSV into a DataFrame.
df = pd.read_csv('../input/creditcardfraud/creditcard.csv')

In [None]:
# Preview the first ten rows.
df.head(10)

In [None]:
# List the column names.
df.columns


Due to confidentiality issues, the original features and background information about the data are not provided.
Features V1, V2, …, V28 are the principal components obtained with PCA.

In [None]:
# (number of rows, number of columns)
df.shape

There are 284807 rows and 31 columns (the features plus the `Class` label).

In [None]:
# Per-column dtype and non-null count summary.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for each numeric column.
df.describe()

In [None]:
# One histogram per column, for a first look at each distribution.
df.hist(figsize = (20, 20))
plt.show()

In [None]:
# Analyse the fraud and not-fraud cases separately.
# Summary statistics of Time for the rows labelled Class == 0.
print("Valid Transactions:")
df.loc[df['Class'] == 0, 'Time'].describe()


In [None]:
# Summary statistics of Amount for the rows labelled Class == 0.
df.loc[df['Class'] == 0, 'Amount'].describe()

In [None]:
# Summary statistics of Time for the rows labelled Class == 1.
print("Fraudelent Transactions:")
df.loc[df['Class'] == 1, 'Time'].describe()

We can see that the number of fraud cases is very small compared to the number of valid transactions, which shows that this is an imbalanced dataset.

In [None]:
# Summary statistics of Amount for the rows labelled Class == 1.
df.loc[df['Class'] == 1, 'Amount'].describe()

We can also notice that the minimum amount for fraudulent transactions is 0, while for valid transactions it is 406. Most of the fraud amounts tend to be lower than those of valid transactions: even the maximum fraud amount is only 2125, whereas for valid transactions it is 25691.
The graph below also shows that the class distribution is clearly skewed.

In [None]:
# Bar colours, applied in the order the Class values are plotted.
colors = ["green", "red"]
# Pass the column via `x=`: seaborn >= 0.12 only accepts `data` positionally,
# so countplot('Class', data=df) fails there.
sns.countplot(x='Class', data=df, palette=colors)
plt.title('Frequency of fraud and valid transactions', fontsize=14)

In [None]:
# OUTLIERS
# Inter-quartile range of every column.
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1

# A value is counted as an outlier when it lies more than 1.5 * IQR
# below the first quartile or above the third quartile of its column.
below_lower_fence = df < (Q1 - 1.5 * IQR)
above_upper_fence = df > (Q3 + 1.5 * IQR)
outlier_mask = below_lower_fence | above_upper_fence

# Total number of outlying cells across the entire dataset.
print("Count of outliers in the dataset=", outlier_mask.sum().sum())

In [None]:
# Split the rows by label: Class == 1 is fraud, Class == 0 is not fraud.
fraud = df.loc[df['Class'].eq(1)]
not_fraud = df.loc[df['Class'].eq(0)]

# Share of fraud rows in the whole dataset, as a percentage.
fraud_perc = len(fraud) / len(df) * 100
fraud_perc

In [None]:
# Divisor applied to Time before plotting (3600 — presumably seconds per hour,
# i.e. Time is assumed to be in seconds; confirm against the data description).
seconds = 3600

plt.figure(figsize=(15, 10))
# Draw the not-fraud points first so the fraud points are plotted on top of them.
for subset, opacity, name in ((not_fraud, 0.6, 'Not_Fraud'), (fraud, 0.9, 'Fraud')):
    plt.scatter(subset.Time / seconds, subset.Amount, alpha=opacity, label=name)

plt.title("Transaction amount per hour")
plt.xlabel("Transaction time ")
plt.ylabel('Amount (USD)')
plt.legend(loc='upper left')
plt.show()

We can see that most of the frauds have a lower amount than valid transactions.

In [None]:
# First ten rows of the pairwise Pearson correlation matrix.
df.corr(method="pearson").head(10)

In [None]:
# Heatmap of the pairwise correlation matrix; the colour scale is capped at 0.8.
corrmat = df.corr()
fig = plt.figure(figsize = (12, 9))
sns.heatmap(corrmat, vmax = .8, square = True)
plt.show()

# Unsupervised Algorithms

Local Outlier Factor

In [None]:
from sklearn.metrics import classification_report, accuracy_score
from sklearn.neighbors import LocalOutlierFactor

# Feature matrix (every column except the label) and the ground-truth labels.
x1 = df.drop('Class', axis=1)
y1 = df['Class']

# `contamination` is the proportion of outliers in the *whole* data set, so the
# fraud count is divided by the total row count, not by the not-fraud count.
outlier_frac = len(fraud) / float(len(df))

lof = LocalOutlierFactor(n_neighbors=20, contamination=outlier_frac)

# fit_predict returns 1 for inliers and -1 for outliers; remap to the
# dataset's convention of 0 = valid and 1 = fraud.
y_pred1 = np.where(lof.fit_predict(x1) == -1, 1, 0)

# Number of rows whose predicted label differs from the true label.
n_errors = (y_pred1 != y1).sum()
print('LOCAL OUTLIER FACTOR: {}'.format(n_errors))
print('Accuracy:', accuracy_score(y1, y_pred1))
print('Classification report')
print(classification_report(y1, y_pred1))

Isolation Forest

In [None]:
from sklearn.ensemble import IsolationForest

# max_samples is set to the full row count of x1.
ifor = IsolationForest(
    max_samples=len(x1),
    contamination=outlier_frac,
    random_state=1,
)
ifor.fit(x1)
ifor_scores = ifor.decision_function(x1)

# predict returns 1 for inliers and -1 for outliers; remap to the
# dataset's convention of 0 = valid and 1 = fraud.
y_pred2 = np.where(ifor.predict(x1) == -1, 1, 0)

# Number of rows whose predicted label differs from the true label.
n_errors = (y_pred2 != y1).sum()
print('ISOLATION FOREST: {}'.format(n_errors))
print('Accuracy:', accuracy_score(y1, y_pred2))
print('Classification report')
print(classification_report(y1, y_pred2))

# Supervised Algorithms

Credit Card Fraud Detection using **Random Forest**

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, f1_score, classification_report, balanced_accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error
import joblib
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE


In [None]:
# Standardise the Amount column.
# NOTE(review): the scaler is fitted on the whole data set, i.e. before the
# train/test split in a later cell, so test-set statistics leak into the
# training features. Consider fitting it on the training rows only.
df['Amount'] = StandardScaler().fit_transform(
    df['Amount'].to_numpy().reshape(-1, 1)
).ravel()

# errors='ignore' keeps this cell re-runnable once Time has already been dropped.
df = df.drop(columns=['Time'], errors='ignore')
print(df.head())

# Row counts per class after preprocessing.
df_non_fraud = df[df['Class'] == 0]
df_fraud = df[df['Class'] == 1]
print(df_non_fraud.shape)
print(df_fraud.shape)

In [None]:
# Features are every column but the last; the label is the last column.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [None]:
# Hold out 30% of the rows for testing. stratify=y keeps the class ratio the
# same in both parts, which matters because the fraud class is very rare.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
smote = SMOTE(random_state=42)
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

# Oversample the training part only; the test part is left untouched.
x_train_ov, y_train_ov = smote.fit_resample(x_train, y_train)
print(x_train_ov.shape)
print(y_train_ov.shape)

In [None]:
# Preview the first rows of the oversampled training features.
print(x_train_ov.head())

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest model creation. A fixed random_state makes the fitted forest
# (and therefore every metric reported below) reproducible between runs;
# n_jobs=-1 builds the trees in parallel on all available cores.
rfc = RandomForestClassifier(random_state=42, n_jobs=-1)
rfc.fit(x_train_ov, y_train_ov)

# Predictions on the held-out test part.
y_pred = rfc.predict(x_test)

In [None]:
# Evaluate the random forest classifier on the held-out test part.
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.metrics import confusion_matrix

print("The model used is Random Forest classifier")
# Every scorer is called as (y_true, y_pred), matching the scikit-learn
# signature and the other metric calls in this cell.
acc = accuracy_score(y_test, y_pred)
print("The accuracy is  {}".format(acc))
prec = precision_score(y_test, y_pred)
print("The precision is {}".format(prec))
rec = recall_score(y_test, y_pred)
print("The recall is {}".format(rec))
f1 = f1_score(y_test, y_pred)
print("The F1-Score is {}".format(f1))
MCC = matthews_corrcoef(y_test, y_pred)
print("The Matthews correlation coefficient is {}".format(MCC))

# Confusion matrix: rows are the true class, columns the predicted class.
LABELS = ['Normal', 'Fraud']
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(12, 12))
sns.heatmap(conf_matrix, xticklabels=LABELS, yticklabels=LABELS, annot=True, fmt="d");
plt.title("Confusion matrix")
plt.ylabel('True class')
plt.xlabel('Predicted class')
plt.show()

Credit Card Fraud Detection using **SVM**

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score, precision_recall_curve
from sklearn.metrics import roc_auc_score, roc_curve, auc, average_precision_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from mlxtend.plotting import plot_confusion_matrix
import itertools

In [None]:
# Reload the raw CSV: an earlier cell overwrote `df` (Amount scaled, Time dropped).
data = pd.read_csv('../input/creditcardfraud/creditcard.csv')
# read_csv already returns a DataFrame, so no conversion is needed.
df = data

In [None]:
# Pairwise correlation coefficients between all columns
# (Pearson, the standard correlation coefficient and pandas' default method).
df_corr = df.corr(method='pearson')

In [None]:
# Correlation coefficient of every column with the Class label.
rank = df_corr['Class']

# Rank the columns by the absolute value of that coefficient, strongest first,
# then discard entries whose coefficient is missing (NaN).
df_rank = (
    pd.DataFrame(rank)
    .abs()
    .sort_values(by='Class', ascending=False)
    .dropna()
)

In [None]:
# Separate the data into two groups: a train dataset and a test dataset.

# Build the train dataset from the first 150000 rows of the original dataset.
df_train_all = df[0:150000]
df_train_1 = df_train_all[df_train_all['Class'] == 1]  # fraud rows
df_train_0 = df_train_all[df_train_all['Class'] == 0]  # non-fraud rows
print('In this dataset, we have ' + str(len(df_train_1)) +" frauds so we need to take a similar number of non-fraud")

# Undersample the non-fraud rows. The fixed seed keeps the sample (and hence the
# trained model) reproducible between runs.
df_sample = df_train_0.sample(300, random_state=42)

# Gather the frauds with the sampled non-frauds. DataFrame.append was removed
# in pandas 2.0; pd.concat is the supported way to stack the two frames.
df_train = pd.concat([df_train_1, df_sample])

# Shuffle the rows (seeded for reproducibility).
df_train = df_train.sample(frac=1, random_state=42)

In [None]:
# Training features: every column except Time and the Class label, as an array.
X_train = np.asarray(df_train.drop(columns=['Time', 'Class']))
# Training labels, as an array.
y_train = np.asarray(df_train['Class'])

In [None]:
############################## with all the test dataset to see if the model learn correctly ##################
df_test_all = df[150000:]

X_test_all = df_test_all.drop(['Time', 'Class'],axis=1)
y_test_all = df_test_all['Class']
X_test_all = np.asarray(X_test_all)
y_test_all = np.asarray(y_test_all)

In [None]:
# Training features restricted to the ranked columns at positions 1..10 of
# df_rank (position 0 is skipped), converted to an array.
X_train_rank = np.asarray(df_train.loc[:, df_rank.index[1:11]])

In [None]:
############################## with all the test dataset to see if the model learn correctly ##################
X_test_all_rank = df_test_all[df_rank.index[1:11]]
X_test_all_rank = np.asarray(X_test_all_rank)
y_test_all = np.asarray(y_test_all)

In [None]:
# Binary label names as strings: '0' (no fraud) and '1' (fraud).
class_names = np.array([str(label) for label in (0, 1)])

In [None]:
# Function to plot the confusion matrix.
# Note: defining it here rebinds the `plot_confusion_matrix` name that an
# earlier cell imports from mlxtend.plotting.
def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated colour image on the current figure.

    Parameters
    ----------
    cm : 2-D array of integer counts (each cell is formatted with 'd').
    classes : sequence of tick labels, used on both axes.
    title : text placed above the plot.
    cmap : matplotlib colormap used for the image.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    # Cells above half of the largest count get white text, the others black,
    # so the number stays readable against the cell colour.
    cutoff = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            count = cm[row, col]
            text_colour = "white" if count > cutoff else "black"
            plt.text(col, row, format(count, 'd'),
                     horizontalalignment="center",
                     color=text_colour)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
classifier = svm.SVC(kernel='linear') # SVM classifier with a linear kernel (set explicitly, so SVC's default RBF kernel is NOT used)

In [None]:
classifier.fit(X_train, y_train) # Train the model on the undersampled training arrays.

In [None]:
prediction_SVM_all = classifier.predict(X_test_all) # Predict the label of every row of the test set.

In [None]:
# Confusion matrix of the SVM predictions on the test set, then plot it.
cm = confusion_matrix(y_true=y_test_all, y_pred=prediction_SVM_all)
plot_confusion_matrix(cm, classes=class_names)

In [None]:
# Row 1 of the confusion matrix holds the true frauds:
# column 1 = detected, column 0 = missed.
detected_frauds = cm[1][1]
total_frauds = cm[1][1] + cm[1][0]
correct_predictions = cm[0][0] + cm[1][1]
total_predictions = sum(cm[0]) + sum(cm[1])

print('We have detected ' + str(detected_frauds) + ' frauds / ' + str(total_frauds) + ' total frauds.')
print('\nSo, the probability to detect a fraud is ' + str(detected_frauds / total_frauds))
print("the accuracy is : " + str(correct_predictions / total_predictions))

In [None]:
# Evaluate the SVM classifier on the test set.
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.metrics import confusion_matrix

print("The model used is SVM")
# Every scorer is called as (y_true, y_pred), matching the scikit-learn
# signature and the other metric calls in this cell.
acc = accuracy_score(y_test_all, prediction_SVM_all)
print("The accuracy is  {}".format(acc))
prec = precision_score(y_test_all, prediction_SVM_all)
print("The precision is {}".format(prec))
rec = recall_score(y_test_all, prediction_SVM_all)
print("The recall is {}".format(rec))
f1 = f1_score(y_test_all, prediction_SVM_all)
print("The F1-Score is {}".format(f1))
