# Credit Card Fraud Detection

This notebook uses machine learning and predictive modeling to help detect fraudulent credit card transactions. The data comes from credit card transactions made by European cardholders in September 2013 and can be found here: https://www.kaggle.com/mlg-ulb/creditcardfraud. 

The columns with the first five rows can be seen below in the next section.

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.datasets import make_classification
from sklearn import ensemble

# Location of the credit-card fraud CSV inside the Kaggle notebook environment.
kaggleFile = r'/kaggle/input/creditcardfraud/creditcard.csv'

# Read the whole file into a DataFrame and preview its first five rows.
data = pd.read_csv(kaggleFile)
print(data.head())

### Find missing data

In [None]:
# Only the numeric columns are inspected (and carried forward as `df`).
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
df = data.select_dtypes(include=numerics)

print("Missing data by column:")
# Per-column share of missing entries, largest first.
findNA = (
    df.isnull()
    .sum()
    .sort_values(ascending=False)
    / len(data)
)
print(findNA.head())  # There are no missing values

#### Unbalanced Dataset
Since the fraction of fraudulent transactions is only about 0.0017 (roughly 0.17%), the dataset is highly unbalanced. Accuracy alone is therefore misleading, so the metrics used should be recall, precision, and AUC. 

In [None]:
# Count how many rows fall into each class (1 = fraud, 0 = not fraud).
# `.get(..., 0)` keeps the cell from raising KeyError if a class is absent.
temp = df["Class"].value_counts()
fraud = temp.get(1, 0)
notFraud = temp.get(0, 0)
totalTransactions = fraud + notFraud

# Share of ALL transactions that are fraudulent (fraud / total, not
# fraud / notFraud), kept as a fraction rounded to four decimals.
fraudNumbers = round(fraud / totalTransactions, 4)

# The message reports a percentage, so scale the fraction by 100 before
# printing instead of printing the raw fraction under a "percent" label.
fraudPercent = round(100 * fraud / totalTransactions, 2)
print(fraudPercent, "percent of transactions are fraudelent.")

### Correlations using heatmap
No multicollinearity was detected. The highest correlation between features was V7 and Amount which had a 0.39730 correlation.

In [None]:
def printHeat():
    """Plot a heatmap of the features most correlated with ``Class``.

    Uses the module-level ``df`` frame: computes its correlation matrix,
    keeps every column whose absolute correlation with ``Class`` is above
    0.2, and draws the annotated correlation matrix of those columns on a
    10x10-inch figure. The seaborn Axes object is printed afterwards.
    """
    corr = df.corr()
    highly_corr_features = corr.index[abs(corr["Class"]) > 0.2]
    plt.figure(figsize=(10, 10))
    # Slice the matrix computed above instead of recomputing it from `data`:
    # each correlation depends only on its own pair of columns, so the
    # values are the same and the second pass over the data is avoided.
    selected_corr = corr.loc[highly_corr_features, highly_corr_features]
    heat = sns.heatmap(selected_corr, annot=True, cmap="RdYlGn")
    print(heat)

# Print correlation between features
def printHighCorr(df, features, threshold=0.2):
    """Tabulate feature pairs whose absolute correlation exceeds ``threshold``.

    Args:
        df: DataFrame holding the columns to compare.
        features: Column labels of ``df`` to include in the comparison.
        threshold: Minimum absolute correlation for a pair to be reported.

    Returns:
        A DataFrame indexed by ``"<feature A> and <feature B>"`` with a single
        ``Correlation`` column (rounded to four decimals), sorted from the
        highest to the lowest correlation. When no pair exceeds the threshold
        an empty DataFrame with the same ``Correlation`` column is returned,
        so callers can still chain DataFrame methods such as ``.style``.
    """
    print("Highly correlated variables above: ", threshold)
    corr_df = df[features].corr()

    # Positions of every cell whose absolute correlation is above the threshold.
    rows, cols = np.where(np.abs(corr_df.to_numpy()) > threshold)

    # Keep only the upper triangle (row < col): this drops the diagonal and
    # avoids listing each symmetric pair twice.
    pairs = [(corr_df.iloc[i, j], i, j) for i, j in zip(rows, cols) if i < j]
    pairs.sort(key=lambda item: -abs(item[0]))

    if not pairs:
        print("There are no highly correlated features with correlation above", threshold)
        # Return an empty frame rather than a dict so `.T` / `.style` callers
        # do not crash with AttributeError.
        return pd.DataFrame(columns=['Correlation'], dtype=float)

    labelled = {
        corr_df.index[i] + " and " + corr_df.columns[j]: value
        for value, i, j in pairs
    }
    correlation_df = pd.DataFrame(labelled, index=['Correlation']).round(4)
    return correlation_df.T.sort_values(by='Correlation', ascending=False)

# Comment out either call below to switch that output off.
printHeat()

# Pairwise correlation table, rendered as white text on a black background.
printHighCorr(data, data.columns).style.set_properties(
    **{"background-color": "black", "color": "white"}
)

### Split Data
Use 70% of the data to train the models and hold out the remaining 30% as a test set for evaluating their predictions.

In [None]:
# Target is the fraud label; every other numeric column is a predictor.
y = df['Class']
X = df.drop(columns='Class')

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

 #### Gradient Boost with Feature Importance

In [None]:
# GRADIENT BOOST REGRESSION:
from sklearn.inspection import permutation_importance
from sklearn.ensemble import GradientBoostingClassifier,GradientBoostingRegressor
from sklearn import ensemble
# r2_score is used below but was never imported anywhere in the notebook,
# which made this cell fail with NameError.
from sklearn.metrics import r2_score

# Hyper-parameters for the boosted ensemble.
params = {
    "n_estimators": 500,
    "max_depth": 4,
    "min_samples_split": 5,
    "learning_rate": 0.01,
}

reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)

# Predict once and reuse the result for both metrics.
y_pred = reg.predict(X_test)

# Use the built-in round(), as the rest of the notebook does: it works
# whether r2_score returns a NumPy scalar or a plain Python float, whereas
# the `.round()` method only exists on the former.
gbr_r2 = round(r2_score(y_test, y_pred), 4)
print("Gradient boosting regression r2: ", gbr_r2)

mse = mean_squared_error(y_test, y_pred)
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

# FEATURE IMPORTANCE:
# Left panel: the fitted model's `feature_importances_`, sorted ascending so
# the most important feature ends up at the top of the horizontal bar chart.
cols = X.columns
feature_importance = reg.feature_importances_
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + 0.5
fig = plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.barh(pos, feature_importance[sorted_idx], align="center")
plt.yticks(pos, np.array(cols)[sorted_idx])
plt.title("Feature Importance (MDI)")

# Right panel: permutation importance measured on the held-out test set,
# shown as one box per feature over the 10 shuffling repeats.
result = permutation_importance(
    reg, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2
)
sorted_idx = result.importances_mean.argsort()
plt.subplot(1, 2, 2)
plt.boxplot(
    result.importances[sorted_idx].T,
    vert=False,
    labels=np.array(cols)[sorted_idx],
)
plt.title("Permutation Importance (test set)")
fig.tight_layout()
plt.show()

### Logistic Regression 

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Fit a default-configured logistic regression on the training split.
logReg = LogisticRegression()
logReg.fit(X_train, y_train)

# Hard class predictions for the held-out rows.
predictions = logReg.predict(X_test)

# Column 1 of predict_proba holds the estimated probability that y is 1.
y_pred_proba = logReg.predict_proba(X_test)[:, 1]

# Area under the ROC curve, rounded to four decimals.
auc = round(metrics.roc_auc_score(y_test, y_pred_proba), 4)
print("AUC for logistic regression is: ", auc)

### K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier
import sklearn.metrics as metrics
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

print("starting knn")

# Selecting an optimal K value:
# Only odd values are tried so a neighbour vote cannot end in a tie.
k_values = list(range(1, 10, 2))
error_rates = []
for k in k_values:
    new_model = KNeighborsClassifier(n_neighbors=k)
    new_model.fit(X_train, y_train)
    new_predictions = new_model.predict(X_test)
    # Share of test rows whose predicted label differs from the true label.
    error_rates.append(np.mean(new_predictions != y_test))
print("finished error calculation")

# Plot the error against the actual k values. Plotting `error_rates` alone
# would label the x-axis 0..4 (list positions), not the k that was tried.
plt.figure(figsize=(16, 12))
plt.plot(k_values, error_rates)
plt.xticks(k_values)
plt.xlabel("n_neighbors (k)")
plt.ylabel("Error rate on test set")
# NOTE(review): the original note says the error was lowest "at 1" and the
# code below uses k=1 - confirm against the plot now that the x-axis shows k.

# Train the model and make predictions:
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Performance measurement:
# Optional extra report (disabled, as before):
# print(classification_report(y_test, predictions))

# Column 1 of predict_proba holds the estimated probability that y is 1.
y_pred_proba = knn.predict_proba(X_test)[:, 1]

# Calculate AUC of the model, rounded to four decimals.
auc = round(metrics.roc_auc_score(y_test, y_pred_proba), 4)
print(auc)

# acc = metrics.accuracy_score(y_test, predictions)  # not needed
print(confusion_matrix(y_test, predictions))

#### Resources
1. https://www.kaggle.com/jdelamorena/recall-97-by-using-undersampling-neural-network
2. https://www.kaggle.com/gpreda/credit-card-fraud-detection-predictive-models