# Capstone Project: Credit card fraud detection using ML
# Understanding data

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import math
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from datetime import datetime, date
from pylab import rcParams
import seaborn as sns
import haversine as hs
import warnings
warnings.filterwarnings('ignore')


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# load the train and test splits of the fraud-detection dataset
fraudtest = pd.read_csv('/kaggle/input/fraud-detection/fraudTest.csv')
fraudtrain = pd.read_csv('/kaggle/input/fraud-detection/fraudTrain.csv')

# drop the unnamed column (re-assign instead of mutating in place)
fraudtest = fraudtest.drop(columns='Unnamed: 0')
fraudtrain = fraudtrain.drop(columns='Unnamed: 0')

# concatenate both; ignore_index=True gives every row a unique label.
# Without it the labels of fraudtest repeat those of fraudtrain, and any later
# label-aligned assignment into `data` (or a copy of it) picks the wrong rows.
data = pd.concat([fraudtrain, fraudtest], ignore_index=True)

In [None]:
def dataset_overview(df):
    """Print a general summary of ``df`` followed by per-column statistics.

    The first rows are shown with the notebook's ``display`` helper, then a
    table with the number of variables, observations, missing cells,
    duplicated rows and numeric / non-numeric columns is printed, followed by
    one section per non-numeric column and one per numeric column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe. Only ``df`` is inspected; no global frame is used.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    display(df.head())

    observations, variables = df.shape
    total_cells = observations * variables
    missings = int(df.isnull().sum().sum())
    duplicated = int(df.duplicated().sum())
    # Missing cells are a share of all cells, duplicated rows a share of all rows;
    # the guards keep an empty frame from dividing by zero.
    missings_per = round(100 * missings / total_cells, 3) if total_cells else 0.0
    duplicated_per = round(100 * duplicated / observations, 3) if observations else 0.0

    numeric_cols = list(df.select_dtypes(include=[np.number]).columns)
    categorical_cols = list(df.select_dtypes(exclude=[np.number]).columns)

    def _missing_share(n_missing):
        # Percentage of rows of one column that are missing.
        return round(100 * n_missing / observations, 3) if observations else 0.0

    over_df = pd.DataFrame({
        'General Overview': ['Variables', 'Observations', 'Missings cells', 'Missing cells(%)',
                             'Duplicated rows', 'Duplicated rows(%)', 'Num cols', 'Categ cols'],
        '': [variables, observations, missings, missings_per,
             duplicated, duplicated_per, len(numeric_cols), len(categorical_cols)],
    }).set_index('General Overview')
    print(over_df)

    #################### categorical columns  ###############################################################
    print('\nVariable overview\n')
    print('Categorical variables')
    for col in categorical_cols:
        series = df[col]
        n_missing = int(series.isnull().sum())
        modes = series.mode()
        counts = series.value_counts()
        print(col, '\n')
        print(f'unique values:         {len(series.unique())}')
        print(f'Missing values:        {n_missing}')
        print(f'Missing values(%):     {_missing_share(n_missing)}%')
        # mode()/value_counts() are empty for an all-missing column
        print(f'Mode:                  {modes.iloc[0] if len(modes) else None}')
        # positional access: the index holds the category labels, not 0..n
        print(f'Frequency:             {counts.iloc[0] if len(counts) else 0}')
        print(f'Data type:             {series.dtype}\n')

    #################### numerical columns  ###############################################################
    print('\nNumerical variables\n')
    for col in numeric_cols:
        series = df[col]
        n_missing = int(series.isnull().sum())
        print(col, '\n')
        print(f'unique values:     {len(series.unique())}')
        print(f'Missing values:    {n_missing}')
        print(f'Missing values(%): {_missing_share(n_missing)}%')
        print(f'Minimum:           {series.min()}')
        print(f'Median:            {series.median()}')
        print(f'Mean:              {series.mean()}')
        print(f'Max:               {series.max()}')
        print(f'Data type:         {series.dtype}\n')

In [None]:
dataset_overview(data)

From the above we can observe that we have 22 columns and 1852394 observations. 

There are no missing values and no duplicates, but some data types don't correspond to their variables, for example the date columns (trans_date_trans_time and dob).

The next step will be to change those data types and to create some columns which can help us in our analysis.

# Data Cleansing

In [None]:
# Function to calculate the great-circle distance between two addresses
def haversine_vectorize(lon1, lat1, lon2, lat2, radius=6367):
    """Haversine distance between two sets of coordinates.

    Parameters
    ----------
    lon1, lat1 : float or array-like
        Longitude and latitude of the first point(s), in degrees.
    lon2, lat2 : float or array-like
        Longitude and latitude of the second point(s), in degrees.
    radius : float, default 6367
        Sphere radius; the result is in the same unit. The default gives
        kilometres, use 3958 for miles.

    Returns
    -------
    float or array-like
        Distance(s), computed element-wise on whatever NumPy-compatible
        inputs were passed.
    """
    lon1, lat1, lon2, lat2 = (np.radians(v) for v in (lon1, lat1, lon2, lat2))

    delta_lon = lon2 - lon1
    delta_lat = lat2 - lat1

    haver_formula = (np.sin(delta_lat / 2.0) ** 2
                     + np.cos(lat1) * np.cos(lat2) * np.sin(delta_lon / 2.0) ** 2)
    # Rounding can push the term marginally outside [0, 1] for (near-)antipodal
    # points, which would make arcsin return NaN.
    haver_formula = np.clip(haver_formula, 0.0, 1.0)

    return 2 * radius * np.arcsin(np.sqrt(haver_formula))

In [None]:
# parse the date of birth and the transaction timestamp into datetime columns
dates_list = ['trans_date_trans_time', 'dob']
data[dates_list] = data[dates_list].apply(pd.to_datetime)

In [None]:
# get hours from the transaction
data['trans_hour'] = data['trans_date_trans_time'].dt.hour
# days when the transaction occurred
data['day_of_week'] = data['trans_date_trans_time'].dt.day_name()
# period when the transaction occurred
data['year_month'] = data['trans_date_trans_time'].dt.to_period('M')
# the age of the client when the transaction occurred, in whole years.
# pandas 2.x rejects the ambiguous 'Y' timedelta unit, so divide the elapsed
# seconds by the length of a mean Gregorian year (365.2425 days), which is the
# value np.timedelta64(1, 'Y') stands for.
SECONDS_PER_YEAR = 365.2425 * 24 * 3600
data['age'] = np.round((data['trans_date_trans_time'] - data['dob']).dt.total_seconds() / SECONDS_PER_YEAR)
# get the full name
data['names'] = data['first'] + ' ' + data['last']
data = data.drop(['first', 'last'], axis=1)
# bucket city_pop by quantile: lowest 25% -> rural, 25%-75% -> semi_urban, top 25% -> urban
data['residence'] = pd.qcut(data.city_pop, q=[0, .25, .75, 1], labels=['rural', 'semi_urban', 'urban'])

# concatenate the lat and longitude of the client into one column and the same for the merchant location
data['lat_long'] = tuple(zip(*data[['lat','long']].values.T))
data['merch_ad'] = tuple(zip(*data[['merch_lat','merch_long']].values.T))

In [None]:
# create the distance column
data['distance'] = haversine_vectorize(data['long'],data['lat'],data['merch_long'],data['merch_lat'])

# Exploratory Data Analysis (EDA)
## Univariate Analysis

In [None]:
# Distribution graphs (histogram/bar graph) of column data
def plotDistribution(df, nGraphShown, nGraphPerRow):
    """Draw one small distribution plot per low-cardinality column of ``df``.

    Only columns with more than 1 and fewer than 30 distinct values are
    plotted. Columns whose first value is numeric get a histogram, all other
    columns a bar chart of their value counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted.
    nGraphShown : int
        Maximum number of columns to plot.
    nGraphPerRow : int
        Number of plots per row of the grid.
    """
    nunique = df.nunique()
    # For displaying purposes, pick columns that have between 1 and 30 unique values
    df = df[[col for col in df if 1 < nunique[col] < 30]]
    columnNames = list(df)
    nPlots = min(len(columnNames), nGraphShown)
    if nPlots == 0:
        # nothing qualifies; avoid creating an empty figure
        return
    # Integer ceiling division: plt.subplot needs an int row count, and the grid
    # only has to hold the plots that are actually drawn.
    nGraphRow = (nPlots + nGraphPerRow - 1) // nGraphPerRow
    plt.figure(num = None, figsize = (6 * nGraphPerRow, 8 * nGraphRow), dpi = 80, facecolor = 'w', edgecolor = 'k')
    for i in range(nPlots):
        ax = plt.subplot(nGraphRow, nGraphPerRow, i + 1)
        columnDf = df.iloc[:, i]
        if (not np.issubdtype(type(columnDf.iloc[0]), np.number)):
            columnDf.value_counts().plot.bar(ax = ax)
        else:
            columnDf.hist(ax = ax)
        plt.ylabel('counts')
        plt.xticks(rotation = 90)
        plt.title(f'{columnNames[i]} (column {i})')
    plt.tight_layout(pad = 1.0, w_pad = 1.0, h_pad = 1.0)
    plt.show()

In [None]:
sns.set(style='whitegrid')
plotDistribution(data, 10, 3)

- Gas and transport has the highest number of transactions, and travel the lowest
- Females made fewer transactions than males
- We can see an imbalance in this dataset between fraudulent and non-fraudulent transactions
- We can see that more transactions are made at night
- During the week, more transactions are made on Monday and Sunday
- We can see that more transactions are made at the end of the year
- Last but not least, we can see that our columns are all right-skewed

## Bivariate Analysis

In [None]:
data[(data['is_fraud']==1)].category.value_counts(normalize= True, ascending= False).plot(kind='bar');

In [None]:
data[(data['is_fraud']==0)].category.value_counts(normalize= True, ascending= False).plot(kind='bar');

In [None]:

# Histograms of transaction amounts up to 1000: all, non-fraud and fraud
capped = data[data['amt'] <= 1000]
panels = [
    ('Overall Amount Distribution', capped['amt']),
    ('Non Fraud Amount Distribution', capped[capped['is_fraud'] == 0]['amt']),
    ('Fraud Amount Distribution', capped[capped['is_fraud'] == 1]['amt']),
]

fig, ax = plt.subplots(1, 3, figsize=(20, 5))
for axis, (panel_title, amounts) in zip(ax, panels):
    axis.hist(amounts, bins=50)
    axis.set_title(panel_title)
    axis.set_xlabel('Transaction Amount')

# only the left-most panel carries the shared y label
ax[0].set_ylabel('number of Transactions')
plt.show()

In [None]:
# Compare the amount distribution of both classes for amounts between 200 and 2000
bins = np.linspace(200, 2000, 100)
# alpha < 1 keeps both histograms visible where they overlap
plt.hist(data[(data['is_fraud']==0)]['amt'], bins, alpha=0.5, density=True, label='Non Fraud')
plt.hist(data[(data['is_fraud']==1)]['amt'], bins, alpha=0.5, density=True, label='Fraud')

plt.title('Amount by percentage of transactions')

plt.xlabel('Transaction Amount')
# density=True normalises each histogram to unit area, so the axis is a density
plt.ylabel('Density')
# the labels passed to hist() are only shown once a legend is drawn
plt.legend()
plt.show()

In [None]:
# Distribution of the customer-merchant distance: overall, non-fraud and fraud.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — confirm the installed version still provides it.
fig, ax = plt.subplots(1,3,figsize=(20,5))
sns.distplot(data['distance'], ax= ax[0])# distance distribution of all transactions
sns.distplot(data[(data['is_fraud']==0)].distance, ax= ax[1]) # distance distribution for non-fraudulent transactions
sns.distplot(data[(data['is_fraud']==1)].distance, ax= ax[2]) # distance distribution for fraudulent transactions

ax[0].set_title('Overall transaction vs distance Distribution')
ax[1].set_title('Non Fraud transaction vs distance Distribution')
ax[2].set_title('Fraud transaction vs distance Distribution')

plt.show()

In [None]:
# Distribution of the unix_time column: overall, non-fraud and fraud.
# NOTE(review): the titles speak of "time between another transaction", but the
# plotted column is unix_time itself — confirm which quantity was intended.
fig, ax = plt.subplots(1,3,figsize=(20,5))
sns.distplot(data['unix_time'], ax= ax[0])# unix_time distribution of all transactions
sns.distplot(data[(data['is_fraud']==0)].unix_time, ax= ax[1]) # unix_time distribution for non-fraudulent transactions
sns.distplot(data[(data['is_fraud']==1)].unix_time, ax= ax[2]) # unix_time distribution for fraudulent transactions
ax[0].set_title('Overall transaction vs time between another transaction Distribution')
ax[1].set_title('Non Fraud transaction vs time between another transaction Distribution')
ax[2].set_title('Fraud transaction vs time between another transaction Distribution')

plt.show()

In [None]:
fraud = data[(data['is_fraud']==1)] # fraud df
not_fraud = data[(data['is_fraud']==0)] # non fraud dataframe


def _annotate_share(axis, total, fontsize=12):
    """Write each bar's height as a percentage of ``total`` above the bar."""
    for patch in axis.patches:
        height = patch.get_height()
        axis.text(patch.get_x() + patch.get_width() / 2.,
                  height + 3,
                  '{:1.2f}%'.format(height / total * 100),
                  ha="center", fontsize=fontsize)


# Unique transactions per month for each class. Both series stay indexed by
# year_month, so the x axis shows the months in chronological order instead of
# anonymous row numbers.
fraud_per_month = fraud.groupby('year_month')['trans_num'].nunique()
not_fraud_per_month = not_fraud.groupby('year_month')['trans_num'].nunique()

fig, (ax, ax1) = plt.subplots(2, 1, figsize=(20, 20))
fraud_per_month.plot.bar(ax=ax)
not_fraud_per_month.plot.bar(ax=ax1)

_annotate_share(ax, len(fraud))
_annotate_share(ax1, len(not_fraud))

ax.set_title('fraudulent transaction by period')
# this panel only contains the non-fraudulent rows
ax1.set_title('non-fraudulent transaction by period')
ax.set_ylabel('# of transactions')
ax1.set_ylabel('# of transactions')
plt.tight_layout()
plt.show()

In [None]:
# Unique transactions per weekday, shown in calendar order (groupby alone
# would sort the day names alphabetically).
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
per_weekday = (data.groupby(data['day_of_week'])['trans_num'].nunique()
                   .reindex(weekday_order, fill_value=0)
                   .to_frame())
ax = per_weekday.plot.bar(figsize=(20,10))
ax.set_ylabel('# of all the transactions')
ax.set_xlabel('days of a week')
ax.set_title('Week days vs all transaction')

# label every bar with its share of all transactions
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x()+p.get_width()/2.,
            height + 3,
            '{:1.2f}%'.format(height/len(data)*100),
            ha="center", fontsize=15)
plt.show()

In [None]:
# Unique fraudulent transactions per weekday, shown in calendar order
# (groupby alone would sort the day names alphabetically).
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
fraud_per_weekday = (fraud.groupby(fraud['day_of_week'])['trans_num'].nunique()
                          .reindex(weekday_order, fill_value=0)
                          .to_frame())
ax = fraud_per_weekday.plot.bar(figsize=(20,10))
ax.set_ylabel('# of fraudulent transactions')
ax.set_xlabel('days of a week')
ax.set_title('Week days vs fraudulent transaction')

# label every bar with its share of all fraudulent transactions
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x()+p.get_width()/2.,
            height + 3,
            '{:1.2f}%'.format(height/len(fraud)*100),
            ha="center", fontsize=15)
plt.show()

In [None]:
# Unique fraudulent transactions per residence class, each bar labelled with
# its share of all fraudulent transactions.
fraud_per_residence = (
    fraud.groupby(fraud['residence'])['trans_num']
         .nunique()
         .reset_index()
         .set_index('residence')
)
ax = fraud_per_residence.plot.bar(figsize=(20,10))
n_fraud = len(fraud)
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.text(bar_centre, bar_height + 3,
            '{:1.2f}%'.format(bar_height / n_fraud * 100),
            ha="center", fontsize=15)
ax.set_title('residence vs transaction numbers')
plt.show()

- From the above figures we observed that the POS and net purchases have a higher share of fraudulent transactions compared to the others
- There is not a big difference between the distributions of the fraudulent and the non-fraudulent transactions
- Looking at the transaction periods, there were more fraudulent transactions in 2019
- Within a week, there were more fraudulent transactions on Monday
- Comparing by area, there were more fraudulent transactions in the semi-urban areas than in the others

## Multivariate analysis

In [None]:
plt.figure(figsize=(20,20))
# `data` also holds string, datetime, period, categorical and tuple columns;
# pandas 2.x raises on them instead of silently skipping, so correlate the
# numeric columns only.
sns.heatmap(data.select_dtypes(include=[np.number]).corr(), annot=True, cmap="YlGnBu")
plt.show()

As we can see here, there is a high correlation between the location variables (longitude and latitude of the clients and of the merchants).

In [None]:
import plotly.express as px 

df2_fraud = data[data['is_fraud'] == 1]

fig = px.scatter_mapbox(df2_fraud, lat="lat", lon="long", hover_name="city",
                         zoom=3, height=500,
                         color="is_fraud",  color_discrete_sequence=px.colors.cyclical.IceFire)
fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

In [None]:
dfm_fraud = data[data['is_fraud'] == 1]

fig = px.scatter_mapbox(dfm_fraud, lat="merch_lat", lon="merch_long", hover_name="city",
                         zoom=3, height=500,
                         color="is_fraud",  color_discrete_sequence=px.colors.cyclical.IceFire)
fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

What we can observe from the above plots is that more fraudulent transactions occur in the east of the United States, where there are more merchants as well.

# Data Preparation

In [None]:
dframe = data.copy()
dframe = dframe.drop(['cc_num','trans_date_trans_time','names', 'merchant','trans_num','street', 'city', 'state', 'zip', 'lat', 'long', 'city_pop','dob', 'merch_lat', 'merch_long','lat_long',
       'merch_ad','job','year_month'],axis=1)

In [None]:
# creating dummy variables for some of the categorical variables and dropping the first level of each
dummy_cols = ['category', 'day_of_week', 'gender', 'residence']
# dtype='uint8' keeps the indicators numeric (0/1); the boolean default of
# pandas 2.x turns the design matrix into object dtype once it is mixed with
# floats and handed to statsmodels.
dummy_var1 = pd.get_dummies(dframe[dummy_cols], drop_first=True, dtype='uint8')
# adding the results to the master dataframe
dframe = pd.concat([dframe, dummy_var1], axis=1)
# dropping the original (now encoded) variables; `axis`/`columns` must be passed by keyword
dframe = dframe.drop(columns=dummy_cols)

In [None]:
from sklearn.preprocessing import RobustScaler
# select columns to scale: everything whose maximum exceeds 1
# (this leaves the 0/1 dummies and the is_fraud target untouched)
to_scale = [col for col in dframe.columns if dframe[col].max()>1]
scaler = RobustScaler()
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test rows influence the scaling statistics — consider fitting on
# the training split only.
# Reuse dframe's own index: column assignment aligns on labels, so a fresh
# 0..n-1 index would pair rows with the wrong scaled values whenever dframe's
# labels are not exactly 0..n-1 (e.g. after concatenating two files).
scaled = pd.DataFrame(scaler.fit_transform(dframe[to_scale]), columns=to_scale, index=dframe.index)

# replace original columns with scaled columns
for col in scaled:
    dframe[col] = scaled[col]

In [None]:
#make a copy of this dataframe
df = dframe.copy()

# Model Building
__Attention:__ <br>Here, instead of accuracy, we are mostly interested in the recall score, because that is the metric that will help us capture the most fraudulent transactions.

In [None]:
# import libraries needed for this step
from sklearn.model_selection import train_test_split # train-test split
from sklearn.metrics import confusion_matrix, classification_report,roc_auc_score,roc_curve # classification metrics
from imblearn.over_sampling import SMOTE # SMOTE
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler # scaling methods

import sklearn.neighbors
from sklearn.model_selection import GridSearchCV # grid search cross validation
from sklearn.model_selection import RandomizedSearchCV # randomized search cross validation

# supervised learning algorithms
from sklearn.linear_model import LogisticRegression # Logistic Regression
from sklearn.neighbors import KNeighborsClassifier # K-Nearest Neighbbors
from sklearn.naive_bayes import GaussianNB # Gaussain Naive Bayes
from sklearn.tree import DecisionTreeClassifier # Decision Tree
from sklearn.ensemble import RandomForestClassifier # Random Forest
from sklearn.ensemble import AdaBoostClassifier # Adaptive Boosting Classifier
from sklearn.ensemble import BaggingClassifier # Bootstrap Aggregating Classifier
from xgboost import XGBClassifier
import statsmodels.api as sm # estimates statistical models
from sklearn.feature_selection import RFE #Recursive Feature Elimination for feature selection
from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support as score
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
X = df.drop(['is_fraud'],axis=1) 
y = df['is_fraud'] #target variable

## Test Train Split

In [None]:
# split the dataset into training set and testing set;
# stratify on the target so that the rare fraud class keeps the same share in both splits
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3, random_state=42, stratify=y)

## resampling using SMOTE

In [None]:
# oversample the minority class of the training split only; a fixed seed keeps
# the synthetic samples (and every model trained on them) reproducible
smote = SMOTE(random_state=42)
# to_numpy() replaces the deprecated Series.ravel() and still yields a plain array
X_train_new, y_train_new = smote.fit_resample(X_train, y_train.to_numpy())


# to demonstrate the effect of SMOTE over imbalanced datasets
fig, (ax1, ax2) = plt.subplots(ncols = 2, figsize =(15, 5))
ax1.set_title('Before SMOTE')
pd.Series(y_train).value_counts().plot.pie(autopct='%.1f%%',ax=ax1)
ax1.yaxis.set_major_formatter(mtick.PercentFormatter())
ax2.set_title('After SMOTE')  
pd.Series(y_train_new).value_counts().plot.pie(autopct='%.1f%%',ax=ax2)
ax2.yaxis.set_major_formatter(mtick.PercentFormatter())
plt.show()

In [None]:
# let's see the correlation matrix
plt.figure(figsize= (30, 10))
sns.heatmap(df.corr(), annot=True, cmap= 'GnBu')
plt.show()

We can see that there aren't any variables with a high correlation.
## Logistic Regression

In [None]:
# logistic regression model
logml = sm.GLM(y_train_new,(sm.add_constant(X_train_new)), family=sm.families.Binomial())
logml.fit().summary()

All the p-values above are statistically significant. <br>
In the next step, we are going to check which variables have a higher importance for this model than the others.

### Feature selection using RFE

In [None]:
# look for the best describing features for the logistic regression model using RFE
logreg = LogisticRegression()
# n_features_to_select is keyword-only in current scikit-learn releases
rfe = RFE(logreg, n_features_to_select=15)
rfe = rfe.fit(X_train_new, y_train_new)

In [None]:
# Fit the plain logistic regression on ALL resampled training features
# (not only the columns selected by RFE).
logreg.fit(X_train_new, y_train_new)
# Class predictions for the held-out test split; reused by the test-set report below.
y_test_pred = logreg.predict(X_test)

In [None]:
# have a look on the list of features and the ranking they gets 
list(zip(X_train_new.columns, rfe.support_, rfe.ranking_))

In [None]:
# extract all the columns selected by rfe as best variable for our model
col = X_train_new.columns[rfe.support_]

In [None]:
#remove those with less important features
X_train_new.columns[~rfe.support_]

In [None]:
# one row per feature with the RFE selection flag and the RFE rank
feature_imp = pd.DataFrame(list(zip(X_train_new.columns, rfe.support_, rfe.ranking_)), columns=['variables','selected','rank'])
feature_imp = feature_imp.set_index('variables')

plt.figure(figsize=[15,6])
# RFE assigns rank 1 to every selected feature; larger ranks were eliminated earlier
plt.title('RFE feature ranking (1 = selected)')
sns.barplot(x='rank', y=feature_imp.index.values, data=feature_imp, 
            order=feature_imp.sort_values(by='rank', ascending=False).index.values, palette='rocket', )
# the x axis carries the rank, the y axis the feature names
plt.xlabel('RFE rank')
plt.ylabel('Feature Name');

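From the plot above we can observe that the age variable has a rank above 12, while the variables from category_kids_pets up to category_misc_pos have a very low rank. Note that RFE gives rank 1 to the selected features and larger ranks to features that were eliminated earlier, so a low rank means a more important feature and a high rank a less important one.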

In [None]:
# build a logistic regression using only the variables selected using rfe
X_train_sm = sm.add_constant(X_train_new[col])

logml2 = sm.GLM(y_train_new, X_train_sm, family=sm.families.Binomial())

res = logml2.fit()

res.summary()

In [None]:
## getting the predicted values on the train set
y_train_pred = res.predict(X_train_sm)
y_train_pred =y_train_pred.values.reshape(-1)

In [None]:
# put in the dataframe the probability calculated for fraud
y_train_pred_final = pd.DataFrame({'fraud':y_train_new, 'fraud_Prob': y_train_pred})
y_train_pred_final.head()

In [None]:
# creating new column 'predicted' with 1 if fraud_Prob > 0.5 else 0
# https://smallbiztrends.com/2019/12/payment-fraud-statistics.html
y_train_pred_final['predicted'] = y_train_pred_final.fraud_Prob.map(lambda x: 1 if x > 0.5 else 0)
# show a random sample of 50 rows to eyeball labels against predictions
y_train_pred_final.sample(50)

In [None]:
print(confusion_matrix(y_train_pred_final.fraud, y_train_pred_final.predicted))
print(classification_report(y_train_pred_final.fraud, y_train_pred_final.predicted))

In [None]:
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

## Decision Tree

In [None]:
#Building Decision Tree Model
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier(criterion = 'gini', max_depth = 20, random_state=0)
dt_clf.fit(X_train_new, y_train_new)
pred_train = dt_clf.predict(X_train_new)

print(confusion_matrix(y_train_new, pred_train))
print(classification_report(y_train_new, pred_train))

In [None]:
pred_test = dt_clf.predict(X_test)

print(confusion_matrix(y_test, pred_test))
print(classification_report(y_test, pred_test))

## Random-Forest Classifier

In [None]:
# Building Random Forest Model
# random_state fixes the bootstrap samples and feature sub-sampling so the
# reported scores are reproducible (the decision tree above is seeded as well)
rf_clf = RandomForestClassifier(n_estimators = 50,max_depth = 20,verbose = 1, random_state = 0)
rf_clf.fit(X_train_new, y_train_new)
pred_train = rf_clf.predict(X_train_new)

# performance on the (resampled) training data
print(confusion_matrix(y_train_new, pred_train))
print(classification_report(y_train_new, pred_train))

In [None]:
pred_test = rf_clf.predict(X_test)

print(confusion_matrix(y_test, pred_test))
print(classification_report(y_test, pred_test))

## Gradient Boosting Model

In [None]:
# Building XG Boost Model

# `verbosity` is the XGBoost logging parameter; `verbose` is not a model
# parameter and is only reported as unused. random_state keeps runs reproducible.
xbt_model = XGBClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3, verbosity = 1, random_state = 0)
xbt_model.fit(X_train_new, y_train_new)

# performance on the (resampled) training data
pred_train = xbt_model.predict(X_train_new)
print(confusion_matrix(y_train_new, pred_train))
print(classification_report(y_train_new, pred_train))

In [None]:
pred_test = xbt_model.predict(X_test)
print(confusion_matrix(y_test, pred_test))
print(classification_report(y_test, pred_test))

# Business Impact

## Cost Benefit Analysis

In [None]:
data['ques'] = 1

In [None]:
# Monthly row counts come straight from groupby().size(), so no helper column
# has to be added to `data` first.
monthly_transactions = data.groupby('year_month').size()
monthly_fraud = fraud.groupby('year_month').size()
print(f"Average number of transactions per month: {round(monthly_transactions.mean(),2)}")
print(f"Average number of fraudulent transaction per month: {round(monthly_fraud.mean(),2)}")
print(f"Average amount per fraud transaction: {round(fraud['amt'].mean(),2)}")