Identifying Outliers Using Standard Deviation

In [19]:
# initial imports
import datetime
import os
import random

import numpy as np
import pandas as pd
import plotly.express as px
from numpy import mean
from numpy import percentile
from numpy import std
from sqlalchemy import create_engine

In [20]:
# create connection to database
# Prefer a connection URL supplied through the FRAUD_DB_URL environment variable so
# that credentials do not have to live in the notebook.
# NOTE(review): the literal fallback keeps existing local runs working, but it still
# embeds the root password in the file; remove it once FRAUD_DB_URL is set everywhere.
connection_string = os.environ.get(
    "FRAUD_DB_URL",
    "mysql://root:123456789@localhost:3306/fraud_detection",
)
engine = create_engine(connection_string)

# load data from the database
def execute_query(query):
    """Run a SQL query and return the result as a DataFrame indexed by ``date``.

    Parameters
    ----------
    query : str
        SQL statement to execute. The result set must contain a ``date``
        column, because that column is parsed as datetime and used as the index.

    Returns
    -------
    pandas.DataFrame
        The query result with ``date`` as its (datetime) index.

    Notes
    -----
    The module-level ``engine`` is used as the connection.
    """
    # parse_dates is documented as a list (or dict) of column names; a bare string
    # is outside that contract and may end up matching other column names too
    return pd.read_sql(sql=query, con=engine, index_col='date', parse_dates=['date'])

In [24]:
# load every card holder's transactions together with the merchant category
# (the query applies no card-holder or date filter)
query = 'SELECT a.id, a.name, b.card, c.date, c.amount, e.name as "category" \
    FROM \
        card_holder a \
        INNER JOIN credit_card b ON a.id = b.id_card_holder \
        INNER JOIN transaction_table c ON b.card = c.card \
        INNER JOIN merchant d ON c.id_merchant = d.id \
        INNER JOIN merchant_category e ON d.id_merchant_category = e.id;'

transaction_df = execute_query(query)
transaction_df.head()

Unnamed: 0_level_0,id,name,card,amount,category
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-04-30 18:50:48,1,Robert Johnson,3517111172421930,5.62,bar
2018-05-05 17:49:05,1,Robert Johnson,3517111172421930,8.42,restaurant
2018-09-09 10:30:47,1,Robert Johnson,3517111172421930,8.31,food truck
2018-05-21 09:19:58,1,Robert Johnson,3517111172421930,4.57,pub
2018-08-21 16:23:34,1,Robert Johnson,3517111172421930,1.1,food truck


In [26]:
# identify outliers based on standard deviation

amounts = transaction_df['amount']

# calculate summary statistics
data_mean, data_std = mean(amounts), std(amounts)

# anything further than three standard deviations from the mean counts as an outlier
cut_off = data_std * 3

lower, upper = data_mean - cut_off, data_mean + cut_off

# boolean masks, computed once and reused for the lists, the counts and the flag column
is_outlier = (amounts > upper) | (amounts < lower)
is_inlier = (amounts >= lower) & (amounts <= upper)

# identify outliers
outliers = amounts[is_outlier].tolist()
print(f'Identified Outliers: {len(outliers)}')

# remove outliers
outliers_removed = amounts[is_inlier].tolist()
print(f'Non-outlier Observations: {len(outliers_removed)}')

# NOTE: this adds (or overwrites) the 'outlier' column on transaction_df in place
transaction_df['outlier'] = is_outlier

outlier = transaction_df[is_outlier]
outlier

Identified Outliers: 77
Non-outlier Observations: 3423


Unnamed: 0_level_0,id,name,card,amount,category,outlier
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-09-26 08:48:40,1,Robert Johnson,4761049645711555811,1060.0,restaurant,True
2018-12-30 23:23:09,1,Robert Johnson,4761049645711555811,1033.0,pub,True
2018-09-06 08:28:55,1,Robert Johnson,4761049645711555811,1017.0,bar,True
2018-09-06 21:55:02,1,Robert Johnson,4761049645711555811,1056.0,restaurant,True
2018-09-04 01:35:39,1,Robert Johnson,4761049645711555811,1790.0,coffee shop,True
...,...,...,...,...,...,...
2018-06-04 03:46:15,25,Nancy Contreras,4319653513507,1162.0,pub,True
2018-12-18 13:33:37,25,Nancy Contreras,4319653513507,1074.0,coffee shop,True
2018-08-16 10:01:00,25,Nancy Contreras,4319653513507,1001.0,food truck,True
2018-06-22 06:16:50,25,Nancy Contreras,4319653513507,1813.0,bar,True


In [30]:
# keep only the standard-deviation outliers made in the early-morning window
# (07:00 to 09:00), largest amounts first
start_time = datetime.time(7, 0, 0)
end_time = datetime.time(9, 0, 0)

anomalous_transactions = (
    outlier
    .between_time(start_time, end_time)
    .sort_values('amount', ascending=False)
)

# one point per transaction: card holder on x, amount on y, coloured by merchant category
px.scatter(
    anomalous_transactions,
    x='name',
    y='amount',
    color='category',
    title='Anomalous Transactions',
)





Identifying Outliers Using Interquartile Range

In [32]:
# identify outliers based on interquartile range

amounts = transaction_df['amount']

# calculate interquartile range (both quartiles in a single pass)
q25, q75 = percentile(amounts, [25, 75])
iqr = q75 - q25
print(f'Percentiles: 25th = {round(q25,3)}, 75th = {round(q75,3)}, IQR = {round(iqr,3)}')

# calculate the outlier cutoff: 1.5 * IQR beyond either quartile
# NOTE: cut_off / lower / upper are rebound here, replacing any earlier values
cut_off = iqr * 1.5
lower, upper = q25 - cut_off, q75 + cut_off

# boolean masks, computed once and reused for the lists, the counts and the flag column
is_outlier = (amounts > upper) | (amounts < lower)
is_inlier = (amounts >= lower) & (amounts <= upper)

# identify outliers
outliers_2 = amounts[is_outlier].tolist()
print(f'Identified Outliers: {len(outliers_2)}')

# remove outliers
outliers_removed_2 = amounts[is_inlier].tolist()
print(f'Non-outlier Observations: {len(outliers_removed_2)}')


# NOTE: this overwrites any existing 'outlier' column on transaction_df with the IQR flags
transaction_df['outlier'] = is_outlier

outlier_2 = transaction_df[is_outlier]
outlier_2

Percentiles: 25th = 3.735, 75th = 14.648, IQR = 10.913
Identified Outliers: 110
Non-outlier Observations: 3390


Unnamed: 0_level_0,id,name,card,amount,category,outlier
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-11-26 20:54:39,1,Robert Johnson,4761049645711555811,267.0,food truck,True
2018-09-26 08:48:40,1,Robert Johnson,4761049645711555811,1060.0,restaurant,True
2018-12-30 23:23:09,1,Robert Johnson,4761049645711555811,1033.0,pub,True
2018-09-06 08:28:55,1,Robert Johnson,4761049645711555811,1017.0,bar,True
2018-09-06 21:55:02,1,Robert Johnson,4761049645711555811,1056.0,restaurant,True
...,...,...,...,...,...,...
2018-12-18 13:33:37,25,Nancy Contreras,4319653513507,1074.0,coffee shop,True
2018-08-16 10:01:00,25,Nancy Contreras,4319653513507,1001.0,food truck,True
2018-10-28 02:12:58,25,Nancy Contreras,4319653513507,137.0,pub,True
2018-06-22 06:16:50,25,Nancy Contreras,4319653513507,1813.0,bar,True


In [33]:
# IQR outliers that fall inside the start_time/end_time window, largest amounts first
anomalous_transactions2 = (
    outlier_2
    .between_time(start_time, end_time)
    .sort_values('amount', ascending=False)
)
anomalous_transactions2

Unnamed: 0_level_0,id,name,card,amount,category,outlier
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-12-07 07:22:03,1,Robert Johnson,4761049645711555811,1894.0,bar,True
2018-03-05 08:26:08,16,Crystal Clark,5570600642865857,1617.0,bar,True
2018-03-06 07:18:09,25,Nancy Contreras,4319653513507,1334.0,bar,True
2018-01-22 08:07:03,16,Crystal Clark,5570600642865857,1131.0,restaurant,True
2018-09-26 08:48:40,1,Robert Johnson,4761049645711555811,1060.0,restaurant,True
2018-09-06 08:28:55,1,Robert Johnson,4761049645711555811,1017.0,bar,True
2018-03-26 07:41:59,9,Laurie Gibbs,30181963913340,1009.0,coffee shop,True
2018-12-14 08:51:41,12,Megan Price,501879657465,748.0,pub,True
2018-04-01 07:17:21,25,Nancy Contreras,4319653513507,100.0,coffee shop,True


In [34]:
# scatter of the early-hour IQR outliers: card holder on x, amount on y,
# coloured by merchant category
px.scatter(
    anomalous_transactions2,
    x='name',
    y='amount',
    color='category',
    title='Early Hour Transactions',
)





Using the standard-deviation method we identified 77 outliers; using the interquartile-range (IQR) method we identified 110. There appear to be fraudulent transactions in the bar category: large amounts were spent at bars between 7 and 9 AM.