In [1]:
import datetime
import time
import statistics

import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from sklearn import neighbors, svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score

import numpy as np
import pandas as pd

# Parse Data

In [2]:
df = pd.read_csv('assignment-data/data_for_student_case.csv', dtype={'bin':str, 'amount':int})
df = df.drop(['txid','bookingdate'], axis='columns')  # We are not interested in these columns (using booking date would be cheating)
df = df.rename(index=str, columns={'issuercountrycode':'issuercountry', 
                                   'bin':'issuer_id', 
                                   'shoppercountrycode':'shoppercountry', 
                                   'shopperinteraction':'interaction', 
                                   'cardverificationcodesupplied':'verification', 
                                   'cvcresponsecode':'cvcresponse', 
                                   'creationdate':'creationdate_stamp', 
                                   'simple_journal':'label'})  # Use column names which are more recognizable

# Skip data if:
df = df[df['label']!='Refused']
df = df[~df['issuer_id'].str.contains('na', case=False)]
df = df[~df['mail_id'].str.contains('na', case=False)]

# Create and format (new) columns
df['creationdate'] = (pd.to_datetime(df['creationdate_stamp'])).dt.date
df['mail_id'] = pd.to_numeric(df['mail_id'].str.replace('email','')).astype(int)
df['ip_id'] = pd.to_numeric(df['ip_id'].str.replace('ip','')).astype(int)
df['card_id'] = pd.to_numeric(df['card_id'].str.replace('card','')).astype(int)

# Label the data
df['label'] = df['label'].apply(lambda x: '1' if x == 'Chargeback' else '0')

df.loc[df['shoppercountry'] == df['issuercountry'], 'home_country'] = 1
df.loc[df['shoppercountry'] != df['issuercountry'], 'home_country'] = 0


# Preprocess Data

In [3]:
print("Total amount:", len(df))
fraud = df[df['label']=='1']
number_of_fraudulent_cases = len(fraud)
print("Number of fraudulent cases:", number_of_fraudulent_cases)
benign = df[df['label']=='0']
number_of_benign_cases = len(benign)
print("Number of benign cases:", number_of_benign_cases)

Total amount: 236698
Number of fraudulent cases: 345
Number of benign cases: 236353


In [4]:
not_same_country_benign = benign[benign['home_country'] != 1]
not_same_country_fraud = fraud[fraud['home_country'] != 1]
print(len(not_same_country_benign) / len(benign))
print(len(not_same_country_fraud) / len(fraud))

0.02904976877805655
0.04057971014492753


In [5]:
converter = {
    'AUD': 0.702495,
    'GBP': 1.305505,
    'MXN': 0.05274,
    'NZD': 0.6632,
    'SEK': 0.104965
}


# Function that can take two input values (amount, currency) and convert it to USD (using current ratios, not historic ones)
def convert_to_usd(args):
    amount, currency = args
    return converter[currency] * amount / 100

# Create a new column containing the transaction amount in USD to be able to compare the transaction amounts.
df['usd_amount'] = df[['amount', 'currencycode']].apply(convert_to_usd, axis=1)

# Create new dataframe with average expense per customer
avg_expense = df.groupby('card_id')['usd_amount'].mean().reset_index().rename(columns={'usd_amount': 'avg_amount'})
# Merge this new dataframe with our parsed dataset to obtain a column with average amounts
df = pd.merge(df, avg_expense, on='card_id')
# We are actually mainly interested in the difference between 
# the average transaction amount of this customer and the current transaction amount.
df['dif_avg_amount'] = df['usd_amount'] - df['avg_amount']


# Number of transactions in this currency
number_transactions_currency = df.groupby('card_id', 'currencycode')['amount'].agg('count').reset_index()
df = pd.merge(df, avg_expense, on='card_id')
df['dif_ntc'] = df['usd_amount'] - df['avg_amount']
# Number of transactions in this country
# Average amount same merchant
# Number same merchant



SyntaxError: invalid syntax (<ipython-input-5-e8243c74e9b8>, line 28)

# Find interesting relationships in the data

In [None]:
# The raw amounts from the fraud/benign transactions per country were further 
# processed in libre office to produce the graphs contained in the report (due to bug in pandas for python < 3.7).
amount_fraud_per_country = fraud.groupby('shoppercountry')['amount'].agg(['count']).reset_index()
fraudulent_countries = list(amount_fraud_per_country['shoppercountry'])
amount_fraud_per_country.plot()

amount_benign_per_country = benign.groupby('shoppercountry')['amount'].agg(['count']).reset_index()
amount_benign_per_country = amount_benign_per_country[pd.DataFrame(amount_benign_per_country.shoppercountry.tolist()).isin(fraudulent_countries).any(1)]
amount_benign_per_country.plot()

In [None]:
# Calculate average transaction amounts.
avg_usd_amount_fraud = statistics.mean(fraud['usd_amount'])
avg_usd_amount_benign = statistics.mean(benign['usd_amount'])
print("Average amount in USD for the fraudulent cases:", avg_usd_amount_fraud)
print("Average amount in USD for the benign cases:", avg_usd_amount_benign)