# Fraud Detection Models

This notebook aims to predict fraudulent purchases on an e-commerce platform using both supervised and unsupervised machine learning models. The data describes the devices and IP addresses used in each purchase, along with customer demographics, for both fraudulent and legitimate purchases.

In [1]:
# importing necessary libraries and modules

import numpy as np
import pandas as pd
import itertools

from scipy import special
from datetime import timedelta
from IPython.display import display

In [2]:
# loading the purchase records and the IP-range-to-country lookup table
fraud_df = pd.read_csv('Fraud_Data.csv', header=0)
ISP_df = pd.read_csv('IpAddress_to_Country.csv', header=0)

In [3]:
# inspecting the dtype pandas inferred for each column of the fraud dataset
fraud_df.dtypes

id                  int64
cadastro           object
compra             object
valor               int64
id_dispositivo     object
fonte              object
browser            object
genero             object
idade               int64
ip                float64
fraude              int64
dtype: object

In [4]:
# counting missing values per column of the fraud dataset
for col, n_missing in fraud_df.isnull().sum().items():
    print('Null values in {}: {}'.format(col, n_missing))

Null values in id: 0
Null values in cadastro: 0
Null values in compra: 0
Null values in valor: 0
Null values in id_dispositivo: 0
Null values in fonte: 0
Null values in browser: 0
Null values in genero: 0
Null values in idade: 0
Null values in ip: 0
Null values in fraude: 0


In [5]:
# summary statistics for the numeric columns of the fraud dataset
fraud_df.describe()

Unnamed: 0,id,valor,idade,ip,fraude
count,151112.0,151112.0,151112.0,151112.0,151112.0
mean,200171.04097,36.935372,33.140704,2152145000.0,0.093646
std,115369.285024,18.322762,8.617733,1248497000.0,0.291336
min,2.0,9.0,18.0,52093.5,0.0
25%,100642.5,22.0,27.0,1085934000.0,0.0
50%,199958.0,35.0,33.0,2154770000.0,0.0
75%,300054.0,49.0,39.0,3243258000.0,0.0
max,400000.0,154.0,76.0,4294850000.0,1.0


In [6]:
# inspecting the dtype pandas inferred for each column of the IP dataset
ISP_df.dtypes

limite_inferior_ip    float64
limite_superior_ip      int64
pais                   object
dtype: object

In [7]:
# previewing the first rows of the IP dataset
ISP_df.head()

Unnamed: 0,limite_inferior_ip,limite_superior_ip,pais
0,16777216.0,16777471,Australia
1,16777472.0,16777727,China
2,16777728.0,16778239,China
3,16778240.0,16779263,Australia
4,16779264.0,16781311,China


In [8]:
# the lower IP limit was read as float64 while the upper limit is int64;
# counting lower limits that carry a fractional part to see whether the
# float dtype hides any real decimals
ip_floats = (ISP_df["limite_inferior_ip"] % 1 != 0).sum()
print('Non-zero decimals in lower IP limits: {}'.format(ip_floats))

Non-zero decimals in lower IP limits: 0


In [9]:
# casting the upper IP limit to float64 so both limits (and the float64
# `ip` column of the fraud dataset) are compared with the same dtype
ISP_df["limite_superior_ip"] = ISP_df["limite_superior_ip"].astype(np.float64)
ISP_df.dtypes

limite_inferior_ip    float64
limite_superior_ip    float64
pais                   object
dtype: object

In [10]:
# counting missing values per column of the IP dataset
for col, n_missing in ISP_df.isnull().sum().items():
    print('Null values in {}: {}'.format(col, n_missing))

Null values in limite_inferior_ip: 0
Null values in limite_superior_ip: 0
Null values in pais: 0


In [11]:
# summary statistics for the IP dataset
ISP_df.describe()

Unnamed: 0,limite_inferior_ip,limite_superior_ip
count,138846.0,138846.0
mean,2724532000.0,2724557000.0
std,897521500.0,897497900.0
min,16777220.0,16777470.0
25%,1919930000.0,1920008000.0
50%,3230887000.0,3230888000.0
75%,3350465000.0,3350466000.0
max,3758096000.0,3758096000.0


In [12]:
# describe() prints the same rounded maximum for the lower and upper limits;
# showing the row(s) that hold each maximum to check whether that is merely
# a display issue
display(
    ISP_df.loc[ISP_df["limite_inferior_ip"].eq(ISP_df["limite_inferior_ip"].max())],
    ISP_df.loc[ISP_df["limite_superior_ip"].eq(ISP_df["limite_superior_ip"].max())],
)

Unnamed: 0,limite_inferior_ip,limite_superior_ip,pais
138845,3758096000.0,3758096000.0,Australia


Unnamed: 0,limite_inferior_ip,limite_superior_ip,pais
138845,3758096000.0,3758096000.0,Australia


In [None]:
# defining which country an IP belongs to
def ip_to_country(ip, ip_ranges=None):
    """Return the country whose IP range contains ``ip``.

    Parameters
    ----------
    ip : float
        Numeric IP address to look up.
    ip_ranges : pandas.DataFrame, optional
        Lookup table with ``limite_inferior_ip``, ``limite_superior_ip`` and
        ``pais`` columns. When omitted, the notebook-level ``ISP_df`` is used,
        so the function can still be passed directly to ``Series.apply``.

    Returns
    -------
    object or None
        The ``pais`` value of the single range whose inclusive bounds contain
        ``ip``. ``None`` when no range matches or when more than one range
        matches (the lookup is then ambiguous).
    """
    if ip_ranges is None:
        # default keeps the original one-argument call working
        ip_ranges = ISP_df

    # both bounds are inclusive
    in_range = (
        (ip_ranges["limite_inferior_ip"] <= ip)
        & (ip_ranges["limite_superior_ip"] >= ip)
    )
    matches = ip_ranges.loc[in_range, "pais"]

    # only an unambiguous hit counts; zero or several hits are reported as missing
    if len(matches) == 1:
        return matches.iloc[0]
    return None

# tagging every purchase with the country resolved from its IP address
fraud_df["pais"] = fraud_df["ip"].apply(ip_to_country)
fraud_df.head()

In [None]:
# persisting the fraud dataset, now including the `pais` column, without the index
fraud_df.to_csv("fraud_data_country.csv", index=False)

# Exploratory Data Analysis

In [None]:
# importing necessary libraries

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# recreating the fraud dataset with country data from the saved CSV
# NOTE(review): purchases whose IP matched no single range got `None` from
# ip_to_country; presumably those come back as NaN in `pais` after the CSV
# round-trip -- confirm before modeling.

fraud_df = pd.read_csv("fraud_data_country.csv", header=0)
fraud_df.head()