In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Load the four competition CSV files. TransactionID is the key column, so it
# is used as the index of every frame for ease of use.
identity_train = pd.read_csv(
    '/kaggle/input/ieee-fraud-detection/train_identity.csv',
    index_col='TransactionID',
)
identity_test = pd.read_csv(
    '/kaggle/input/ieee-fraud-detection/test_identity.csv',
    index_col='TransactionID',
)
transaction_train = pd.read_csv(
    '/kaggle/input/ieee-fraud-detection/train_transaction.csv',
    index_col='TransactionID',
)
transaction_test = pd.read_csv(
    '/kaggle/input/ieee-fraud-detection/test_transaction.csv',
    index_col='TransactionID',
)

In this project we are trying to predict whether an online transaction is fraudulent or not. The IEEE Fraud Detection dataset is split into two kinds of files, 'identity' and 'transaction', which are joined on `TransactionID`. The data is also imbalanced: only about 3.5% of the transactions are fraudulent.

In [3]:
# Left-join the identity columns onto the transaction rows, matching on the
# index of both frames; every transaction row is kept.
train_set = pd.merge(
    transaction_train,
    identity_train,
    how='left',
    left_index=True,
    right_index=True,
)
test_set = pd.merge(
    transaction_test,
    identity_test,
    how='left',
    left_index=True,
    right_index=True,
)

In [4]:
import seaborn as sns
# Count plot of the isFraud classes; each bar is labelled with its share of
# all rows in train_set.
n_rows = train_set.shape[0]
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x='isFraud', data=train_set, ax=ax)
ax.set_title('Fraud Transaction', fontsize=18)
ax.set_xlabel('Is Fraud', fontsize=14)
ax.set_ylabel('Count', fontsize=14)
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    # Place the percentage label at the top of the bar, centred horizontally.
    ax.text(bar_centre,
            bar_height,
            f'{bar_height/n_rows * 100:.2f}%',
            ha='center', fontsize=12)

The TransactionDT feature is a timedelta from a given reference datetime (not an actual timestamp). The TransactionDT values of the test set are larger than those of the train set, which implies that the test set is later in time than the train set and that the two do not overlap; there is a gap of around one month between the training and test sets. The timespan of the whole dataset is around one year.

In [5]:
# Report the TransactionDT range of both splits and the overall span.
# Series.min()/.max() are vectorised (and skip missing values), whereas the
# builtin min()/max() walk the Series one element at a time in Python.
train_dt = train_set['TransactionDT']
test_dt = test_set['TransactionDT']
train_dt_min, train_dt_max = train_dt.min(), train_dt.max()
test_dt_min, test_dt_max = test_dt.min(), test_dt.max()

print("Minimum date of training set:", train_dt_min)
print("Maximum date of training set:", train_dt_max)
print("Minimum date of test set:", test_dt_min)
print("Maximum date of test set:", test_dt_max)
# The span is divided by 3600*24*365, i.e. TransactionDT is treated as a
# number of seconds and converted to 365-day years.
print("Timespan of dataset in years:", (test_dt_max - train_dt_min) / (3600 * 24 * 365))


In [6]:
# Overlaid histograms of TransactionDT for the train and test splits.
# sns.distplot is deprecated and no longer available in recent seaborn
# releases, so the counts are drawn directly with matplotlib. bins=50 and
# alpha=0.4 are chosen to resemble distplot's default look.
fig, ax = plt.subplots(figsize=(15, 5))
for split_name, split_df in (('train', train_set), ('test', test_set)):
    # Drop missing values first: ax.hist cannot derive a bin range from NaN.
    ax.hist(split_df['TransactionDT'].dropna(), bins=50, alpha=0.4, label=split_name)
ax.set_title('Training and Testset TransactionDT distribution')
ax.set_xlabel('TransactionDT')
ax.set_ylabel('Frequency', fontsize=14)
ax.legend();

The transaction amount is in USD. From the below plots, it can be seen that the fraudulent transaction amount is generally lower than the non-fraudulent transaction amounts.

In [7]:
# Sorted transaction amounts of fraudulent (left) and non-fraudulent (right)
# transactions, plotted against their rank.
fraud_amt = train_set.loc[train_set['isFraud'] == 1, 'TransactionAmt'].values
non_fraud_amt = train_set.loc[train_set['isFraud'] == 0, 'TransactionAmt'].values

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))
panels = (
    (ax[0], fraud_amt, "Transaction Amount-Fraud", 'red'),
    (ax[1], non_fraud_amt, "Transaction Amount-Non Fraud", None),
)
for panel_ax, amounts, panel_title, point_color in panels:
    # x and y must be passed by keyword: recent seaborn releases reject
    # positional data vectors. The x positions are simply 0..n-1, so they are
    # taken from the length of the already-filtered array instead of
    # filtering train_set a second time.
    sns.scatterplot(x=np.arange(len(amounts)),
                    y=np.sort(amounts),
                    alpha=0.3,
                    color=point_color,
                    ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("index")
    panel_ax.set_ylabel("Amount")
fig.tight_layout()


Considering the product code of all transactions, most transactions have product code 'W'. The fraud rate (the percentage of fraudulent transactions within a product code) is highest for code 'C', followed by 'S' and then 'H'.

In [8]:
# Fraud rate per product code: within each ProductCD group, the percentage of
# rows of each isFraud class, of which only the isFraud == 1 rows are plotted.
product_grouped = train_set.groupby('ProductCD')['isFraud'].value_counts(normalize=True)
product_grouped = product_grouped.mul(100).rename('Percent').reset_index()
fraud_products = product_grouped[product_grouped['isFraud'] == 1]

# Size the figure when it is created. Assigning plt.rcParams["figure.figsize"]
# after plt.bar() does not resize the figure that was just drawn; it only
# changes the default for figures created later in the session.
fig, ax = plt.subplots(figsize=(10, 10))
ax.bar(fraud_products['ProductCD'], fraud_products['Percent'])
ax.set_xlabel("Product Code")
ax.set_ylabel("Percentage");

In the dataset, the card4 column gives the category the card belongs to. 97.2% of the data belongs to either mastercard or visa. Of all the card types, discover has the highest fraud rate (percentage of fraudulent transactions), followed by mastercard and visa.

In [9]:
# Fraud rate per card4 category: within each card4 group, the percentage of
# rows of each isFraud class, of which only the isFraud == 1 rows are plotted.
card_grouped = train_set.groupby('card4')['isFraud'].value_counts(normalize=True)
card_grouped = card_grouped.mul(100).rename('Percent').reset_index()
fraud_cards = card_grouped[card_grouped['isFraud'] == 1]

# Size the figure when it is created. Assigning plt.rcParams["figure.figsize"]
# after plt.bar() does not resize the figure that was just drawn; it only
# changes the default for figures created later in the session.
fig, ax = plt.subplots(figsize=(10, 10))
ax.bar(fraud_cards['card4'], fraud_cards['Percent'])
ax.set_xlabel("Card 4 Category")
ax.set_ylabel("Percentage");

For the address columns 'addr1' and 'addr2': around 83% of the addr1 data falls in the 35 most frequent unique values, and the remaining values can be grouped into one category called "Others". Similarly, around 88% of the addr2 data is clustered in 5 unique values. In the code below, addr1 values that occur at most 5,000 times and addr2 values that occur at most 50 times are grouped into the category "Others". It can be seen that the fraud rate is higher in the low-frequency groups than in the high-frequency groups.

In [10]:
# For each address column, print the percentage of all rows (rows with a
# missing value count towards the total) covered by its 35 most frequent values.
for addr_col in ('addr1', 'addr2'):
    addr_values = train_set[addr_col]
    top_counts = addr_values.value_counts().head(35)
    print((top_counts / addr_values.shape[0] * 100).sum())

In [11]:
# Collapse infrequent address values into a single "Others" category:
# addr1 values seen at most 5000 times and addr2 values seen at most 50 times.
for addr_col, max_count in (('addr1', 5000), ('addr2', 50)):
    # Count once per column instead of calling value_counts() twice.
    value_counts = train_set[addr_col].value_counts()
    rare_values = value_counts[value_counts <= max_count].index
    is_rare = train_set[addr_col].isin(rare_values)
    # Cast to object before inserting the string label: writing a str into a
    # numeric column through .loc relies on silent upcasting, which recent
    # pandas versions deprecate. Missing values are left untouched.
    train_set[addr_col] = train_set[addr_col].astype(object).mask(is_rare, "Others")

In [12]:
# Fraud rate per addr1 value: within each addr1 group, the percentage of rows
# of each isFraud class, of which only the isFraud == 1 rows are plotted.
addr1_grouped = train_set.groupby('addr1')['isFraud'].value_counts(normalize=True)
addr1_grouped = addr1_grouped.mul(100).rename('Percent').reset_index()
fraud_addr1 = addr1_grouped[addr1_grouped['isFraud'] == 1]
# Convert the labels to strings so every addr1 value is drawn as its own
# categorical bar.
fraud_addr1 = fraud_addr1.astype({"addr1": 'str'})

# Size the figure when it is created. Assigning plt.rcParams["figure.figsize"]
# after plt.bar() does not resize the figure that was just drawn; it only
# changes the default for figures created later in the session.
fig, ax = plt.subplots(figsize=(10, 10))
ax.bar(fraud_addr1['addr1'], fraud_addr1['Percent'])
ax.tick_params(axis='x', labelrotation=45)
ax.set_xlabel("Addr 1")
ax.set_ylabel("Percentage Fraud");

In [13]:
def ploting_cnt_amt(df, col, lim=2000):
    """Plot the row count of each `col` category with its fraud rate overlaid.

    Bars (left axis) show how many rows of `df` fall in each category of
    `col`; each bar is annotated with its share of all rows in `df`. A black
    point plot on a twin right axis shows, per category, the percentage of
    rows whose ``isFraud`` value is 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing `col` and an ``isFraud`` column with 0/1 labels.
    col : str
        Name of the categorical column to summarise.
    lim : int, optional
        Not used by this function; kept so existing calls that pass it
        continue to work.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Row-normalised crosstab: per category, the percentage of each class.
    fraud_pct = (
        pd.crosstab(df[col], df['isFraud'], normalize='index')
        .mul(100)
        .reset_index()
        .rename(columns={0: 'NoFraud', 1: 'Fraud'})
    )
    categories = list(fraud_pct[col].values)
    total = len(df)

    # A single axes on a 16x7 canvas. The previous version reserved a 2x1 grid
    # on a 16x14 figure (subplot 211) but only ever drew the top panel, leaving
    # the lower half of the figure blank.
    fig, g = plt.subplots(figsize=(16, 7))
    fig.suptitle(f'{col} Distributions ', fontsize=24)

    sns.countplot(x=col, data=df, order=categories, ax=g)

    # Draw the fraud rate on an explicit twin axis rather than relying on
    # pyplot's "current axes" happening to be the twin after twinx().
    gt = g.twinx()
    sns.pointplot(x=col, y='Fraud', data=fraud_pct, order=categories,
                  color='black', ax=gt)
    gt.set_ylim(0, fraud_pct['Fraud'].max() * 1.1)
    gt.set_ylabel("%Fraud Transactions", fontsize=16)

    g.set_title(f"Most Frequent {col} values and % Fraud Transactions", fontsize=20)
    g.set_xlabel(f"{col} Category Names", fontsize=16)
    g.set_ylabel("Count", fontsize=17)
    g.tick_params(axis='x', labelrotation=45)

    sizes = []
    for p in g.patches:
        height = p.get_height()
        sizes.append(height)
        # Label each bar with its share of all rows in df.
        g.text(p.get_x() + p.get_width() / 2.,
               height + 3,
               '{:1.2f}%'.format(height / total * 100),
               ha="center", fontsize=12)

    # Leave headroom above the tallest bar for its label; skip when no bar was
    # drawn so max() is never called on an empty list.
    if sizes:
        g.set_ylim(0, max(sizes) * 1.15)
    plt.show()

In [14]:
# Category counts and fraud rate for the P_emaildomain column.
ploting_cnt_amt(df=train_set, col='P_emaildomain')

In [15]:
# Fraud rate per addr2 value: within each addr2 group, the percentage of rows
# of each isFraud class, of which only the isFraud == 1 rows are plotted.
addr2_grouped = train_set.groupby('addr2')['isFraud'].value_counts(normalize=True)
addr2_grouped = addr2_grouped.mul(100).rename('Percent').reset_index()
fraud_addr2 = addr2_grouped[addr2_grouped['isFraud'] == 1]
# Convert the labels to strings so every addr2 value is drawn as its own
# categorical bar.
fraud_addr2 = fraud_addr2.astype({"addr2": 'str'})

# Size the figure when it is created. Assigning plt.rcParams["figure.figsize"]
# after plt.bar() does not resize the figure that was just drawn; it only
# changes the default for figures created later in the session.
fig, ax = plt.subplots(figsize=(10, 10))
ax.bar(fraud_addr2['addr2'], fraud_addr2['Percent'])
ax.tick_params(axis='x', labelrotation=45)
ax.set_xlabel("Addr 2")
ax.set_ylabel("Percentage Fraud");

In [16]:
# Fraud rate per purchaser email domain: within each P_emaildomain group, the
# percentage of rows of each isFraud class; only isFraud == 1 is plotted.
pemail_grouped = train_set.groupby('P_emaildomain')['isFraud'].value_counts(normalize=True)
pemail_grouped = pemail_grouped.mul(100).rename('Percent').reset_index()
fraud_pemail = pemail_grouped[pemail_grouped['isFraud'] == 1]

# Size the figure when it is created. The previous
# plt.rcParams["figure.figsize"] = (200,10) assignment came after plt.bar(),
# so it never widened this chart and instead made later figures 200 inches
# wide. A wide canvas with vertical tick labels keeps the domain names legible.
fig, ax = plt.subplots(figsize=(20, 8))
ax.bar(fraud_pemail['P_emaildomain'], fraud_pemail['Percent'])
ax.tick_params(axis='x', labelrotation=90)
ax.set_xlabel("P_emaildomain")
ax.set_ylabel("Percentage Fraud");

In [17]:
# Fraud rate per recipient email domain: within each R_emaildomain group, the
# percentage of rows of each isFraud class; only isFraud == 1 is plotted.
remail_grouped = train_set.groupby('R_emaildomain')['isFraud'].value_counts(normalize=True)
remail_grouped = remail_grouped.mul(100).rename('Percent').reset_index()
fraud_remail = remail_grouped[remail_grouped['isFraud'] == 1]

# Size the figure when it is created. The previous
# plt.rcParams["figure.figsize"] = (200,10) assignment came after plt.bar(),
# so it did not size this chart and instead changed the default for later
# figures. A wide canvas with vertical tick labels keeps the names legible.
fig, ax = plt.subplots(figsize=(20, 8))
ax.bar(fraud_remail['R_emaildomain'], fraud_remail['Percent'])
ax.tick_params(axis='x', labelrotation=90)
ax.set_xlabel("R_emaildomain")
ax.set_ylabel("Percentage Fraud");

In [None]:
# Correlation heatmap of the columns at positions 16-29 of train_set
# (presumably the C1..C14 count features - verify against train_set.columns).
# The previous version passed the raw rows to sns.heatmap without calling
# .corr(), so it tried to draw and annotate one cell per row and value
# instead of a small square correlation matrix.
# select_dtypes keeps .corr() working should a non-numeric column fall in
# the slice.
corr = train_set.iloc[:, 16:30].select_dtypes(include='number').corr()
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(corr, linewidths=.5, annot=True, fmt='.2f', ax=ax)
ax.set_title('Correlation of train_set columns 16-29');