In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Reference**

* Introduction

 1. https://www.kaggle.com/robikscube/ieee-fraud-detection-first-look-and-eda
 2. https://www.kaggle.com/haataa/complete-eda-with-background-knowledge
 
 
* EDA

 1. https://www.kaggle.com/robikscube/ieee-fraud-detection-first-look-and-eda
 2. https://www.kaggle.com/artgor/eda-and-models 
 3. https://www.kaggle.com/nroman/eda-for-cis-fraud-detection
 4. https://www.kaggle.com/jesucristo/fraud-complete-eda
 5. https://www.kaggle.com/kabure/extensive-eda-and-modeling-xgb-hyperopt
 6. https://www.kaggle.com/rajeshcv/understanding-v-columns

# **Introduction**

* **Why We Should Care About Payment Fraud?**

Payment card fraud is a serious and long-term threat to society with an economic impact forecast to be $416bn in 2017.

Besides financial losses, it has been identified that criminal enterprises and Organised Crime Groups (OCGs) use payment card fraud to fund their activities including arms, drugs and terrorism. The activities of these criminals include violence and murder--individual acts of fraud have a human cost.

Fraud is increasing dramatically with the progression of modern technology and global communication. As a result, fighting fraud has become an important issue to be explored. As presented in the following figure, the detection and prevention mechanisms are used mostly to combat fraud.



* **What is Fraud Detection ?**

Fraud detection tries to discover and identify fraudulent activities as they enter the systems and report them to a system administrator


* **Credit card fraud detection**

Mostly, the strategy of credit card fraud detection is pattern recognition by analyzing user spending behavior automatically. Customer spending behavior contains information about the transaction amount, time gap since last purchase, day of the week, item category, customer address, etc. Anomaly based fraud detection is mostly used for credit card fraud detection system in which the cardholder's profile is made up by analyzing the cardholder spending behavior pattern. In doing so, any incoming transaction that is inconsistent with the cardholder's profile would be considered as suspicious


* **From the competition overview**

In this competition, you’ll benchmark machine learning models on a challenging large-scale dataset. The data comes from Vesta's real-world e-commerce transactions and contains a wide range of features from device type to product features. You also have the opportunity to create new features to improve your results.

# **EDA for CIS Fraud Detection**

* **Data loading and overview**


In the competition you are predicting the probability that an online transaction is fraudulent, as denoted by the binary target isFraud.

Data is separated into two datasets: information about the identity of the customer and transaction information. Not all transactions belong to identities, which are available. Maybe it would be possible to use additional transactions to generate new features.

1. **Import necessary libraries for EDA**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
import warnings
warnings.simplefilter("ignore")
plt.style.use('ggplot')
color_pal = [x['color'] for x in plt.rcParams['axes.prop_cycle']]

2. **Data load**

In [None]:
train_transaction = pd.read_csv('../input/ieee-fraud-detection/train_transaction.csv')
test_transaction = pd.read_csv('../input/ieee-fraud-detection/test_transaction.csv')
train_identity = pd.read_csv('../input/ieee-fraud-detection/train_identity.csv')
test_identity = pd.read_csv('../input/ieee-fraud-detection/test_identity.csv')

In [None]:
train_transaction.shape, test_transaction.shape, train_identity.shape,  test_identity.shape

In [None]:
train_transaction.head()

In [None]:
test_transaction.head()

In [None]:
train_transaction.info(), test_transaction.info()

Test data and train data have **393 columns** excluding isFraud data. Based on the TransactionDT, the test data is assumed to have been created after the train data.

In [None]:
train_identity.head()

In [None]:
test_identity.head()

Identity CSVs - These will be merged onto the transactions to create additional features

In [None]:
train_transaction.isnull().sum()

In [None]:
train_identity.isnull().sum()

In [None]:
test_transaction.isnull().sum()

In [None]:
test_identity.isnull().sum()

All datasets have missing values, which are interpreted as common in the real world.

In [None]:
train_transaction['TransactionID'].value_counts()

In [None]:
train_identity['TransactionID'].value_counts()

In [None]:
test_transaction['TransactionID'].value_counts()

In [None]:
# Fourth and last dataset in this series of ID checks: test_identity.
# (This cell previously repeated the train_identity check.)
test_identity['TransactionID'].value_counts()

**Reference**

isin : https://3months.tistory.com/283 \
unique, value_counts : https://rfriend.tistory.com/267

In [None]:
# Number of transactions that have a matching identity row (train first, then test).
train_identity_ids = train_identity['TransactionID'].unique()
test_identity_ids = test_identity['TransactionID'].unique()
print(train_transaction['TransactionID'].isin(train_identity_ids).sum())
print(test_transaction['TransactionID'].isin(test_identity_ids).sum())

24.4% of TransactionIDs in train (144233 / 590540) have an associated train_identity.  \
28.0% of TransactionIDs in test (141907 / 506691) have an associated test_identity.

In [None]:
train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')
test = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')

In [None]:
train.head()

In [None]:
test.head()

In [None]:
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode

# Class counts of the target, computed once and reused for both bar axes.
fraud_counts = train_transaction['isFraud'].value_counts()
x = fraud_counts.index
y = fraud_counts.values

trace2 = go.Bar(
    x=x,
    y=y,
    marker={'color': 'blue', 'colorscale': 'Viridis', 'reversescale': True},
    name="Imbalance",
)
layout = {
    'title': "Data imbalance - isFraud",
    'width': 600,
    'height': 400,
    'xaxis': go.layout.XAxis(automargin=True),
    'yaxis': {'showgrid': False, 'showline': False, 'showticklabels': True},
}
fig1 = go.Figure(data=[trace2], layout=layout)
iplot(fig1)

As you can see, most transactions are non-fraudulent: fraud accounts for only 3.5% of the data, so the classes are heavily imbalanced. Therefore, attention should be paid to overfitting problems during the analysis process.

* **Transaction DT**

According to the official data description, the TransactionDT feature is a timedelta from a given reference datetime, not an actual timestamp. Most people estimate the start time as December 1, 2017.

In [None]:
import datetime

# TransactionDT holds second offsets from a reference datetime; 2017-12-01 is the
# assumed origin used throughout this notebook.
startdate = datetime.datetime.strptime('2017-12-01', '%Y-%m-%d')

# Vectorised conversion: one timedelta array per frame instead of a Python-level
# lambda call per row.
train['TransactionDT'] = pd.to_timedelta(train['TransactionDT'], unit='s') + startdate
test['TransactionDT'] = pd.to_timedelta(test['TransactionDT'], unit='s') + startdate

**Reference**

set_index : https://kongdols-room.tistory.com/123 \
resample : https://rfriend.tistory.com/494

In [None]:
fig, axes = plt.subplots(1, 1, figsize=(16, 6))
# Select the target column *before* resampling: averaging every column of the merged
# frame is wasteful, and the frame contains string columns that cannot be averaged.
daily_fraud_rate = train.set_index('TransactionDT')['isFraud'].resample('D').mean()
daily_fraud_rate.plot(ax=axes, color='blue').set_ylabel('isFraud mean', fontsize=14);
axes.set_title('Mean of isFraud by day', fontsize=16);

In [None]:
fig, axes = plt.subplots(1, 1, figsize=(16, 6))
train['TransactionDT'].dt.floor('d').value_counts().sort_index().plot(ax=axes, color='blue').set_xlabel('Date', fontsize=14);
test['TransactionDT'].dt.floor('d').value_counts().sort_index().plot(ax=axes, color='tab:orange').set_ylabel('Number of training examples', fontsize=14);
axes.set_title('Number of training examples by day', fontsize=16);
axes.legend(['Train', 'Test']);

In [None]:
fig, ax1 = plt.subplots(figsize=(16, 6))

# Left axis: daily fraud rate. Only the target column is resampled, because the merged
# frame contains string columns that cannot be averaged.
daily_fraud_rate = train.set_index('TransactionDT')['isFraud'].resample('D').mean()
daily_fraud_rate.plot(ax=ax1, color='blue')
ax1.tick_params(axis='y', labelcolor='blue')
ax1.set_ylabel('isFraud mean', color='blue', fontsize=14)

# Right axis: number of training rows per day.
ax2 = ax1.twinx()
train['TransactionDT'].dt.floor('d').value_counts().sort_index().plot(ax=ax2, color='tab:orange');
ax2.tick_params(axis='y', labelcolor='tab:orange');
ax2.set_ylabel('Number of training examples', color='tab:orange', fontsize=14);
ax2.grid(False)

In [None]:
# Train (blue) and test (orange) transaction amounts over time on one shared axes.
ax = train.plot(x='TransactionDT',
                y='TransactionAmt',
                kind='scatter',
                alpha=0.01,
                label='TransactionAmt-train',
                title='Train and test Transaction Ammounts by Time (TransactionDT)',
                color='blue',
                ylim=(0, 5000),
                figsize=(15, 5))
test.plot(x='TransactionDT',
          y='TransactionAmt',
          kind='scatter',
          label='TransactionAmt-test',
          alpha=0.01,
          color='tab:orange',
          ylim=(0, 5000),
          ax=ax)

# Fraudulent training rows overlaid in yellow. The mask is taken from `train` itself
# (not from a separate frame), and the series gets its own legend label instead of
# repeating 'TransactionAmt-train'.
train.loc[train['isFraud'] == 1] \
    .plot(x='TransactionDT',
          y='TransactionAmt',
          kind='scatter',
          alpha=0.01,
          label='TransactionAmt-train-fraud',
          title='Train and test Transaction Ammounts by Time (TransactionDT)',
          ylim=(0, 5000),
          color='yellow',
          figsize=(15, 5),
          ax=ax)
plt.show()

* **TransactionAmt**

Data representing the amount of each transaction. To reduce the skewness of the distribution, it is also plotted after a log transformation. Because of the log transform, any values between 0 and 1 will appear as negative.

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(18,6))

time_val = train['TransactionAmt'].values
log_val = np.log(time_val)  # computed once, reused for the plot and its limits

# Left panel: raw amounts. The x-limits belong to ax[0]; they were previously applied
# to ax[1] and then overwritten, so the left panel never received them.
sns.distplot(time_val, ax=ax[0], color='blue')
ax[0].set_title('Distribution of TransactionAmt', fontsize=14)
ax[0].set_xlim([time_val.min(), time_val.max()])

# Right panel: log-transformed amounts.
sns.distplot(log_val, ax=ax[1], color='tab:orange')
ax[1].set_title('Distribution of LOG TransactionAmt', fontsize=14)
ax[1].set_xlim([log_val.min(), log_val.max()])

plt.show()

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(18,6))

# Left panel: fraudulent transactions. The x-limits are applied to ax[0]; they were
# previously applied to ax[1], leaving the left panel with default limits.
time_val = train_transaction.loc[train_transaction['isFraud'] == 1]['TransactionAmt'].values
log_val = np.log(time_val)

sns.distplot(log_val, ax=ax[0], color='blue')
ax[0].set_title('Distribution of LOG TransactionAmt, isFraud=1', fontsize=14)
ax[0].set_xlim([log_val.min(), log_val.max()])

# Right panel: non-fraudulent transactions.
time_val = train_transaction.loc[train_transaction['isFraud'] == 0]['TransactionAmt'].values
log_val = np.log(time_val)

sns.distplot(log_val, ax=ax[1], color='tab:orange')
ax[1].set_title('Distribution of LOG TransactionAmt, isFraud=0', fontsize=14)
ax[1].set_xlim([log_val.min(), log_val.max()])


plt.show()

Fraudulent charges appear to have a higher average transaction amount.

* **ProductCD**

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(20,5))

sns.countplot(x="ProductCD", ax=ax[0], hue = "isFraud", data=train)
ax[0].set_title('ProductCD train', fontsize=14)
sns.countplot(x="ProductCD", ax=ax[1], data=test)
ax[1].set_title('ProductCD test', fontsize=14)
plt.show()

In [None]:
def calcul(val):
    """Return the rounded percentage of fraudulent rows for one ProductCD value.

    Reads the notebook-level `train` frame. The result is a Series indexed by
    `val` holding (fraud rows / all rows) * 100 for that product code, rounded
    to the nearest integer.
    """
    is_product = train['ProductCD'] == val
    is_product_fraud = is_product & (train['isFraud'] == 1)
    all_counts = train.loc[is_product, 'ProductCD'].value_counts()
    fraud_counts = train.loc[is_product_fraud, 'ProductCD'].value_counts()
    return np.around(fraud_counts / all_counts * 100)

In [None]:
calcul('W')

In [None]:
calcul('C')

For now we don't know exactly what these values represent.

W has the most number of observations, S the least.

ProductCD C has the most fraud with >12%

ProductCD W has the least with ~2%

* **Card**

**card1**

In [None]:
train['card1'].isnull().sum()

Missing values do not exist in this data.

In [None]:
train['card1'].nunique()

In [None]:
plt.figure(figsize=(14, 6))
# Density of card1 for fraudulent vs. non-fraudulent rows.
sns.kdeplot(train.loc[train['isFraud'] == 1, 'card1'], label='isFraud 1', color='blue')
sns.kdeplot(train.loc[train['isFraud'] == 0, 'card1'], label='isFraud 0', color='tab:orange')
# Request the legend explicitly so the `label=` entries are shown regardless of
# whether the installed seaborn version draws one automatically.
plt.legend();

As you can see, Card1 column is given as Categorical but it is behaving like Continuous Data. It has 13553 unique values.

**card2**

In [None]:
train['card2'].isnull().sum()

In [None]:
train['card2'].nunique()

In [None]:
plt.figure(figsize=(14, 6))
# Density of card2 for fraudulent vs. non-fraudulent rows. Missing values are dropped
# explicitly so the estimate does not depend on how the installed seaborn version
# treats NaN.
sns.kdeplot(train.loc[train['isFraud'] == 1, 'card2'].dropna(), label='isFraud 1', color='blue')
sns.kdeplot(train.loc[train['isFraud'] == 0, 'card2'].dropna(), label='isFraud 0', color='tab:orange')
# Request the legend explicitly so the `label=` entries are shown.
plt.legend();

**card3**

In [None]:
train['card3'].isnull().sum()

In [None]:
train['card3'].nunique()

In [None]:
plt.figure(figsize=(14, 6))
# Density of card3 for fraudulent vs. non-fraudulent rows. Missing values are dropped
# explicitly so the estimate does not depend on how the installed seaborn version
# treats NaN.
sns.kdeplot(train.loc[train['isFraud'] == 1, 'card3'].dropna(), label='isFraud 1', color='blue')
sns.kdeplot(train.loc[train['isFraud'] == 0, 'card3'].dropna(), label='isFraud 0', color='tab:orange')
# Request the legend explicitly so the `label=` entries are shown.
plt.legend();

In [None]:
# Collapse infrequent card3 / card5 values into a single "Others" bucket.
# The columns are cast to object first: writing the string "Others" into a numeric
# column relies on implicit upcasting, which pandas has deprecated.
for card_col, min_count in (('card3', 200), ('card5', 300)):
    train[card_col] = train[card_col].astype(object)
    card_counts = train[card_col].value_counts()
    rare_values = card_counts.index[card_counts < min_count]
    train.loc[train[card_col].isin(rare_values), card_col] = "Others"

in Card 3, as we have many values with low frequencies, I decided to set value to "Others". Also, in Card 3 I set the % of Fraud ratio in yaxis2


**Reference**

crosstab : https://twinstarinfo.blogspot.com/2018/10/python-pandascrosstab.html \
twinx : https://www.delftstack.com/ko/howto/matplotlib/how-to-add-y-axis-label-to-secondary-y-axis-in-matplotlib/

In [None]:
# Per-category fraud percentage for card3 (each row of the crosstab sums to 100).
tmp = pd.crosstab(train['card3'], train['isFraud'], normalize='index') * 100
tmp = tmp.reset_index()
tmp.rename(columns={0:'NoFraud', 1:'Fraud'}, inplace=True)
total = len(train_transaction)
card3_order = list(tmp.card3.values)

plt.figure(figsize=(18, 6))
g2 = sns.countplot(x = 'card3', data=train, order=card3_order)
g22 = g2.twinx()
# Draw the fraud-rate line on the twin axes explicitly rather than relying on the
# twin happening to be matplotlib's "current" axes.
gg2 = sns.pointplot(x='card3', y='Fraud', data=tmp,
                    color='black', order=card3_order, ax=g22)
gg2.set_ylabel("% of Fraud Transactions", fontsize=16)
g2.set_title("Card 3 Values Distribution and % of Transaction Frauds", fontsize=20)
g2.set_xlabel("Card 3 Values", fontsize=18)
g2.set_ylabel("Count", fontsize=18)
# Annotate every bar with its share of all transactions.
for p in g2.patches:
    height = p.get_height()
    g2.text(p.get_x()+p.get_width()/2.,
            height + 25,
            '{:1.2f}%'.format(height/total*100),
            ha="center")
plt.show()

In Card3 we can see that 150 and 185 are the most common values in the column.
We have 9.54% of Frauds in 185. The values with highest Fraud Transactions are 185, 119 and 144.

**card4**

In [None]:
train['card4'].isnull().sum()

In [None]:
train['card4'].nunique()

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(25,10))

# Use one category order for both panels so the bars line up and can be compared
# side by side; otherwise each subset may order the categories differently.
card4_order = list(train_transaction['card4'].value_counts().index)

sns.countplot(x="card4", ax=ax[0], order=card4_order,
              data=train_transaction.loc[train_transaction['isFraud'] == 0])
ax[0].set_title('card4 isFraud=0', fontsize=14)
sns.countplot(x="card4", ax=ax[1], order=card4_order,
              data=train_transaction.loc[train_transaction['isFraud'] == 1])
ax[1].set_title('card4 isFraud=1', fontsize=14);

The most fraudulent transactions were detected on the visa card, with the least American express.


**card5**

In [None]:
train['card5'].isnull().sum()

In [None]:
train['card5'].nunique()

In [None]:
# Per-category fraud percentage for card5 (each row of the crosstab sums to 100).
tmp2 = pd.crosstab(train['card5'], train['isFraud'], normalize='index') * 100
tmp2 = tmp2.reset_index()
tmp2.rename(columns={0:'NoFraud', 1:'Fraud'}, inplace=True)
total = len(train_transaction)
card5_order = list(tmp2.card5.values)

plt.figure(figsize=(18, 6))
g3 = sns.countplot(x='card5', data=train, order=card5_order)
g3t = g3.twinx()
# Draw the fraud-rate line on the twin axes explicitly rather than relying on the
# twin happening to be matplotlib's "current" axes.
g3t = sns.pointplot(x='card5', y='Fraud', data=tmp2,
                    color='black', order=card5_order, ax=g3t)
g3t.set_ylabel("% of Fraud Transactions", fontsize=16)
g3.set_title("Card 5 Values Distribution and % of Transaction Frauds", fontsize=20)
g3.set_xticklabels(g3.get_xticklabels(),rotation=90)
g3.set_xlabel("Card 5 Values", fontsize=18)
g3.set_ylabel("Count", fontsize=18)
# Annotate every bar with its share of all transactions.
for p in g3.patches:
    height = p.get_height()
    g3.text(p.get_x()+p.get_width()/2.,
            height + 3,
            '{:1.2f}%'.format(height/total*100),
            ha="center",fontsize=11)

plt.subplots_adjust(hspace = 0.6, top = 0.85)


plt.show()

In Card5 the most frequent values are 226, 224 and 166, which together represent 73% of the data. It is also possible to see a high % of frauds in 137, 147 and 141, which have few entries.

In [None]:
train['card6'].isnull().sum()

In [None]:
train['card6'].nunique()

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(25,10))

# Use one category order for both panels so the bars line up and can be compared
# side by side; otherwise each subset may order the categories differently.
card6_order = list(train_transaction['card6'].value_counts().index)

sns.countplot(x="card6", ax=ax[0], order=card6_order,
              data=train_transaction.loc[train_transaction['isFraud'] == 0])
ax[0].set_title('card6 isFraud=0', fontsize=14)
sns.countplot(x="card6", ax=ax[1], order=card6_order,
              data=train_transaction.loc[train_transaction['isFraud'] == 1])
ax[1].set_title('card6 isFraud=1', fontsize=14);

There were no fraudulent transaction data except debit and credit cards, and debit cards had more normal transactions than credit cards, but there were also more fraudulent transactions.

* **addr**

According to the name of the feature, we can assume that it contains some kind of user address, but in an encoded form. The data description states that these are categorical even though they look numeric.

In [None]:
addr_cols = [a for a in train.columns if 'addr' in a]
train[addr_cols].head()

In [None]:
# One 50-bin histogram per address column, each shown as its own figure.
for addr_name in ('addr1', 'addr2'):
    train[addr_name].plot(kind='hist',
                          bins=50,
                          figsize=(15, 2),
                          title=addr_name,
                          color='blue')
    plt.show()

In [None]:
# Collapse infrequent addr1 / addr2 values into a single "Others" bucket.
# The columns are cast to object first: writing the string "Others" into a numeric
# column relies on implicit upcasting, which pandas has deprecated.
for addr_col, max_rare_count in (('addr1', 5000), ('addr2', 50)):
    train[addr_col] = train[addr_col].astype(object)
    addr_counts = train[addr_col].value_counts()
    rare_values = addr_counts.index[addr_counts <= max_rare_count]
    train.loc[train[addr_col].isin(rare_values), addr_col] = "Others"

In [None]:
total_amt = train.groupby(['isFraud'])['TransactionAmt'].sum().sum()

def ploting_cnt_amt(df, col, lim=2000):
    """Plot counts and transaction totals per category of `col`, with fraud rates overlaid.

    Top panel: number of rows per category (bars, each annotated with its share of
    all rows in `df`) plus the percentage of fraudulent rows per category on a
    secondary y-axis.
    Bottom panel: summed `TransactionAmt` per category (bars, each annotated with its
    share of the total amount in `df`) plus the percentage of each category's amount
    that belongs to fraudulent rows on a secondary y-axis.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `col`, 'isFraud' and 'TransactionAmt'.
    col : str
        Name of the categorical column to summarise.
    lim : int, optional
        Unused; kept so existing calls stay valid.

    Returns
    -------
    None
        The figure is displayed with `plt.show()`.
    """
    # Denominators for the bar annotations are taken from `df` itself, so the function
    # does not depend on notebook-level globals having been defined by earlier cells.
    total = len(df)
    total_amt = df['TransactionAmt'].sum()

    # Fraud / non-fraud percentage per category (each row sums to 100).
    tmp = pd.crosstab(df[col], df['isFraud'], normalize='index') * 100
    tmp = tmp.reset_index()
    tmp.rename(columns={0:'NoFraud', 1:'Fraud'}, inplace=True)
    order = list(tmp[col].values)

    plt.figure(figsize=(16,14))
    plt.suptitle(f'{col} Distributions ', fontsize=24)

    # ---- top panel: counts + % fraud transactions ----
    plt.subplot(211)
    g = sns.countplot(x=col, data=df, order=order)
    gt = g.twinx()
    # Target the twin axes explicitly instead of relying on the current axes.
    sns.pointplot(x=col, y='Fraud', data=tmp, order=order,
                  color='black', legend=False, ax=gt)
    gt.set_ylim(0,tmp['Fraud'].max()*1.1)
    gt.set_ylabel("%Fraud Transactions", fontsize=16)
    g.set_title(f"Most Frequent {col} values and % Fraud Transactions", fontsize=20)
    g.set_xlabel(f"{col} Category Names", fontsize=16)
    g.set_ylabel("Count", fontsize=17)
    g.set_xticklabels(g.get_xticklabels(),rotation=45)
    sizes = []
    for p in g.patches:
        height = p.get_height()
        sizes.append(height)
        g.text(p.get_x()+p.get_width()/2.,
               height + 3,
               '{:1.2f}%'.format(height/total*100),
               ha="center",fontsize=12)

    # Leave headroom above the tallest bar for its annotation.
    g.set_ylim(0,max(sizes)*1.15)

    # ---- bottom panel: total amount + % of amount that is fraudulent ----
    perc_amt = (df.groupby(['isFraud',col])['TransactionAmt'].sum() \
                / df.groupby([col])['TransactionAmt'].sum() * 100).unstack('isFraud')
    perc_amt = perc_amt.reset_index()
    perc_amt.rename(columns={0:'NoFraud', 1:'Fraud'}, inplace=True)
    amt = df.groupby([col])['TransactionAmt'].sum().reset_index()
    # Categories with no fraud (or no non-fraud) amount come out as NaN after unstack.
    perc_amt = perc_amt.fillna(0)
    plt.subplot(212)
    g1 = sns.barplot(x=col, y='TransactionAmt',
                     data=amt,
                     order=order)
    g1t = g1.twinx()
    sns.pointplot(x=col, y='Fraud', data=perc_amt,
                  order=order,
                  color='black', legend=False, ax=g1t)
    g1t.set_ylim(0,perc_amt['Fraud'].max()*1.1)
    g1t.set_ylabel("%Fraud Total Amount", fontsize=16)
    g1.set_title(f"{col} by Transactions Total + %of total and %Fraud Transactions", fontsize=20)
    g1.set_xlabel(f"{col} Category Names", fontsize=16)
    g1.set_ylabel("Transaction Total Amount(U$)", fontsize=16)
    # Rotate this panel's own tick labels (previously copied from the top panel).
    g1.set_xticklabels(g1.get_xticklabels(),rotation=45)

    for p in g1.patches:
        height = p.get_height()
        g1.text(p.get_x()+p.get_width()/2.,
                height + 3,
                '{:1.2f}%'.format(height/total_amt*100),
                ha="center",fontsize=12)

    plt.subplots_adjust(hspace=.4, top = 0.9)
    plt.show()
    
ploting_cnt_amt(train, 'addr1')

In [None]:
ploting_cnt_amt(train, 'addr2')

Almost all entries in Addr2 have the same value.
Interestingly, for the value 65 the percentage of frauds is almost 60%.
Although the value 87 has 88% of total entries, it has 96% of Total Transaction Amounts.

* **dist**

Perhaps this could be the distance between the cardholder's home/work address and the transaction.

In [None]:
dist_cols = [d for d in train.columns if 'dist' in d]
train[dist_cols].head()

In [None]:
train['dist1'].isna().sum(), train['dist2'].isna().sum()

It can be seen that dist1-2 contains many missing values.


In [None]:
# One log-x histogram per distance column, each shown as its own figure.
for dist_name in ('dist1', 'dist2'):
    train[dist_name].plot(kind='hist',
                          bins=5000,
                          figsize=(15, 2),
                          title=f'{dist_name} distribution',
                          color='blue',
                          logx=True)
    plt.show()


Use logx to plot the distribution better.

* **emaildomain**

**P emaildomain**

I will group all e-mail domains by the respective enterprises.
Also, I will set as "Others" all values with less than 500 entries.

In [None]:
# Map individual purchaser e-mail domains onto the company that operates them.
p_email_groups = {
    'Google': ['gmail.com', 'gmail'],
    'Yahoo Mail': ['yahoo.com', 'yahoo.com.mx', 'yahoo.co.uk',
                   'yahoo.co.jp', 'yahoo.de', 'yahoo.fr',
                   'yahoo.es'],
    'Microsoft': ['hotmail.com', 'outlook.com', 'msn.com', 'live.com.mx',
                  'hotmail.es', 'hotmail.co.uk', 'hotmail.de',
                  'outlook.es', 'live.com', 'live.fr',
                  'hotmail.fr'],
}
for provider, domains in p_email_groups.items():
    train.loc[train['P_emaildomain'].isin(domains), 'P_emaildomain'] = provider

# Everything that still occurs 500 times or fewer is collapsed into "Others".
p_email_counts = train['P_emaildomain'].value_counts()
rare_p_domains = p_email_counts.index[p_email_counts <= 500]
train.loc[train['P_emaildomain'].isin(rare_p_domains), 'P_emaildomain'] = "Others"

# Assign the filled column back: an in-place fillna on `train.P_emaildomain` is a
# chained update that is not guaranteed to modify `train` (it does not under
# copy-on-write).
train['P_emaildomain'] = train['P_emaildomain'].fillna("NoInf")

In [None]:
ploting_cnt_amt(train, 'P_emaildomain')

**Distributions**

I will group all e-mail domains by the respective enterprises.
I will set as "Others" all values with less than 300 entries.

In [None]:
# Map individual recipient e-mail domains onto the company that operates them.
r_email_groups = {
    'Google': ['gmail.com', 'gmail'],
    'Yahoo Mail': ['yahoo.com', 'yahoo.com.mx', 'yahoo.co.uk',
                   'yahoo.co.jp', 'yahoo.de', 'yahoo.fr',
                   'yahoo.es'],
    'Microsoft': ['hotmail.com', 'outlook.com', 'msn.com', 'live.com.mx',
                  'hotmail.es', 'hotmail.co.uk', 'hotmail.de',
                  'outlook.es', 'live.com', 'live.fr',
                  'hotmail.fr'],
}
for provider, domains in r_email_groups.items():
    train.loc[train['R_emaildomain'].isin(domains), 'R_emaildomain'] = provider

# Everything that still occurs 300 times or fewer is collapsed into "Others".
r_email_counts = train['R_emaildomain'].value_counts()
rare_r_domains = r_email_counts.index[r_email_counts <= 300]
train.loc[train['R_emaildomain'].isin(rare_r_domains), 'R_emaildomain'] = "Others"

# Assign the filled column back: an in-place fillna on `train.R_emaildomain` is a
# chained update that is not guaranteed to modify `train` (it does not under
# copy-on-write).
train['R_emaildomain'] = train['R_emaildomain'].fillna("NoInf")

In [None]:
ploting_cnt_amt(train, 'R_emaildomain')

The most transactions were made through gmail, and fraudulent transaction detection was also the most common in gmail. What's unusual is that iCloud has a high value.

* **C1~C14**

In [None]:
from scipy import stats

def resumetable(df):
    """Build a one-row-per-column summary of `df`.

    Prints the shape of `df` and returns a DataFrame with, for every column:
    its name, dtype, number of missing values, number of unique values, the
    values found in the first three rows, and the base-2 Shannon entropy of its
    value distribution (rounded to 2 decimals).

    The first three rows are taken by position, so frames whose index does not
    start at 0 (for example after filtering) are handled. If `df` has fewer than
    three rows, the corresponding "... Value" columns are filled with NaN.
    """
    print(f"Dataset Shape: {df.shape}")
    summary = pd.DataFrame({'Name': df.columns, 'dtypes': df.dtypes.values})
    summary['Missing'] = df.isnull().sum().values
    summary['Uniques'] = df.nunique().values

    # Positional access (iloc), not label access: row labels 0/1/2 need not exist.
    for position, label in enumerate(['First Value', 'Second Value', 'Third Value']):
        summary[label] = df.iloc[position].values if len(df) > position else np.nan

    # value_counts(normalize=True) yields the empirical probabilities (NaN excluded).
    summary['Entropy'] = [
        round(stats.entropy(df[name].value_counts(normalize=True), base=2), 2)
        for name in summary['Name']
    ]

    return summary

In [None]:
c_cols = [c for c in train if c[0] == 'C']
train[c_cols].head()

In [None]:
resumetable(train[c_cols])

In [None]:
train[c_cols].describe()

In [None]:
# Collapse C1 values that occur 400 times or fewer into "Others". The column is cast
# to object first: writing a string into a numeric column relies on implicit
# upcasting, which pandas has deprecated.
train['C1'] = train['C1'].astype(object)
c1_counts = train['C1'].value_counts()
train.loc[train['C1'].isin(c1_counts.index[c1_counts <= 400]), 'C1'] = "Others"

In [None]:
ploting_cnt_amt(train, 'C1')

In [None]:
# Collapse C2 values that occur 350 times or fewer into "Others". The column is cast
# to object first: writing a string into a numeric column relies on implicit
# upcasting, which pandas has deprecated.
train['C2'] = train['C2'].astype(object)
c2_counts = train['C2'].value_counts()
train.loc[train['C2'].isin(c2_counts.index[c2_counts <= 350]), 'C2'] = "Others"

In [None]:
ploting_cnt_amt(train, 'C2')

In [None]:
# Collapse C4 values that occur 400 times or fewer into "Others". The column is cast
# to object first: writing a string into a numeric column relies on implicit
# upcasting, which pandas has deprecated.
train['C4'] = train['C4'].astype(object)
c4_counts = train['C4'].value_counts()
train.loc[train['C4'].isin(c4_counts.index[c4_counts <= 400]), 'C4'] = "Others"

In [None]:
ploting_cnt_amt(train, 'C4')

* **D1~D5**

In [None]:
d_cols = [c for c in train if (c[0] == 'D') and (c[1] != 'e')]
train[d_cols].head()

In [None]:
resumetable(train[d_cols])

In [None]:
train[d_cols].describe()

In [None]:
train['D1'].value_counts()

In [None]:
# Collapse D1 values that occur 2000 times or fewer into "Others". The column is cast
# to object first: writing a string into a numeric column relies on implicit
# upcasting, which pandas has deprecated.
train['D1'] = train['D1'].astype(object)
d1_counts = train['D1'].value_counts()
train.loc[train['D1'].isin(d1_counts.index[d1_counts <= 2000]), 'D1'] = "Others"

In [None]:
ploting_cnt_amt(train, 'D1')

In [None]:
train.D2.value_counts()

In [None]:
# Collapse D2 values that occur 2000 times or fewer into "Others". The column is cast
# to object first: writing a string into a numeric column relies on implicit
# upcasting, which pandas has deprecated.
train['D2'] = train['D2'].astype(object)
d2_counts = train['D2'].value_counts()
train.loc[train['D2'].isin(d2_counts.index[d2_counts <= 2000]), 'D2'] = "Others"

In [None]:
ploting_cnt_amt(train, 'D2')

In [None]:
train.D8.value_counts()

In [None]:
# Collapse D8 values that occur 250 times or fewer into "Others". The column is cast
# to object first: writing a string into a numeric column relies on implicit
# upcasting, which pandas has deprecated.
train['D8'] = train['D8'].astype(object)
d8_counts = train['D8'].value_counts()
train.loc[train['D8'].isin(d8_counts.index[d8_counts <= 250]), 'D8'] = "Others"

In [None]:
ploting_cnt_amt(train, 'D8')

In [None]:
train['D8'].unique()

* **M1~M9**

In [None]:
m_cols = [c for c in train if c[0] == 'M']
train[m_cols].head()

In [None]:
resumetable(train[m_cols])

In [None]:
train[m_cols].describe()

It is the value of T, F, or NaN except for M4.

In [None]:
train.M1.value_counts()

In [None]:
train.M4.value_counts()

**Reference**

subplots_adjust : https://www.delftstack.com/ko/howto/matplotlib/how-to-improve-subplot-size-or-spacing-with-many-subplots-in-matplotlib/

In [None]:
# Turn missing values in every M column into the explicit category "Miss" so they
# show up as their own bar in the plots produced further down.
for col in m_cols:
    train[col] = train[col].fillna("Miss")
    
def ploting_dist_ratio(df, col, lim=2000):
    """Plot category counts and transaction-amount distributions for `col`, with fraud rates.

    Left panel: number of rows per category (bars, each annotated with its share of
    all rows in `df`) plus the percentage of fraudulent rows per category on a
    secondary y-axis.
    Right panel: box plots of `TransactionAmt` per category, split by `isFraud` and
    restricted to rows with `TransactionAmt <= lim`, plus the fraudulent amount of
    each category as a percentage of the total amount in `df` on a secondary y-axis.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `col`, 'isFraud' and 'TransactionAmt'.
    col : str
        Name of the categorical column to summarise.
    lim : int, optional
        Upper bound on `TransactionAmt` for rows shown in the box plots.

    Returns
    -------
    None
        The figure is displayed with `plt.show()`.
    """
    # Denominators come from `df` itself rather than from notebook-level globals.
    total = len(df)
    total_amt = df['TransactionAmt'].sum()

    # Fraud / non-fraud percentage per category (each row sums to 100).
    tmp = pd.crosstab(df[col], df['isFraud'], normalize='index') * 100
    tmp = tmp.reset_index()
    tmp.rename(columns={0:'NoFraud', 1:'Fraud'}, inplace=True)
    order = list(tmp[col].values)

    plt.figure(figsize=(20,5))
    plt.suptitle(f'{col} Distributions ', fontsize=22)

    # ---- left panel: counts + % fraud transactions ----
    plt.subplot(121)
    g = sns.countplot(x=col, data=df, order=order)
    g.set_title(f"{col} Distribution\nCound and %Fraud by each category", fontsize=18)
    g.set_ylim(0,400000)
    gt = g.twinx()
    # Target the twin axes explicitly instead of relying on the current axes.
    sns.pointplot(x=col, y='Fraud', data=tmp, order=order,
                  color='black', legend=False, ax=gt)
    gt.set_ylim(0,20)
    gt.set_ylabel("% of Fraud Transactions", fontsize=16)
    g.set_xlabel(f"{col} Category Names", fontsize=16)
    g.set_ylabel("Count", fontsize=17)
    # The bars live on `g`; the twin axes `gt` holds only the point plot and has no
    # patches, so iterating over `gt.patches` never drew any label.
    for p in g.patches:
        height = p.get_height()
        g.text(p.get_x()+p.get_width()/2.,
               height + 3,
               '{:1.2f}%'.format(height/total*100),
               ha="center",fontsize=14)

    # Amount per (isFraud, category) as a percentage of the total amount, computed
    # from the frame that was passed in (not from the global `train`).
    perc_amt = (df.groupby(['isFraud',col])['TransactionAmt'].sum() / total_amt * 100).unstack('isFraud')
    perc_amt = perc_amt.reset_index()
    perc_amt.rename(columns={0:'NoFraud', 1:'Fraud'}, inplace=True)

    # ---- right panel: amount distribution + % fraud amount ----
    plt.subplot(122)
    g1 = sns.boxplot(x=col, y='TransactionAmt', hue='isFraud',
                     data=df[df['TransactionAmt'] <= lim], order=order)
    g1t = g1.twinx()
    sns.pointplot(x=col, y='Fraud', data=perc_amt, order=order,
                  color='black', legend=False, ax=g1t)
    g1t.set_ylim(0,5)
    g1t.set_ylabel("%Fraud Total Amount", fontsize=16)
    g1.set_title(f"{col} by Transactions dist", fontsize=18)
    g1.set_xlabel(f"{col} Category Names", fontsize=16)
    g1.set_ylabel("Transaction Amount(U$)", fontsize=16)

    plt.subplots_adjust(hspace=.4, wspace = 0.35, top = 0.80)

    plt.show()

Since many NaN values exist, we visualize them by replacing them with Miss.

In [None]:
for col in m_cols:
    ploting_dist_ratio(train, col, lim=2500)

Except for M4, the remaining M values show a high fraud detection rate in missing values.

* **V1~V339**

Each of the 339 V columns has a low importance and is usually eliminated. To make V columns more useful, understand this column.

In [None]:
v_cols = [c for c in train if c[0] == 'V']
train[v_cols].head()

In [None]:
resumetable(train[v_cols])

In [None]:
train[v_cols].describe()

In [None]:
len(train.isFraud[train.isFraud==1])/len(train)

In [None]:
# Helper functions
# 1. For calculating % na values in  columns
def percent_na(df):
    """Return the percentage of missing values per column of `df`.

    The result has one row per column of `df`, with the column name in
    'column_groups' and the percentage (0-100) of null entries in
    'percent_missing'.
    """
    na_share = df.isnull().sum() * 100 / len(df)
    return pd.DataFrame({
        'column_groups': na_share.index,
        'percent_missing': na_share.values,
    })
# 2. For plotting grouped histograms 
def sephist(col):
    """Split column `col` of `train_transaction` by the fraud label.

    Returns a tuple `(yes, no)`: the values of `col` for rows with isFraud == 1
    and for rows with isFraud == 0, respectively.
    """
    fraud_flag = train_transaction['isFraud']
    yes = train_transaction.loc[fraud_flag == 1, col]
    no = train_transaction.loc[fraud_flag == 0, col]
    return yes, no

The first grouping is based on the percentage of missing values in the columns . The columns can be divided into 15 groups as below.

In [None]:
pd.options.display.max_colwidth = 300

# V columns of the training transactions and their percentage of missing values.
Vcols = train_transaction.columns[train_transaction.columns.str.startswith('V')]
train_transaction_vcol_na = percent_na(train_transaction[Vcols])

# Group the columns that share exactly the same missing percentage.
train_transaction_vcol_na_group = (
    train_transaction_vcol_na
    .groupby('percent_missing')['column_groups']
    .unique()
    .reset_index()
)

# Number of columns in every group.
num_values_per = [len(group_cols) for group_cols in train_transaction_vcol_na_group['column_groups']]
train_transaction_vcol_na_group['num_columns_group'] = num_values_per
train_transaction_vcol_na_group

In [None]:
pd.options.display.max_colwidth = 300

# V columns of the test transactions and their percentage of missing values.
Vcols = test_transaction.columns[test_transaction.columns.str.startswith('V')]
test_transaction_vcol_na = percent_na(test_transaction[Vcols])

# Group the columns that share exactly the same missing percentage.
test_transaction_vcol_na_group = (
    test_transaction_vcol_na
    .groupby('percent_missing')['column_groups']
    .unique()
    .reset_index()
)

# Number of columns in every group.
num_values_per = [len(group_cols) for group_cols in test_transaction_vcol_na_group['column_groups']]
test_transaction_vcol_na_group['num_columns_group'] = num_values_per
test_transaction_vcol_na_group

Let's take a look at how each V column creates 96.5% of the data in a non-fraud transaction ratio.

In [None]:
def column_value_freq(sel_col,cum_per):
    """Count, per column, how many distinct values make up the bulk of the data.

    Parameters
    ----------
    sel_col : list-like of str
        Names of ``train_transaction`` columns to summarise.
    cum_per : float
        Cumulative share threshold between 0 and 1 (for example 0.965).

    Returns
    -------
    pandas.DataFrame
        One row per column with ``col_name``, ``num_values_<cum_per>``,
        ``unique_values``, ``unique_value_to_num_values<cum_per>_ratio``
        (in percent) and ``percent_missing``.

    Notes
    -----
    Values are ranked by relative frequency. The count is the number of values
    whose running cumulative share is strictly below ``cum_per``. When fewer
    than two values qualify (one value dominates the column), the count is
    instead the number of values whose individual share exceeds
    ``1 - cum_per``.
    """
    num_values_col = 'num_values_' + str(round(cum_per, 2))
    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and appending row by row is quadratic anyway.
    records = []
    for col in sel_col:
        per_count = train_transaction[col].value_counts(normalize=True)
        n_below = int((per_count.cumsum() < cum_per).sum())
        if n_below < 2:
            num_col_99 = int((per_count > (1 - cum_per)).sum())
        else:
            num_col_99 = n_below
        records.append({'col_name': col, num_values_col: num_col_99})
    # Passing columns= keeps the expected layout even when sel_col is empty.
    dfpercount = pd.DataFrame(records, columns=['col_name', num_values_col])
    dfpercount['unique_values'] = train_transaction[sel_col].nunique().values
    dfpercount['unique_value_to_num_values'+str(round(cum_per,2))+'_ratio'] = 100 * (dfpercount[num_values_col]/dfpercount.unique_values)
    dfpercount['percent_missing'] = percent_na(train_transaction[sel_col])['percent_missing'].round(3).values
    return dfpercount

def column_value_details(sel_col,cum_per):
    """List, per column, the frequent values and the remaining (rare) values.

    Parameters
    ----------
    sel_col : iterable of str
        Names of ``train_transaction`` columns to inspect.
    cum_per : float
        Cumulative share threshold between 0 and 1 (for example 0.965).

    Returns
    -------
    pandas.DataFrame
        One row per column with ``col_name``, ``values_<cum_per>`` (list of
        frequent values), ``values_<1 - cum_per>`` (list of all other values)
        and ``num_values_per`` (length of the frequent list).

    Notes
    -----
    Frequent values are those whose running cumulative share, ranked by
    relative frequency, is strictly below ``cum_per``. When fewer than two
    values qualify, the frequent values are instead those whose individual
    share exceeds ``1 - cum_per``.
    """
    freq_col = 'values_' + str(round(cum_per, 2))
    rare_col = 'values_' + str(round(1 - cum_per, 2))
    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and appending row by row is quadratic anyway.
    records = []
    for col in sel_col:
        per_count = train_transaction[col].value_counts(normalize=True)
        below = (per_count.cumsum() < cum_per).to_numpy()
        if below.sum() < 2:
            values_freq = per_count.index[(per_count > (1 - cum_per)).to_numpy()].tolist()
        else:
            values_freq = per_count.index[below].tolist()
        # Set lookup keeps the split linear in the number of distinct values.
        frequent = set(values_freq)
        values_less_freq = [item for item in per_count.index if item not in frequent]
        records.append({'col_name': col, freq_col: values_freq, rare_col: values_less_freq})
    # Passing columns= keeps the expected layout even when sel_col is empty.
    dfpercount = pd.DataFrame(records, columns=['col_name', freq_col, rare_col])
    dfpercount['num_values_per'] = [len(values) for values in dfpercount[freq_col]]
    return dfpercount

In [None]:
def vcol_multiplot(col,cum_per,ax1):
    """Draw, on ``ax1``, a grouped bar chart of value counts for a set of columns.

    For every column name in ``col`` the chart shows its number of unique
    values next to the ``num_values_<cum_per>`` figure returned by
    ``column_value_freq``. Non-zero bars are labelled with their height and
    the title reports the mean missing percentage of the plotted columns.
    """
    stats = column_value_freq(col, cum_per)
    frequent_label = 'num_values_' + str(round(cum_per, 2))
    axes = stats.plot(x='col_name', y=['unique_values', frequent_label],
                      kind='bar', rot=90, ax=ax1)
    # NOTE(review): the very first bar is skipped and so never gets a label;
    # confirm whether that is intended.
    for bar in axes.patches[1:]:
        height = bar.get_height()
        if height == 0:
            continue
        centre = bar.get_x() + bar.get_width() / 2.
        axes.annotate("%g" % bar.get_height(), xy=(centre, height), xytext=(0, 4),
                      rotation=90, textcoords="offset points", ha="center", va="bottom")
    axes.set(ylabel='Count')
    axes.set(title='Data Details  in each V columns with ' + str(round(stats.percent_missing.mean(),4)) +'% missing values')
    
def vcol_plot(col,cum_per):
    """Stand-alone variant of ``vcol_multiplot`` that draws on a new figure.

    ``col`` is the collection of column names to chart and ``cum_per`` the
    cumulative share threshold; both are forwarded unchanged.
    """
    # The body used to be a line-for-line copy of vcol_multiplot without the
    # ax argument. Passing ax=None is the DataFrame.plot default, so pandas
    # still creates the axes itself and the chart is unchanged.
    vcol_multiplot(col, cum_per, None)

In [None]:
# Threshold used by all the value-frequency plots that follow.
cum_per = 0.965
fig, axs = plt.subplots(2, 1, figsize=(15, 16), facecolor='w', edgecolor='k', squeeze=False)
axs = axs.ravel()
# One panel per missing-percentage group: groups 0 and 1.
for group_idx, panel in enumerate(axs):
    vcol_multiplot(train_transaction_vcol_na_group.column_groups[group_idx], cum_per, panel)

In [None]:
fig, axs = plt.subplots(4, 2, figsize=(15, 16), facecolor='w', edgecolor='k', squeeze=False)
axs = axs.ravel()
# Missing-percentage groups 2 through 9, one panel each.
for panel_idx, group_idx in enumerate(range(2, 10)):
    vcol_multiplot(train_transaction_vcol_na_group.column_groups[group_idx], cum_per, axs[panel_idx])
plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)

In [None]:
fig, axs = plt.subplots(5, 1, figsize=(15, 16), facecolor='w', edgecolor='k', squeeze=False)
axs = axs.ravel()
# Missing-percentage groups 10 through 14, one panel each.
for panel_idx, group_idx in enumerate(range(10, 15)):
    vcol_multiplot(train_transaction_vcol_na_group.column_groups[group_idx], cum_per, axs[panel_idx])
plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)

Based on the data distribution columns can be divided into 5 types.

 1. **Boolean** - columns with only two unique values

 2. **Pseudo- Boolean** - columns with 96.5% data covered by maximum two unique values. Within this there are two types.
 
    Pseudo-Boolean-categorical - Columns with 15 or less unique values but 96.5% data covered by  maximum two unique values\
    Pseudo-Boolean-numerical - Columns with more than 15 unique values but 96.5% data covered by  maximum two unique values
    
 3. **Pseudo-Categorical** - Columns with 96.5% data covered by 15 or less unique values

 4. **Numerical** - All Other columns

**Boolean Columns**

In [None]:
# Value-frequency summary for all V columns; reused by the following sections.
colfreq = column_value_freq(Vcols, cum_per)
# Boolean columns: exactly two unique values.
colfreqbool = colfreq[colfreq.unique_values == 2]
# Three plots per row. Integer ceiling division keeps nrow an int; the former
# len(...)/3 branch produced a float, which plt.subplot does not accept.
nrow = (len(colfreqbool) + 2) // 3
sns.set(rc={'figure.figsize':(14,16)})
for num, alpha in enumerate(colfreqbool.col_name):
    plt.subplot(nrow, 3, num+1)
    plot1 = sns.countplot(data=train_transaction, x=alpha, hue='isFraud')
    # NOTE(review): the first bar is skipped and never labelled; confirm
    # whether that is intended.
    for p in plot1.patches[1:]:
        h = p.get_height()
        x = p.get_x()+p.get_width()/2.
        if h != 0:
            plot1.annotate("%g" % p.get_height(), xy=(x,h), xytext=(0,4), rotation=90,
                   textcoords="offset points", ha="center", va="bottom")
    plt.legend(title='isFraud',loc='upper right')
plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)

With the exception of V305, these columns take the values 0 and 1, and most of the values are 1.

**Pseudo Booleans**

In [None]:
def cum_value_count(col):
    """Tabulate the relative and cumulative frequency of each value in a column.

    Reads ``train_transaction[col]`` and returns a DataFrame with the columns
    ``value``, ``per_count`` (relative frequency) and ``cum_per_count``
    (running total of ``per_count``), in ``value_counts`` order.
    """
    shares = train_transaction[col].value_counts(normalize=True)
    table = pd.DataFrame({'value': shares.index, 'per_count': shares.values})
    return table.assign(cum_per_count=table['per_count'].cumsum())

In [None]:
def V_doublecat_plot(cols,cum_per,limit):
    """Draw paired count plots, split by ``isFraud``, for selected columns.

    Only the columns of ``cols`` whose frequent-value list (from
    ``column_value_details``) has at most ``limit`` entries are plotted. Each
    column gets one row of two panels: the left one counts rows holding its
    frequent values, the right one counts rows holding the remaining values.
    """
    frequent_key = 'values_' + str(round(cum_per, 2))
    rare_key = 'values_' + str(round(1 - cum_per, 2))
    details = column_value_details(cols, cum_per)
    V_cat = details[details['num_values_per'] <= limit].reset_index()
    n_rows = len(V_cat)
    sns.set(rc={'figure.figsize': (14, n_rows * 2)})
    for row, column in enumerate(V_cat.col_name):
        left_panel = 2 * row + 1
        # Left panel: rows whose value is one of the frequent values.
        plt.subplot(n_rows, 2, left_panel)
        frequent_rows = train_transaction[train_transaction[column].isin(V_cat[frequent_key][row])]
        sns.countplot(data=frequent_rows, y=column, hue='isFraud')
        plt.legend(loc='lower right')
        plt.title('Count of unique values which make '+str(round(cum_per*100,3))+'% of data in column ' + str(column) )
        # Right panel: rows whose value is one of the remaining values.
        plt.subplot(n_rows, 2, left_panel + 1)
        rare_rows = train_transaction[train_transaction[column].isin(V_cat[rare_key][row])]
        sns.countplot(data=rare_rows, y=column, hue='isFraud')
        plt.legend(loc='lower right')
        plt.title('Count of unique values which make only '+str(round((1-cum_per)*100,3))+'% of data in column ' + str(column) )
    plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)

In [None]:
def V_cat_plot(cols,cum_per,limit):
    """Plot frequent values as counts and the remaining values as histograms.

    Only the columns of ``cols`` whose frequent-value list (from
    ``column_value_details``) has at most ``limit`` entries are plotted. Each
    column gets one row of two panels: a count plot of its frequent values
    split by ``isFraud``, and overlaid fraud / non-fraud histograms of its
    remaining values.
    """
    frequent_key = 'values_' + str(round(cum_per, 2))
    rare_key = 'values_' + str(round(1 - cum_per, 2))
    details = column_value_details(cols, cum_per)
    V_cat = details[details['num_values_per'] <= limit].reset_index()
    n_rows = len(V_cat)
    sns.set(rc={'figure.figsize': (14, n_rows * 2)})
    for row, column in enumerate(V_cat.col_name):
        left_panel = 2 * row + 1
        # Left panel: count plot of the frequent values.
        plt.subplot(n_rows, 2, left_panel)
        frequent_rows = train_transaction[train_transaction[column].isin(V_cat[frequent_key][row])]
        sns.countplot(data=frequent_rows, y=column, hue='isFraud')
        plt.legend(loc='lower right')
        plt.title('Count of unique values which make '+str(round(cum_per*100,3))+'% of data in column ' + str(column) )
        # Right panel: histograms of the remaining values, by fraud label.
        plt.subplot(n_rows, 2, left_panel + 1)
        rare_values = V_cat[rare_key][row]
        yes = train_transaction.loc[(train_transaction['isFraud'] == 1) & (train_transaction[column].isin(rare_values)), column]
        no = train_transaction.loc[(train_transaction['isFraud'] == 0) & (train_transaction[column].isin(rare_values)), column]
        plt.hist(yes, alpha=0.75, label='Fraud', color='r')
        plt.hist(no, alpha=0.25, label='Not Fraud', color='g')
        plt.legend(loc='upper right')
        plt.title('Histogram of values which make '+str(round((1-cum_per)*100,3))+'% of data in column ' + str(column) )
    plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)

In [None]:
def V_num_plot(cols,cum_per,limit):
    """Plot fraud / non-fraud histograms for columns with many frequent values.

    Only the columns of ``cols`` whose frequent-value list (from
    ``column_value_details``) has more than ``limit`` entries are plotted.
    Each column gets one row of two panels: histograms of its frequent values
    on the left and of its remaining values on the right, each overlaying the
    fraud and non-fraud rows.
    """
    frequent_key = 'values_' + str(round(cum_per, 2))
    rare_key = 'values_' + str(round(1 - cum_per, 2))
    details = column_value_details(cols, cum_per)
    V_num = details[details['num_values_per'] > limit].reset_index()
    n_rows = len(V_num)
    sns.set(rc={'figure.figsize': (14, n_rows * 2)})
    is_fraud = train_transaction['isFraud'] == 1
    not_fraud = train_transaction['isFraud'] == 0
    for row, column in enumerate(V_num.col_name):
        left_panel = 2 * row + 1
        # Left panel: histograms restricted to the frequent values.
        plt.subplot(n_rows, 2, left_panel)
        in_frequent = train_transaction[column].isin(V_num[frequent_key][row])
        yes = train_transaction.loc[is_fraud & in_frequent, column]
        no = train_transaction.loc[not_fraud & in_frequent, column]
        plt.hist(yes, alpha=0.75, label='Fraud', color='r')
        plt.hist(no, alpha=0.25, label='Not Fraud', color='g')
        plt.legend(loc='upper right')
        plt.title('Histogram of  values which make '+str(round(cum_per*100,3))+'% of data in column ' + str(column) )
        # Right panel: histograms restricted to the remaining values.
        plt.subplot(n_rows, 2, left_panel + 1)
        in_rare = train_transaction[column].isin(V_num[rare_key][row])
        yes = train_transaction.loc[is_fraud & in_rare, column]
        no = train_transaction.loc[not_fraud & in_rare, column]
        plt.hist(yes, alpha=0.75, label='Fraud', color='r')
        plt.hist(no, alpha=0.25, label='Not Fraud', color='g')
        plt.legend(loc='upper right')
        plt.title('Histogram of values which make '+str(round((1-cum_per)*100,3))+'% of data in column ' + str(column) )
    plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)


In [None]:
# Pseudo-boolean: not exactly two unique values, yet at most two values are
# counted as frequent at the cum_per threshold.
num_values_col = 'num_values_' + str(round(cum_per, 2))
not_two_valued = colfreq.unique_values != 2
at_most_two_frequent = colfreq[num_values_col] <= 2
colfreqpseudobool = colfreq[not_two_valued & at_most_two_frequent]

In [None]:
# Pseudo-boolean columns with 15 or fewer unique values.
few_unique = colfreqpseudobool.unique_values <= 15
pseudoboolcat = colfreqpseudobool.loc[few_unique, 'col_name'].values
V_doublecat_plot(pseudoboolcat, cum_per, 15)

It can be seen that some unique values appear only in the remaining 3.5% of the data.

In [None]:
# Pseudo-boolean columns with more than 15 unique values.
many_unique = colfreqpseudobool.unique_values > 15
pseudoboolnum = colfreqpseudobool.loc[many_unique, 'col_name'].values


In [None]:
# Plot the pseudo-boolean columns that have more than 15 unique values:
# counts of the frequent values next to histograms of the remaining values.
V_cat_plot(pseudoboolnum,cum_per,15)


The histograms of the values that make up less than 3.5% of the column data show a higher proportion of fraud transactions.

In [None]:
# Columns with 15 or fewer unique values where more than two values are
# counted as frequent at the cum_per threshold.
num_values_col = 'num_values_' + str(round(cum_per, 2))
few_unique = colfreq.unique_values <= 15
more_than_two_frequent = colfreq[num_values_col] > 2
colfreqcat = colfreq[few_unique & more_than_two_frequent]
colfreqcat

In [None]:
# Pseudo-categorical: more than 15 unique values, but between 3 and 15 values
# are counted as frequent at the cum_per threshold.
num_values_col = 'num_values_' + str(round(cum_per, 2))
many_unique = colfreq.unique_values > 15
frequent_count = colfreq[num_values_col]
colfreqpseudocat = colfreq[many_unique & (frequent_count <= 15) & (frequent_count > 2)]

In [None]:
# Plot the pseudo-categorical columns: counts of the frequent values next to
# histograms of the remaining values.
V_cat_plot(colfreqpseudocat.col_name,cum_per,15)

In some of these columns a higher proportion of fraud cases are seen for values which form less than 3.5% of the column data

In [None]:
# Numerical: more than 15 values are counted as frequent at the cum_per threshold.
num_values_col = 'num_values_' + str(round(cum_per, 2))
colfreqnum = colfreq[colfreq[num_values_col] > 15]

In [None]:
# Plot fraud / non-fraud histograms for the numerical columns.
V_num_plot(colfreqnum.col_name,cum_per,15)

It looks like the Pseudo-Boolean and Pseudo-Categorical columns are important: in both types there is a higher proportion of fraud cases among the values that make up less than 3.5% of the column data.

* **id_1 ~ id_38**

id data including customer ID information.

  * Categorical Features
  * DeviceType
  * DeviceInfo
  * id_12 - id_38

**DeviceType**

In [None]:
# Select every column whose name begins with the letter 'i'.
# NOTE(review): this also matches 'isFraud', not only the id_* columns;
# confirm that is intended.
id_cols = [name for name in train.columns if name[0] == 'i']
train[id_cols].head()

In [None]:
# Summary statistics for the selected columns; include='all' also covers the
# non-numeric ones.
train[id_cols].describe(include = 'all')

In [None]:
# Mean of isFraud per device type, i.e. the fraud rate of each type.
# isFraud is selected before aggregating: averaging every column of the frame
# is wasted work and raises on non-numeric columns in pandas >= 2.0.
(
    train.groupby('DeviceType')['isFraud']
    .mean()
    .sort_values()
    .plot(kind='barh',
          figsize=(15, 5),
          title='Percentage of Fraud by Device Type')
)
plt.show()

In [None]:
# Number of non-null TransactionID entries per DeviceInfo value, 20 largest.
device_counts = train.groupby('DeviceInfo').count()['TransactionID']
top_devices = device_counts.sort_values(ascending=False).head(20)
top_devices.plot(kind='barh', figsize=(15, 5), title='Top 20 Devices in Train')
plt.show()

Plotting the id columns that have only a few unique values

In [None]:
def cat_feat_ploting(df, col):
    """Plot three views of a categorical column against the fraud label.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``col``, ``isFraud`` and ``TransactionAmt``.
    col : str
        Name of the categorical column to analyse.

    The figure contains:
      1. a count plot of ``col``, each bar labelled with its share of the rows
         of ``df``;
      2. a count plot of ``col`` split by ``isFraud``, with the percentage of
         fraud per category overlaid on a secondary axis;
      3. a boxen plot of ``TransactionAmt`` (rows with amounts up to 2000) per
         category, split by ``isFraud``.
    """
    # Row-normalised crosstab: percentage of fraud / non-fraud within each category.
    tmp = pd.crosstab(df[col], df['isFraud'], normalize='index') * 100
    tmp = tmp.reset_index()
    tmp.rename(columns={0:'NoFraud', 1:'Fraud'}, inplace=True)
    category_order = tmp[col].values
    # Bar shares are relative to the frame being plotted. This used to read a
    # global named `total` that this function never defined.
    total = len(df)

    plt.figure(figsize=(14,10))
    plt.suptitle(f'{col} Distributions', fontsize=22)

    plt.subplot(221)
    g = sns.countplot(x=col, data=df, order=category_order)

    g.set_title(f"{col} Distribution", fontsize=19)
    g.set_xlabel(f"{col} Name", fontsize=17)
    g.set_ylabel("Count", fontsize=17)
    for p in g.patches:
        height = p.get_height()
        g.text(p.get_x()+p.get_width()/2.,
                height + 3,
                '{:1.2f}%'.format(height/total*100),
                ha="center", fontsize=14)

    plt.subplot(222)
    g1 = sns.countplot(x=col, hue='isFraud', data=df, order=category_order)
    plt.legend(title='Fraud', loc='best', labels=['No', 'Yes'])
    # Secondary y-axis carrying the fraud percentage per category.
    gt = g1.twinx()
    gt = sns.pointplot(x=col, y='Fraud', data=tmp, color='black', order=category_order, legend=False)
    gt.set_ylabel("% of Fraud Transactions", fontsize=16)

    g1.set_title(f"{col} by Target(isFraud)", fontsize=19)
    g1.set_xlabel(f"{col} Name", fontsize=17)
    g1.set_ylabel("Count", fontsize=17)

    plt.subplot(212)
    g3 = sns.boxenplot(x=col, y='TransactionAmt', hue='isFraud',
                       data=df[df['TransactionAmt'] <= 2000], order=category_order)
    # Labels follow the plotted column; they used to say "ProductCD" for every col.
    g3.set_title(f"Transaction Amount Distribution by {col} and Target", fontsize=20)
    g3.set_xlabel(f"{col} Name", fontsize=17)
    g3.set_ylabel("Transaction Values", fontsize=17)

    plt.subplots_adjust(hspace = 0.4, top = 0.85)

    plt.show()

In [None]:
# id columns to plot; missing entries become the literal string 'NaN' first.
id_cols_to_plot = ['id_12', 'id_15', 'id_16', 'id_23', 'id_27', 'id_28', 'id_29']
for col in id_cols_to_plot:
    train[col] = train[col].fillna('NaN')
    cat_feat_ploting(train, col)

**id_30**

In [None]:
# Collapse the id_30 strings into a short label wherever they contain one of
# the substrings below. The patterns are applied in this order.
id_30_labels = [
    ('Windows', 'Windows'),
    ('iOS', 'iOS'),
    ('Mac OS', 'Mac'),
    ('Android', 'Android'),
]
for pattern, label in id_30_labels:
    train.loc[train['id_30'].str.contains(pattern, na=False), 'id_30'] = label
# Assign the result back: fillna(inplace=True) on a selected column is a
# chained operation that is not guaranteed to update `train` itself.
train['id_30'] = train['id_30'].fillna("NaN")

In [None]:
# Plot the grouped id_30 values (ploting_cnt_amt is defined elsewhere in the
# notebook and is not visible in this section).
ploting_cnt_amt(train, 'id_30')

**id_31**

In [None]:
# Collapse the id_31 strings into a short label wherever they match one of
# the patterns below. The patterns are applied in this order.
id_31_labels = [
    ('chrome', 'Chrome'),
    ('firefox', 'Firefox'),
    ('safari', 'Safari'),
    ('edge', 'Edge'),
    # Whole word only: the bare substring 'ie' also occurs inside unrelated
    # words such as "webview", which would then be labelled IE by mistake.
    (r'\bie\b', 'IE'),
    ('samsung', 'Samsung'),
    ('opera', 'Opera'),
]
for pattern, label in id_31_labels:
    train.loc[train['id_31'].str.contains(pattern, na=False), 'id_31'] = label
# Assign the result back: fillna(inplace=True) on a selected column is a
# chained operation that is not guaranteed to update `train` itself.
train['id_31'] = train['id_31'].fillna("NaN")
# Group every value with fewer than 200 occurrences under "Others".
id_31_counts = train['id_31'].value_counts()
rare_id_31 = id_31_counts[id_31_counts < 200].index
train.loc[train['id_31'].isin(rare_id_31), 'id_31'] = "Others"

In [None]:
# Plot the grouped id_31 values (ploting_cnt_amt is defined elsewhere in the
# notebook and is not visible in this section).
ploting_cnt_amt(train, 'id_31')

In [None]:
train['id_31']