# Feature Engineering

In [8]:
# import library
import pandas as pd

import warnings
# NOTE: this silences every warning category for the rest of the session,
# including pandas deprecation / FutureWarning messages raised by later cells.
warnings.simplefilter("ignore")
    
# read dataset
# The path is relative, so the CSV is resolved against the current working
# directory. The trailing df.head() previews the first five rows.
df = pd.read_csv('data_credit_card_customer_seg.csv')
df.head()

Unnamed: 0,CUST_ID,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
0,C10001,40.900749,0.818182,95.4,0.0,95.4,0.0,0.166667,0.0,0.083333,0.0,0,2,1000.0,201.802084,139.509787,0.0,12
1,C10002,3202.467416,0.909091,0.0,0.0,0.0,6442.945483,0.0,0.0,0.0,0.25,4,0,7000.0,4103.032597,1072.340217,0.222222,12
2,C10003,2495.148862,1.0,773.17,773.17,0.0,0.0,1.0,1.0,0.0,0.0,0,12,7500.0,622.066742,627.284787,0.0,12
3,C10004,1666.670542,0.636364,1499.0,1499.0,0.0,205.788017,0.083333,0.083333,0.0,0.083333,1,1,7500.0,0.0,,0.0,12
4,C10005,817.714335,1.0,16.0,16.0,0.0,0.0,0.083333,0.083333,0.0,0.0,0,1,1200.0,678.334763,244.791237,0.0,12


## Data cleaning

In [9]:
# Count the missing values in each column.
df.isna().sum()

CUST_ID                               0
BALANCE                               0
BALANCE_FREQUENCY                     0
PURCHASES                             0
ONEOFF_PURCHASES                      0
INSTALLMENTS_PURCHASES                0
CASH_ADVANCE                          0
PURCHASES_FREQUENCY                   0
ONEOFF_PURCHASES_FREQUENCY            0
PURCHASES_INSTALLMENTS_FREQUENCY      0
CASH_ADVANCE_FREQUENCY                0
CASH_ADVANCE_TRX                      0
PURCHASES_TRX                         0
CREDIT_LIMIT                          1
PAYMENTS                              0
MINIMUM_PAYMENTS                    313
PRC_FULL_PAYMENT                      0
TENURE                                0
dtype: int64

In [12]:
# Impute missing values with the column mean.
# The filled Series is assigned back to the frame instead of calling
# fillna(inplace=True) on df[col]: the in-place form operates on an
# intermediate Series, which pandas deprecates (FutureWarning in 2.x) and
# which leaves df untouched once Copy-on-Write is enabled.
df['CREDIT_LIMIT'] = df['CREDIT_LIMIT'].fillna(df['CREDIT_LIMIT'].mean())
df['MINIMUM_PAYMENTS'] = df['MINIMUM_PAYMENTS'].fillna(df['MINIMUM_PAYMENTS'].mean())

CREDIT_LIMIT and MINIMUM_PAYMENTS contain some null values. We handle the missing values by replacing them with the column mean.

In [13]:
# Grand total of missing cells across the whole frame (expected to be 0 after imputation).
df.isna().sum().sum()

0

In [14]:
# drop unnecessary columns
# `columns=` already identifies the axis, so the redundant `axis=1` is removed.
# errors='ignore' keeps this cell re-runnable: a second execution finds
# CUST_ID already gone and would otherwise raise KeyError.
df = df.drop(columns=['CUST_ID'], errors='ignore')

## Feature engineering

### 1. Types of Purchases (one-off, installments)
Which types of purchases (one-off, installment, both, or none) each customer makes on their credit card.

In [15]:
# Preview the two purchase-amount columns that define the purchase type.
df[['ONEOFF_PURCHASES', 'INSTALLMENTS_PURCHASES']].head()

Unnamed: 0,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES
0,0.0,95.4
1,0.0,0.0
2,773.17,0.0
3,1499.0,0.0
4,16.0,0.0


In [16]:
def purchase(df):
    """Label a single customer record by the kind of purchases it contains.

    Parameters
    ----------
    df : row-like mapping
        One record (for example the Series handed over by
        ``DataFrame.apply(..., axis=1)``) exposing the keys
        ``'ONEOFF_PURCHASES'`` and ``'INSTALLMENTS_PURCHASES'``.

    Returns
    -------
    str or None
        ``'none'``        - both amounts equal 0
        ``'both'``        - both amounts are greater than 0
        ``'one_off'``     - only the one-off amount is greater than 0
        ``'installment'`` - only the installment amount is greater than 0
        ``None``          - no rule matches (e.g. a negative or NaN amount)
    """
    one_off_amount = df['ONEOFF_PURCHASES']
    installment_amount = df['INSTALLMENTS_PURCHASES']

    one_off_zero = one_off_amount == 0
    one_off_positive = one_off_amount > 0
    installment_zero = installment_amount == 0
    installment_positive = installment_amount > 0

    if one_off_zero & installment_zero:
        return 'none'
    if one_off_positive & installment_positive:
        return 'both'
    if one_off_positive & installment_zero:
        return 'one_off'
    if one_off_zero & installment_positive:
        return 'installment'

    # No rule matched (negative or NaN input): same implicit result as before.
    return None


# Run purchase() once per row (axis=1) and store the label as a new column.
df['purchase_type'] = df.apply(purchase, axis=1)

# Number of customers in each purchase-type category.
df['purchase_type'].value_counts()

both           2774
installment    2260
none           2042
one_off        1874
Name: purchase_type, dtype: int64

### 2. Monthly average purchase

In [18]:
# Purchases divided by tenure, element-wise per customer.
df['Monthly_avg_purch'] = df['PURCHASES'].div(df['TENURE'])

In [19]:
# Show the two inputs next to the derived column as a sanity check.
df[['PURCHASES', 'TENURE', 'Monthly_avg_purch']].head()

Unnamed: 0,PURCHASES,TENURE,Monthly_avg_purch
0,95.4,12,7.95
1,0.0,12,0.0
2,773.17,12,64.430833
3,1499.0,12,124.916667
4,16.0,12,1.333333


### 3. Monthly cash advance amount

In [20]:
# Cash-advance amount divided by tenure, element-wise per customer.
df['Monthly_cash_advance'] = df['CASH_ADVANCE'].div(df['TENURE'])
df['Monthly_cash_advance'].head()

0      0.000000
1    536.912124
2      0.000000
3     17.149001
4      0.000000
Name: Monthly_cash_advance, dtype: float64

### 4. Total purchase amount (purchases + cash advance)

In [29]:
# Row-wise sum of PURCHASES and CASH_ADVANCE.
# The columns are selected directly rather than through
# pd.DataFrame(data=df, columns=[...]): that constructor reindexes, so a
# missing or misspelled column would silently become all-NaN and be summed
# as 0, whereas direct selection raises KeyError.
df['TOTAL_PURCH_AMT'] = df[['PURCHASES', 'CASH_ADVANCE']].sum(axis=1)
df['TOTAL_PURCH_AMT'].head()

0      95.400000
1    6442.945483
2     773.170000
3    1704.788017
4      16.000000
Name: TOTAL_PURCH_AMT, dtype: float64

### 5. Total transactions (purchase + cash-advance transactions)

In [30]:
# Row-wise sum of the purchase and cash-advance transaction counts.
# The columns are selected directly rather than through
# pd.DataFrame(data=df, columns=[...]): that constructor reindexes, so a
# missing or misspelled column would silently become all-NaN and be summed
# as 0, whereas direct selection raises KeyError.
df['TOTAL_TRX'] = df[['PURCHASES_TRX', 'CASH_ADVANCE_TRX']].sum(axis=1)
df['TOTAL_TRX'].head()

0     2
1     4
2    12
3     2
4     1
Name: TOTAL_TRX, dtype: int64