Acquire

In [141]:
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.stats

from sklearn.model_selection import train_test_split

from pandas.core.window.ewm import ExponentialMovingWindow as emw

import wrangle
import warnings
warnings.filterwarnings("ignore")

In [142]:
# %%timeit -r 1 -n 1

# Pull the feature frame (X_df) and the target frame (y_df) through the project-local wrangle module.
# NOTE(review): sample_size=199990 presumably limits how many rows are read — confirm in wrangle.acquire_amex.
X_df, y_df = wrangle.acquire_amex(sample_size=199990)
# Clean the features, then drop the 'S_2' column. X_df is rebound here, so the
# uncleaned frame is no longer reachable after this cell runs.
X_df = wrangle.clean_amex(X_df).drop(columns=['S_2'])
# wrangle.split_amex(X_df, y_df, train_size=.5, test_size=.5)

In [143]:
#################################
# Feature Engineering functions #
#################################

def collapse_columns(X_df):
    '''
    flatten multi-level column labels into single strings, in place.

    when the columns of X_df are a pd.MultiIndex (for example after a groupby
    aggregation with a list of functions), every label tuple is joined with
    "_", so ('P_2', 'last') becomes 'P_2_last'. a frame whose columns are
    already flat is left untouched. the frame that was passed in is the
    frame that is returned (no copy is made).
    '''
    has_multilevel_columns = isinstance(X_df.columns, pd.MultiIndex)
    if not has_multilevel_columns:
        return X_df

    X_df.columns = ["_".join(label_parts) for label_parts in X_df.columns]
    return X_df

def get_null_count(X_df):
    '''
    count the missing values of every feature for each customer.

    rows are grouped on 'customer_ID'. the returned dataframe has one row per
    customer and one column per original feature, named
    <column_name_orig>_nulls.
    '''
    def count_nulls(values):
        # number of null entries in one feature for one customer
        return values.isnull().sum()

    null_counts = X_df.groupby('customer_ID').agg(count_nulls)
    null_counts.columns = [feature + '_nulls' for feature in null_counts.columns]
    return null_counts

def get_zeros(X_df):
    '''
    count how many values of every feature are exactly zero for each customer.

    rows are grouped on 'customer_ID'. the returned dataframe has one row per
    customer and one column per original feature, named
    <column_name_orig>_zeros.
    '''
    def count_zeros(values):
        # number of entries equal to 0.0 in one feature for one customer
        return (values == 0.0).sum()

    zero_counts = X_df.groupby('customer_ID').agg(count_zeros)
    zero_counts.columns = [feature + '_zeros' for feature in zero_counts.columns]
    return zero_counts

# def get_cv(X_df):
#     '''
#     this function will compute the coefficient of variation for each feature. 
#     it reaturns a dataframe with the columns: <column_name_orig>_cv 
#     '''
#     cv_df = X_df.groupby('customer_ID').agg(lambda x: x.std()/x.mean())
#     cv_df.columns = [x + '_cv' for x in cv_df.columns]
#     return cv_df

def get_two_period_difference(X_df):
    '''
    compute the 2-period difference of every feature within each customer:
    each row's value minus the value two rows earlier for the same customer.

    the returned dataframe has one row per row of X_df and is indexed by
    customer_ID (repeated once per row). rows without a value two periods
    back are NaN.
    this function is used in the get_delta_values() function.
    '''
    customer_ids = X_df['customer_ID']
    two_period_diff = X_df.groupby('customer_ID').diff(periods=2)
    # label every row with its customer so later groupbys can group on the index
    return two_period_diff.set_index(customer_ids)

    
def get_delta_values(X_df):
    '''
    build three per-customer feature sets from the 2-period differences of
    every feature and return them side by side in one dataframe:

    <column>_diff        the last available 2-period difference
    <column>_diff_count  how many 2-period differences were negative
    <column>_diff_mean   the last available value of a rolling mean of the
                         differences (window of 6 rows, at least 3
                         observations, window closed on the left so the
                         current row is not part of its own average)

    the result has one row per customer_ID.
    '''
    def count_negative(changes):
        # number of negative changes in one feature for one customer
        return (changes < 0).sum()

    def lagged_rolling_mean(changes):
        # rolling average of the changes that excludes the current row
        return changes.rolling(window=6, min_periods=3, closed='left').mean()

    # row-level 2-period differences, indexed by customer_ID
    delta_df = get_two_period_difference(X_df)
    diff_names = [feature + '_diff' for feature in delta_df.columns]
    delta_df.columns = diff_names

    # current delta: last value per customer
    delta_value = delta_df.groupby('customer_ID').last()

    # number of negative deltas over the customer's history
    neg_delta_count = delta_df.groupby('customer_ID').agg(count_negative)
    neg_delta_count.columns = [name + '_count' for name in diff_names]

    # rolling average of the deltas, then its last value per customer
    delta_mean = delta_df.groupby('customer_ID').transform(lagged_rolling_mean)
    delta_mean.columns = [name + '_mean' for name in diff_names]
    delta_mean = delta_mean.groupby('customer_ID').last()

    # line the three blocks up column-wise on the customer index
    return pd.concat([delta_value, neg_delta_count, delta_mean], axis=1)

def get_ema(X_df, alpha=.8):
    '''
    compute a lagged exponential moving average of every feature for each
    customer and keep one value per customer.

    within each customer the EMA is computed with the given smoothing factor
    (adjust=True, min_periods=1) and then shifted down by one row, so the
    value on a row only reflects the rows before it. the last available
    shifted value per customer is returned.

    parameters:
        X_df:  dataframe with a 'customer_ID' column and the feature columns.
        alpha: smoothing factor passed to ewm(); defaults to .8, the value
               that was previously hard-coded.

    returns a dataframe indexed by customer_ID with the columns:
    <column_name_orig>_ema.
    '''
    def lagged_ema(values):
        # shift by one row so the current observation is excluded
        return values.ewm(alpha=alpha, min_periods=1, adjust=True).mean().shift(periods=1)

    ema_df = X_df.groupby('customer_ID').transform(lagged_ema)
    ema_df.columns = [x + '_ema' for x in ema_df.columns]
    # transform keeps the row index of X_df; relabel rows with their customer
    ema_df.index = X_df['customer_ID']
    return ema_df.groupby('customer_ID').last()

def get_pctb(X_df, metrics_df, k=6):
    '''
    add a %b column for every feature to the per-customer metrics.

    for each feature column x of X_df (every column except 'customer_ID'):
        upper band = x_ema + k * x_std
        lower band = x_ema - k * x_std
        x_%b       = (x_last - lower band) / (upper band - lower band)

    parameters:
        X_df:       row-level dataframe; only its column names are used.
        metrics_df: per-customer dataframe that must contain <x>_ema,
                    <x>_std and <x>_last for every feature x.
        k:          band half-width in standard deviations; defaults to 6,
                    the value that was previously hard-coded.

    returns a new dataframe with the <x>_%b columns first, followed by all
    columns of metrics_df. metrics_df itself is not modified.
    '''
    feature_names = [name for name in X_df.columns if name != 'customer_ID']

    # collect the new columns first and build the frame once, instead of
    # growing it with one concat per feature
    pctb_columns = {}
    for name in feature_names:
        center = metrics_df[name + '_ema']
        half_width = k * metrics_df[name + '_std']
        lower_band = center - half_width
        upper_band = center + half_width
        pctb_columns[name + '_%b'] = (metrics_df[name + '_last'] - lower_band) / (upper_band - lower_band)

    pctb_df = pd.DataFrame(pctb_columns, index=metrics_df.index)
    return pd.concat([pctb_df, metrics_df], axis=1)

def get_range(X_df, metrics_df):
    '''
    add a range column (max - min) for every feature to the per-customer
    metrics.

    parameters:
        X_df:       row-level dataframe; only its column names are used
                    (every column except 'customer_ID' is a feature).
        metrics_df: per-customer dataframe that must contain <x>_max and
                    <x>_min for every feature x.

    returns a new dataframe with the <x>_range columns first, followed by
    all columns of metrics_df. metrics_df itself is not modified.
    '''
    # derive the feature names from the argument; the previous version read
    # a variable that only existed inside get_pctb
    feature_names = [name for name in X_df.columns if name != 'customer_ID']

    # the columns are labelled _range (not _%b) so they do not collide with
    # the %b features
    range_columns = {
        name + '_range': metrics_df[name + '_max'] - metrics_df[name + '_min']
        for name in feature_names
    }

    range_df = pd.DataFrame(range_columns, index=metrics_df.index)
    return pd.concat([range_df, metrics_df], axis=1)

def get_cv(X_df, metrics_df):
    '''
    add a coefficient-of-variation column for every feature to the
    per-customer metrics, computed as <x>_std / <x>_ema.

    parameters:
        X_df:       row-level dataframe; only its column names are used
                    (every column except 'customer_ID' is a feature).
        metrics_df: per-customer dataframe that must contain <x>_std and
                    <x>_ema for every feature x.

    returns a new dataframe with the <x>_cv columns first, followed by all
    columns of metrics_df. metrics_df itself is not modified.
    '''
    # derive the feature names from the argument; the previous version read
    # a variable that only existed inside get_pctb
    feature_names = [name for name in X_df.columns if name != 'customer_ID']

    cv_columns = {
        name + '_cv': metrics_df[name + '_std'] / metrics_df[name + '_ema']
        for name in feature_names
    }

    cv_df = pd.DataFrame(cv_columns, index=metrics_df.index)
    return pd.concat([cv_df, metrics_df], axis=1)

def ent(data):
    """Return the entropy of the value distribution of the passed `pd.Series`.

    The occurrences of each distinct value are counted and the counts are
    handed to `scipy.stats.entropy`.
    """
    occurrences = data.value_counts().to_numpy()
    return scipy.stats.entropy(occurrences)

def get_features(X_df):
    '''
    build the per-customer feature dataframe from the row-level dataframe.

    steps:
    1. aggregate every feature per customer_ID (last, std, min, max) and
       flatten the resulting column labels.
    2. add the null counts, zero counts, delta features and EMA features.
    3. add the %b, range and cv metrics derived from those columns.
    4. drop the columns ending in _min or _std.
    5. drop the columns where more than 90% of the rows are missing.
    6. keep only the columns whose entropy is greater than 1.
    '''
    # one row per customer: last / std / min / max of every feature
    agg_df = collapse_columns(X_df.groupby('customer_ID').agg(['last', 'std', 'min', 'max']))

    feature_blocks = [
        agg_df,
        get_null_count(X_df),
        get_zeros(X_df),
        get_delta_values(X_df),
        get_ema(X_df),
    ]
    metrics_df = pd.concat(feature_blocks, axis=1)

    # metrics derived from the aggregated columns
    for add_metric in (get_pctb, get_range, get_cv):
        metrics_df = add_metric(X_df, metrics_df)

    # drop the _min and _std columns. Those are captured in _range and _cv
    redundant_cols = metrics_df.filter(regex='(_min|_std)$', axis=1).columns
    metrics_df = metrics_df.drop(columns=redundant_cols)

    # drop those columns where > 90% of rows are missing values
    missing_pct = metrics_df.isnull().sum() / len(metrics_df)
    mostly_missing_cols = missing_pct[missing_pct > .90].index
    metrics_df = metrics_df.drop(columns=mostly_missing_cols)

    # keep the columns whose value distribution has entropy above 1
    entropy_series = metrics_df.apply(ent)
    informative_cols = entropy_series[entropy_series > 1].index
    return metrics_df[informative_cols]



Run line by line, testing

In [145]:
X_df

Unnamed: 0,customer_ID,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,...,D_145,D_63_CO,D_63_CR,D_63_XL,D_63_XM,D_63_XZ,D_64_O,D_64_R,D_64_U,B_31_1
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.938469,0.001733,0.008724,1.006838,0.009228,0.124035,0.008771,0.004709,,...,0.002674,0,1,0,0,0,1,0,0,1
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.936665,0.005775,0.004923,1.000653,0.006151,0.126750,0.000798,0.002714,,...,0.009217,0,1,0,0,0,1,0,0,1
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.954180,0.091505,0.021655,1.009672,0.006815,0.123977,0.007598,0.009423,,...,0.002603,0,1,0,0,0,1,0,0,1
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.960384,0.002455,0.013683,1.002700,0.001373,0.117169,0.000685,0.005531,,...,0.009600,0,1,0,0,0,1,0,0,1
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.947248,0.002483,0.015193,1.000727,0.007605,0.117325,0.004653,0.009312,,...,0.009827,0,1,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199985,09572daa668d689b39d22d8a6a234d48141259b0675b0d...,0.837093,0.008891,0.010704,1.003218,0.009536,0.135677,0.000292,0.009491,,...,0.002958,1,0,0,0,0,1,0,0,1
199986,09572daa668d689b39d22d8a6a234d48141259b0675b0d...,0.836307,0.033900,0.037365,1.007822,0.006882,0.124006,0.000891,0.002805,,...,0.002722,1,0,0,0,0,1,0,0,1
199987,09572daa668d689b39d22d8a6a234d48141259b0675b0d...,0.878741,0.240031,0.044401,1.007089,0.000405,0.128138,0.001392,0.003904,,...,0.009201,1,0,0,0,0,1,0,0,1
199988,09572daa668d689b39d22d8a6a234d48141259b0675b0d...,0.836796,0.091514,0.013977,1.001713,0.007423,0.121567,0.008139,0.007592,,...,0.001935,1,0,0,0,0,1,0,0,1


In [146]:
features_df = get_features(X_df)

In [148]:
features_df.to_csv('features.csv')

Flatten the time series data. 

For each variable, we need to create the following:



Explore the different columns, datatypes, descriptive stats

For reference: 
* D_* = Delinquency variables
* S_* = Spend variables
* P_* = Payment variables
* B_* = Balance variables
* R_* = Risk variables

In [None]:
# one frame per feature category, selected by the first letter of the column name
spend = X_df.loc[:, X_df.columns.str.startswith('S')]
delinq = X_df.loc[:, X_df.columns.str.startswith('D')]
pay = X_df.loc[:, X_df.columns.str.startswith('P')]
balance = X_df.loc[:, X_df.columns.str.startswith('B')]
risk = X_df.loc[:, X_df.columns.str.startswith('R')]

**Spend variables**

- 22 total columns

- S_2: date *needs to be converted* **done**

- All others: float

- S_2, S_5, S_6, S_8, S_11:S_13, S_15:S_20 : no missing values

- S_22:S_26 : missing < 1% of values

- S_3, S_7, S_27 : missing 1-25% of values

- S_9, S_27 : missing 25-75% of values

In [None]:
spend.info()

**Delinquency Variables**

- 96 total columns

- D_63: Object

- D_64: Object

- All others: float

- D_39, D_47, D_51, D_58, D_60, D_63, D_65, D_71, D_75, D_86, D_92, D_93, D_94, D_96, D_127 : no missing values

- D_42, D_49, D_66, D_73, D_76, D_87, D_88, D_106, D_108, D_110, D_111, D_132, D_134:D_138, D_142 : missing > 75% of values.

- D_41, D_44:D_46, D_48, D_52, D_54:D_55, D_59, D_61, D_62, D_64, D_68:D_70, D_72, D_74, D_78:D_81, D_83, D_84, D_89, D_91, D_102:D_104, D_107, D_109, D_112:D_126, D_128:D_131, D_133, D_139:D_145: missing < 25%

- D_43, D_50, D_53, D_56, D_77, D_82, D_105 : 25-75% missing



In [None]:
# NOTE(review): the X_df preview earlier in this notebook lists one-hot columns
# D_63_CO ... D_63_XZ; if the cleaning step replaced the raw D_63 column, this
# attribute lookup fails on the cleaned frame — TODO confirm.
delinq.D_63.value_counts()

In [None]:
# NOTE(review): the X_df preview earlier in this notebook lists one-hot columns
# D_64_O, D_64_R, D_64_U; if the cleaning step replaced the raw D_64 column,
# this attribute lookup fails on the cleaned frame — TODO confirm.
delinq.D_64.value_counts()

In [None]:
delinq.info()

**Payment Variables**

- 3 total columns (P_2, P_3, P_4)

- all: float

- P_4 : no missing values

- P_2 & P_3 : missing < 1%

In [None]:
pay.info()

In [None]:
pay.describe()

**Balance Variables**

- 40 variables

- B_31: int (0, 1)

- all others: float

- B_29, B_39, and B_42 are majority null

- B_17 is missing 

- B_1, B_4, B_5, B_7, B_9, B_10, B_11, B_12, B_14, B_18, B_21, B_23, B_24, B_28, B_31, B_32, B_36 have no missing values. 

- B_2, B_3, B_6, B_8, B_13, B_15, B_16, B_19, B_20, B_25, B_26, B_27, B_30, B_33, B_37, B_38, B_40, B_41 are missing < 1% 


In [None]:
# NOTE(review): the X_df preview earlier in this notebook lists a B_31_1 column;
# if the cleaning step replaced the raw B_31 column, this attribute lookup
# fails on the cleaned frame — TODO confirm.
balance.B_31.value_counts()

In [None]:
balance.info()

In [None]:
balance.describe().T

**Risk Variables**

- 28 Columns

- All: float

- R_9, R_26: missing > 90% of values. 

- R_12, R_20, and R_27 are missing < 1%

- R_1:R_8, R_10:R_11, R_13:R_19, R_21:R_26, R_28 : no missing values

In [None]:
risk.info()

In [None]:
# column names grouped by datatype, kept for later analysis
object_cols = ['D_63', 'D_64']
int_cols = ['B_31']
date_cols = ['S_2']

# every column that is not listed above is treated as a float column (186 columns)
# NOTE(review): this filter only removes the names listed above, so 'customer_ID'
# (and any column the cleaning step renamed) stays in float_cols if present in
# X_df; 'S_2' was already dropped in the acquire cell — confirm the expected count.
non_float_cols = [*object_cols, *int_cols, *date_cols]
float_cols = [name for name in X_df.columns if name not in non_float_cols]
len(float_cols)

In [None]:
# Print every row of null_df, sorted by its 'total_nulls' column, with the row-display limit lifted.
# NOTE(review): null_df is not defined in any cell visible in this notebook, so this
# cell raises NameError on a fresh kernel unless it is built elsewhere — TODO confirm
# where null_df comes from.
with pd.option_context('display.max_rows', None,):
    print(null_df.sort_values('total_nulls'))

In [None]:
null_df.groupby('feature_category').percent_nulls.agg(['mean', 'median', 'max', 'min']).sort_values('mean', ascending=False)

In [None]:
y_df.target.value_counts(normalize=True)