Acquire

In [233]:
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns

from sklearn.model_selection import train_test_split

from pandas.core.window.ewm import ExponentialMovingWindow as emw

import wrangle
import warnings
warnings.filterwarnings("ignore")

In [234]:
X_df, y_df = wrangle.acquire_amex(sample_size=200000)
X_df = wrangle.clean_amex(X_df)
# wrangle.split_amex(X_df, y_df, train_size=.5, test_size=.5)

Summarize/Verify data

In [235]:
y_df.target.value_counts(normalize=True)

# print('Train: %d rows, %d cols' % y_train.shape)
# print('Validate: %d rows, %d cols' % y_validate.shape)
# print('Test: %d rows, %d cols' % y_validate.shape)

# print('Train: %d rows, %d cols' % X_train.shape)
# print('Validate: %d rows, %d cols' % X_validate.shape)
# print('Test: %d rows, %d cols' % X_test.shape)

# y_df.target.value_counts(normalize=True)

# y_train.target.value_counts(normalize=True)

0    0.741066
1    0.258934
Name: target, dtype: float64

Flatten the time series data. 

For each variable, we need to create the following:



In [556]:
# X_df.reset_index(drop=True).set_index(['S_2'])

sample = X_df[(X_df.customer_ID == X_df.customer_ID[0])|
              (X_df.customer_ID == X_df.customer_ID[15])][['customer_ID','P_2','R_1']]
sample

Unnamed: 0,customer_ID,P_2,R_1
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.938469,0.009228
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.936665,0.006151
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.95418,0.006815
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.960384,0.001373
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.947248,0.007605
5,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.945964,0.00422
6,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.940705,0.004509
7,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.914767,0.000263
8,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.950845,0.001789
9,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.86858,0.001772


In [501]:
def collapse_columns(df):
    df = df.copy()
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = df.columns.to_series().apply(lambda x: "_".join(x))
    return df

Slope

In [502]:
def get_slope(df):
    slope = df.groupby('customer_ID').agg(lambda x: np.polyfit(df.index, x, 1)[0])
    slope.columns = [x + '_slope' for x in slope.columns]
    return slope

Correlation Coefficient

In [503]:
def get_corr_coeff(df):
    r = df.groupby('customer_ID').agg(lambda x: np.corrcoef(df.index, x, rowvar=False)[0][1])
    r.columns = [x + '_r' for x in r.columns]
    return r

Count number of missing values in the set

In [504]:
def get_null_count(df):
    missing_vals = df.groupby('customer_ID').agg(lambda x: x.isnull().sum())
    missing_vals.columns = [x + '_nulls' for x in missing_vals.columns]
    return missing_vals

Compute number of 0 values in the set

In [505]:
def get_zeros(df):
    zeros = df.groupby('customer_ID').agg(lambda x: (x == 0.0).sum())
    zeros.columns = [x + '_zeros' for x in zeros.columns]
    return zeros

Compute coefficient of variation

In [506]:
def cv(df):
    cv = df.groupby('customer_ID').agg(lambda x: x.std()/x.mean())
    cv.columns = [x + '_cv' for x in cv.columns]
    return cv

Get the change in value over every 2 periods. Use that information to gather:

1. the current value (delta_value)
2. the number of values over time that are less than 0 (neg_delta_count)
3. the current 6 period moving average of the delta_values (delta_mean)

In [507]:
def difference(df):
    delta = df.groupby('customer_ID').diff(periods=2)
    delta.index = sample.customer_ID
    return delta

def delta_vals(df):
    # first compute the 2 period delta and create a dataframe with those values
    delta = difference(df)
    delta.columns = [x + '_diff' for x in delta.columns]
    
    # Use the delta df to take the last value as the current delta
    delta_value = delta.groupby('customer_ID').last()
    
    # use the delta df to count the number of changes over customer history that were negative
    neg_delta_count = delta.groupby('customer_ID').agg(lambda x: (x < 0).sum())
    neg_delta_count.columns = [x + '_count' for x in delta.columns]
    
    # use the delta df to compute the rolling average of the delta values
    delta_mean = delta.groupby('customer_ID').transform(lambda x: x.rolling(window=6, 
                                                                       min_periods=3, 
                                                                       closed='left').mean())
    delta_mean.columns = [x + '_mean' for x in delta.columns]
    
    # take the last value, the current average of change
    delta_mean = delta_mean.groupby('customer_ID').last()
    
    # concatenate the dataframes with the computed values by concatenating columns along the customer index
    delta_df = pd.concat([delta_value, neg_delta_count, delta_mean], axis=1)
    return delta_df

Compute the exponentially weighted moving average

In [508]:
def ema(df):
    ema = sample.groupby('customer_ID').transform(lambda x: x.ewm(alpha=.8,
                                                                  min_periods=1, 
                                                                  adjust=True).mean().shift(periods=1))
    ema.columns = [x + '_ema' for x in ema.columns]
    ema.index = sample.customer_ID
    ema = ema.groupby('customer_ID').last()
    return ema


Compute the rolling standard deviation

In [509]:
# rolling standard deviation
def rolling_std(df):
    std = df.groupby('customer_ID').transform(lambda x: x.rolling(window=12, min_periods=1, closed='left').mean())
    std.columns = [x + '_std' for x in std.columns]
    std.index = df.customer_ID
    std_df = std.groupby('customer_ID').last()
    return std_df

Compute rolling values. These will return a dataframe with a value for each row. I will need to take the last value. 

- exponentially weighted moving average (alpha = .8)
- rolling standard deviation (12 periods)
- Upper and lower bollinger bands
- %b
- bandwidth
- period over period difference

In [510]:
# compute upper and lower bands, with a weight of 3
k = 3

def bollinger(x):
    ubb = metrics_df[x + '_ema'] + k*metrics_df[x + '_std']
    lbb = metrics_df[(x + '_ema')] - k*metrics_df[x + '_std']
    pctb = (metrics_df[x + '_last'] - lbb) / (ubb - lbb)
    return pctb

In [552]:
def compute_pctb(df, x):
    # set the index in sample to customer_id
    sample_indexed = sample.set_index('customer_ID')
    
    # create an empty series
    pctb_series = pd.Series()

    # loop through original column names and for eacsh one, compute pctb
    for x in sample_indexed.columns:
        pctb = bollinger(x)
        pctb_series = pd.concat([pctb_series, pctb], axis=1)

    pctb_df = pd.DataFrame(pctb_series)
    pctb_df = pctb_df.iloc[:,1:]
    pctb_df.columns = [x + '_%b' for x in sample_indexed.columns]
    
    return pctb_df

In [553]:
def agg_features(df):
    sample_agg = df.groupby('customer_ID').agg(['last', 'min', 'max', 'median', 'count'])
    sample_df = collapse_columns(sample_agg)
    slope_df = get_slope(df)
    r_df = get_corr_coeff(df)
    missing_vals_df = get_null_count(df)
    zero_df = get_zeros(df)
    cv_df = cv(df)
    delta_df = delta_vals(df)
    ema_df = ema(df)
    std_df = rolling_std(df)
    metrics_df = pd.concat([sample_df, slope_df, r_df, missing_vals_df, 
                            zero_df, cv_df, delta_df, ema_df, std_df],axis=1)
    pctb_df = compute_pctb(df, x)
    metrics_df = pd.concat([pctb_df, metrics_df], axis=1)
    return metrics_df

In [557]:
metrics_df = agg_features(sample)

TypeError: can only concatenate str (not "float") to str

In [555]:
metrics_df

Unnamed: 0,P_2_%b,R_1_%b,P_2_last,P_2_min,P_2_max,P_2_median,P_2_count,R_1_last,R_1_min,R_1_max,...,P_2_diff,R_1_diff,P_2_diff_count,R_1_diff_count,P_2_diff_mean,R_1_diff_mean,P_2_ema,R_1_ema,P_2_std,R_1_std
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,0.500704,0.661504,0.934745,0.86858,0.960384,0.938469,13,0.006104,0.000263,0.009228,...,0.024933,-0.002072,6.0,7.0,-0.007675,-0.000507,0.930801,0.001864,0.933747,0.004376


Explore the different columns, datatypes, descriptive stats

For reference: 
* D_* = Delinquency variables
* S_* = Spend variables
* P_* = Payment variables
* B_* = Balance variables
* R_* = Risk variables

In [None]:
spend = X_df.iloc[:,X_df.columns.str[0] == 'S']
delinq = X_df.iloc[:,X_df.columns.str[0] == 'D']
pay = X_df.iloc[:,X_df.columns.str[0] == 'P']
balance = X_df.iloc[:,X_df.columns.str[0] == 'B']
risk = X_df.iloc[:,X_df.columns.str[0] == 'R']

**Spend variables**

- 22 total columns

- S_2: date *needs to be converted* **done**

- All others: float

- S_2, S_5, S_6, S_8, S_11:S_13, S_15:S_20 : no missing values

- S_22:S_26 : missing < 1% of values

- S_3, S_7, S_27 : missing 1-25% of values

- S_9, S_27 : missing 25-75% of values

In [None]:
spend.info()

**Delinquency Variables**

- 96 total columns

- D_63: Object

- D_64: Object

- All others: float

- D_39, D_47, D_51, D_58, D_60, D_63, D_65, D_71, D_75, D_86, D_92, D_93, D_94, D_96, D_127 : no missing values

- D_42, D_49, D_66, D_73, D_76, D_87, D_88, D_106, D_108, D_110, D_111, D_132, D_134:D_138, D_142 : missing > 75% of values.

- D_41, D_44:D_46, D_48, D_52, D_54:D_55, D_59, D_61, D_62, D_64, D_68:D_70, D_72, D_74, D_78:D_81, D_83, D_84, D_89, D_91, D_102:D_104, D_107, D_109, D_112:D_126, D_128:D_131, D_133, D_139:D_145: missing < 25%

- D_43, D_50, D_53 D_56, D_77, D_82, D_105 : 25-75% missing



In [None]:
delinq.D_63.value_counts()

In [None]:
delinq.D_64.value_counts()

In [None]:
delinq.info()

**Payment Variables**

- 3 total columns (P_2, P_3, P_4)

- all: float

- P_4 : no missing values

- P_2 & P_3 : missing < 1%

In [None]:
pay.info()

In [None]:
pay.describe()

**Balance Variables**

- 40 variables

- B_31: int (0, 1)

- all others: float

- B_29, B_39, and B_42 are majority null

- B_17 is missing 

- B_1, B_4, B_5, B_7, B_9, B_10, B_11, B_12, B_14, B_18, B_21, B_23, B_24, B_28, B_31, B_32, B_36 have no missing values. 

- B_2, B_3, B_6, B_8, B_13, B_15, B_16, B_19, B_20, B_25, B_26, B_27, B_30, B_33, B_37, B_38, B_40, B_41 are missing < 1% 


In [None]:
balance.B_31.value_counts()

In [None]:
balance.info()

In [None]:
balance.describe().T

**Risk Variables**

- 28 Columns

- All: float

- R_9, R_26: missing > 90% of values. 

- R_12, R_20, and R_27 are missing < 1%

- R_1:R_8, R_10:R_11, R13:R19, R21:R26, R28 :  no missing values

In [None]:
risk.info()

In [None]:
# generate lists of column names by datatype for future use in analysis
object_cols = ['D_63', 'D_64']
int_cols = ['B_31']
date_cols = ['S_2']

# list of non_float columns in order to generate a list of all float column names (186 columns)
non_float_cols = object_cols + int_cols + date_cols
float_cols = [col for col in X_df.columns if col not in non_float_cols]
len(float_cols)

In [None]:
with pd.option_context('display.max_rows', None,):
    print(null_df.sort_values('total_nulls'))

In [None]:
null_df.groupby('feature_category').percent_nulls.agg(['mean', 'median', 'max', 'min']).sort_values('mean', ascending=False)