Acquire

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.stats
import xgboost as xgb

from sklearn.model_selection import train_test_split

from pandas.core.window.ewm import ExponentialMovingWindow as emw

import wrangle
from wrangle import collapse_columns, get_null_count, get_zeros, get_delta_values, get_zeros_pct
from wrangle import get_negative_count, get_ema, get_pctb, get_range, get_cv, get_negative_pct

import warnings
warnings.filterwarnings("ignore")

Here we analyzed the feature importance based on the Gain metric from the best performing XGBoost model at this point. 
The goal is to identify the raw features that are providing the most information to the model as a way to reduce the number of features and thus the dimensionality. In doing that, we can focus on generating new features from the most influential and decrease performance time. We will take the top 20 features for a first run. 

In [4]:
for i,chunk in enumerate(pd.read_csv('../../data/raw/train_data.csv', chunksize=350000)):
    print(i)
    chunk.to_csv('../../data/chunked/train_data_chunk{}.csv'.format(i), index=False)

0


FileNotFoundError: [Errno 2] No such file or directory: '../../data/chunked/train_data_chunk0.csv'

In [2]:
# %%timeit -r 1 -n 1
# X_df_header = pd.read_csv('../../data/raw/train_data.csv', nrows=0)
X_df, y_df = wrangle.acquire_amex()

In [None]:
def prepare_cat_vars(X_df):
    '''
    this function will take the categorical variables and create a df with the dummy variables of those. 
    it returns that dataframe 
    '''
    # create dataframe of categorical columns only
    cat_columns = ['B_30', 'B_31', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
    X_df_cat = X_df[['customer_ID'] + cat_columns]

    # we will want to create dummy variables of categorical columns
    X_df_cat = pd.get_dummies(X_df_cat, columns=cat_columns, drop_first=True)
    return X_df_cat, cat_columns

In [None]:
X_df_cat, cat_columns = prepare_cat_vars(X_df)

Create new features out of numeric variables

In [None]:
non_numeric_cols = ['S_2'] + cat_columns
num_columns = [col for col in X_df.columns if col not in non_numeric_cols]
X_df_num = X_df[num_columns]

In [None]:
missing_vals_df = get_null_count(X_df_num[['customer_ID'] + list(X_df_num.isnull().sum()[X_df_num.isnull().sum()>0].index)])
missing_vals_df

Take care of all nulls and outliers in the numeric columns:

1. for values < -1, set value to -1. 
2. for values > 10, set value to 10. 
3. for variables where the min is > 0, set nulls to 0. 
4. for variables where the min is <= 0, set nulls to -2. 

In [None]:
for col in X_df_num.drop(columns=['customer_ID']).columns:
    # for values < -1, set to -1
    X_df_num.loc[X_df_num[col] < -1, col] = -1
    # for values > 10, set to 10
    X_df_num.loc[X_df_num[col] > 10, col] = 10

# create a descriptive stats dataframe I will reference in the next step
df_stats = X_df_num.drop(columns=['customer_ID']).describe().T

# find the name of variables where the min is > 0
fill_with_zero = df_stats[df_stats['min'] > 0].index
# find the name of variables where the min is <= 0
fill_with_neg2 = df_stats[df_stats['min'] <= 0].index

# for those vars where min > 0, set nulls to 0
for col in fill_with_zero:
    X_df_num.loc[X_df_num[col].isnull(), col] = 0

# for those vars where min <= 0, set nulls to -2 (the min will not be < -1)
for col in fill_with_neg2:
    X_df_num.loc[X_df_num[col].isnull(), col] = -2

So X_df_cat, X_df_num, and missing_vals_df are all free of nulls now. 

Next, get number of negative values for each column, as that seems to be an indicator for many variables. 

In [None]:
# create dataframe with the number of records with a negative value for each variable for each customer. 
neg_df = get_negative_count(X_df_num)

In [None]:
# create dataframe with the number of records of 0 value of each variable for each customer. 
zero_df = get_zeros(X_df_num)

In [None]:
# create a dataframe with the percent of records that have a value of 0 for each variable for each customer. 
zero_pct_df = get_zeros_pct(X_df_num)

In [None]:
# create a dataframe with the percent of records that have a negative value for each variable for each customer. 
neg_pct_df = get_negative_pct(X_df_num)

Now, I will concatenate the X_df_cat and X_df_num so that we can then aggregate by grouping by customer_ID. 

In [None]:
X_df = pd.concat([X_df_cat, X_df_num.drop(columns=['customer_ID'])], axis=1)

In [None]:
X_df.head()

In [None]:
# create dataframe with the last value, standard deviation, min and max of each variable for each customer. 
agg_df = X_df.groupby('customer_ID').agg(['last', 'median', 'mean', 'std', 'min', 'max'])
agg_df = collapse_columns(agg_df)

In [None]:
# create dataframe with the difference between the last value and the value 2 months prior for each variable for each customer. 
delta_df = get_delta_values(X_df)

In [None]:
# create dataframe with the exponential moving average, with a weight of .8, for each variable for each customer. 
ema_df = get_ema(X_df)

In [None]:
# aggregate all of the above dataframes into a single 'metrics' dataframe
metrics_df = pd.concat([agg_df, missing_vals_df, zero_df, zero_pct_df, neg_pct_df, neg_df, delta_df, ema_df],axis=1)

In [None]:
# create a new feature, pctb, which is the percent b value for each variable for each customer. 
metrics_df = get_pctb(X_df, metrics_df)

In [None]:
# create a new feature, range, which is the max - min for each variable for each customer. 
metrics_df = get_range(X_df, metrics_df)

In [None]:
# create a new feature, cv, which represents the coefficient of variation (std/mean) for each variable for each customer. 
metrics_df = get_cv(X_df, metrics_df)

In [None]:
# drop the _min and _std columns. The info for these is captured in _range and _cv
cols_to_drop = metrics_df.filter(regex='(_std)$', axis=1).columns
metrics_df = metrics_df.drop(columns=cols_to_drop)

In [None]:
# keep all columns for now
cols_to_keep = num_columns[1:] + list(X_df_cat.columns[1:])

In [None]:
for col in cols_to_keep:
    # where ema is null, replace it with the last value
    metrics_df.loc[metrics_df[col+'_ema'].isnull(), col+'_ema'] = metrics_df.loc[:,col+'_last']
    # where pct b is null, replace it with .5, the midpoint
    metrics_df.loc[metrics_df[col+'_pctb'].isnull(), col+'_pctb'] = .5
    # fill null cv records with 0
    metrics_df[col+'_cv'].fillna(value=0, inplace=True)
    # fill null values of difference with 0, this happens when there are not at least 2 months. 
    metrics_df[col+'_diff'].fillna(value=0, inplace=True)
    # fill null diff_mean values with 0
    metrics_df[col+'_diff_mean'].fillna(value=0, inplace=True)

In [None]:
# drop those columns where > 90% of rows are missing values
# missing_counts_df = pd.DataFrame({'missing_count': metrics_df.isnull().sum(), 'missing_pct': metrics_df.isnull().sum()/len(metrics_df)})

In [None]:
# cols_to_drop = missing_counts_df[missing_counts_df.missing_pct > .90].index
# features_df = metrics_df.drop(columns=cols_to_drop)

Missing values that are returned after the creation of new features are due to the following:

1. _diff, _ema, _cv, _%b when the customer only has one month of data. 
2. _diff_mean when the customer only has two months of data. 

I will replace missing values with the following: 

1. Fill cv with 0
2. Fill %b with .5 because that is the value when the last value is equal to the mean. 
3. Fill diff with 0
4. Fill diff_mean with 0
5. Fill ema with last value

In [None]:
metrics_df.head()

In [None]:
# missing values complete!
metrics_df.isnull().sum().sum()

In [None]:
metrics_df = metrics_df.reset_index()

In [None]:
metrics_df = metrics_df.rename(columns={'index': 'customer_ID'})

In [None]:
X_train, y_train, X_validate, y_validate, X_test, y_test = wrangle.split_amex(metrics_df, 
                                                                              y_df, 
                                                                              train_size=.5, 
                                                                              test_size=.5)

In [None]:
train = X_train.merge(y_train, how='left', on='customer_ID')
validate = X_validate.merge(y_validate, how='left', on='customer_ID')
test = X_test.merge(y_test, how='left', on='customer_ID')

In [None]:
for col in train.columns:
    print(col)
    print(train[col].describe()[-1])

In [None]:
for col in train.drop(columns=['customer_ID']).columns:
    # fill inf values of _cv 
    train.loc[np.isinf(np.array(train[col])), col] = 0
    validate.loc[np.isinf(np.array(validate[col])), col] = 0
    test.loc[np.isinf(np.array(test[col])), col] = 0

In [None]:
train_matrix = xgb.DMatrix(train.drop(columns=['customer_ID', 'target']), label=train.target)
valid_matrix = xgb.DMatrix(validate.drop(columns=['customer_ID', 'target']), label=validate.target)

In [None]:
steps = 1000
seed = 42

params = {
    'verbosity': 1,
    'max_depth': 3,
    'objective': 'binary:logistic',
    'eta': 0.075,
    'random_state': seed,
    'colsample_bytree': 0.8,
    'colsample_bylevel': 0.8, 
    'subsample': 0.8
}

In [None]:
model = xgb.train(params, train_matrix, steps, early_stopping_rounds=10,
                  evals=[(train_matrix, 'Train'), (valid_matrix, 'Valid')])

Run line by line, testing

In [None]:
X_df

In [None]:
features_df.to_csv('features_2.csv')

Flatten the time series data. 

For each variable, we need to create the following:



Explore the different columns, datatypes, descriptive stats

For reference: 
* D_* = Delinquency variables
* S_* = Spend variables
* P_* = Payment variables
* B_* = Balance variables
* R_* = Risk variables

In [None]:
spend = X_df.iloc[:,X_df.columns.str[0] == 'S']
delinq = X_df.iloc[:,X_df.columns.str[0] == 'D']
pay = X_df.iloc[:,X_df.columns.str[0] == 'P']
balance = X_df.iloc[:,X_df.columns.str[0] == 'B']
risk = X_df.iloc[:,X_df.columns.str[0] == 'R']

**Spend variables**

- 22 total columns

- S_2: date *needs to be converted* **done**

- All others: float

- S_2, S_5, S_6, S_8, S_11:S_13, S_15:S_20 : no missing values

- S_22:S_26 : missing < 1% of values

- S_3, S_7, S_27 : missing 1-25% of values

- S_9, S_27 : missing 25-75% of values

In [None]:
spend.info()

**Delinquency Variables**

- 96 total columns

- D_63: Object

- D_64: Object

- All others: float

- D_39, D_47, D_51, D_58, D_60, D_63, D_65, D_71, D_75, D_86, D_92, D_93, D_94, D_96, D_127 : no missing values

- D_42, D_49, D_66, D_73, D_76, D_87, D_88, D_106, D_108, D_110, D_111, D_132, D_134:D_138, D_142 : missing > 75% of values.

- D_41, D_44:D_46, D_48, D_52, D_54:D_55, D_59, D_61, D_62, D_64, D_68:D_70, D_72, D_74, D_78:D_81, D_83, D_84, D_89, D_91, D_102:D_104, D_107, D_109, D_112:D_126, D_128:D_131, D_133, D_139:D_145: missing < 25%

- D_43, D_50, D_53 D_56, D_77, D_82, D_105 : 25-75% missing



In [None]:
delinq.D_63.value_counts()

In [None]:
delinq.D_64.value_counts()

In [None]:
delinq.info()

**Payment Variables**

- 3 total columns (P_2, P_3, P_4)

- all: float

- P_4 : no missing values

- P_2 & P_3 : missing < 1%

In [None]:
pay.info()

In [None]:
pay.describe()

**Balance Variables**

- 40 variables

- B_31: int (0, 1)

- all others: float

- B_29, B_39, and B_42 are majority null

- B_17 is missing 

- B_1, B_4, B_5, B_7, B_9, B_10, B_11, B_12, B_14, B_18, B_21, B_23, B_24, B_28, B_31, B_32, B_36 have no missing values. 

- B_2, B_3, B_6, B_8, B_13, B_15, B_16, B_19, B_20, B_25, B_26, B_27, B_30, B_33, B_37, B_38, B_40, B_41 are missing < 1% 


In [None]:
balance.B_31.value_counts()

In [None]:
balance.info()

In [None]:
balance.describe().T

**Risk Variables**

- 28 Columns

- All: float

- R_9, R_26: missing > 90% of values. 

- R_12, R_20, and R_27 are missing < 1%

- R_1:R_8, R_10:R_11, R13:R19, R21:R26, R28 :  no missing values

In [None]:
risk.info()

In [None]:
# generate lists of column names by datatype for future use in analysis
object_cols = ['D_63', 'D_64']
int_cols = ['B_31']
date_cols = ['S_2']

# list of non_float columns in order to generate a list of all float column names (186 columns)
non_float_cols = object_cols + int_cols + date_cols
float_cols = [col for col in X_df.columns if col not in non_float_cols]
len(float_cols)

In [None]:
with pd.option_context('display.max_rows', None,):
    print(null_df.sort_values('total_nulls'))

In [None]:
null_df.groupby('feature_category').percent_nulls.agg(['mean', 'median', 'max', 'min']).sort_values('mean', ascending=False)

In [None]:
y_df.target.value_counts(normalize=True)