Acquire

In [1]:
import pandas as pd

from scipy import stats

import seaborn as sns

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

In [2]:
X_df = pd.read_csv('../../data/raw/train_data.csv', nrows=200000)
y_df = pd.read_csv('../../data/raw/train_labels.csv')
y_df.target.value_counts(normalize=True)

In [63]:
# convert S_2 to datetime
X_df['S_2'] = pd.to_datetime(X_df.S_2)

# generate lists of column names by datatype for future use in analysis
object_cols = ['D_63', 'D_64']
int_cols = ['B_31']
date_cols = ['S_2']

# list of non_float columns in order to generate a list of all float column names (186 columns)
non_float_cols = object_cols + int_cols + date_cols
float_cols = [col for col in X_df.columns if col not in non_float_cols]
len(float_cols)

# For columns D_63, D_64 and B_31, we will want to create dummy variables. 
X_df = pd.get_dummies(X_df, columns=['D_63', 'D_64', 'B_31'], drop_first=True)

186

Flatten the time series data. 

For each variable, we need to create the following:



In [73]:
X_df[X_df.customer_ID == X_df.customer_ID[0]]


Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_145,D_63_CO,D_63_CR,D_63_XL,D_63_XM,D_63_XZ,D_64_O,D_64_R,D_64_U,B_31_1
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-09,0.938469,0.001733,0.008724,1.006838,0.009228,0.124035,0.008771,0.004709,...,0.002674,0,1,0,0,0,1,0,0,1
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-07,0.936665,0.005775,0.004923,1.000653,0.006151,0.12675,0.000798,0.002714,...,0.009217,0,1,0,0,0,1,0,0,1
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-05-28,0.95418,0.091505,0.021655,1.009672,0.006815,0.123977,0.007598,0.009423,...,0.002603,0,1,0,0,0,1,0,0,1
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-06-13,0.960384,0.002455,0.013683,1.0027,0.001373,0.117169,0.000685,0.005531,...,0.0096,0,1,0,0,0,1,0,0,1
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-07-16,0.947248,0.002483,0.015193,1.000727,0.007605,0.117325,0.004653,0.009312,...,0.009827,0,1,0,0,0,1,0,0,1
5,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-08-04,0.945964,0.001746,0.007863,1.005006,0.00422,0.110946,0.009857,0.009866,...,0.002884,0,1,0,0,0,1,0,0,1
6,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-09-18,0.940705,0.002183,0.018859,1.008024,0.004509,0.103329,0.006603,0.000783,...,0.002225,0,1,0,0,0,1,0,0,1
7,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-10-08,0.914767,0.003029,0.014324,1.000242,0.000263,0.108115,0.009527,0.007836,...,0.007385,0,1,0,0,0,1,0,0,1
8,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-11-20,0.950845,0.009896,0.016888,1.003995,0.001789,0.102792,0.002519,0.009817,...,0.000995,0,1,0,0,0,1,0,0,1
9,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-12-04,0.86858,0.001082,0.00193,1.007504,0.001772,0.10047,0.004626,0.006073,...,0.009068,0,1,0,0,0,1,0,0,1


Explore the different columns, datatypes, descriptive stats

For reference: 
* D_* = Delinquency variables
* S_* = Spend variables
* P_* = Payment variables
* B_* = Balance variables
* R_* = Risk variables

In [9]:
spend = X_df.iloc[:,X_df.columns.str[0] == 'S']
delinq = X_df.iloc[:,X_df.columns.str[0] == 'D']
pay = X_df.iloc[:,X_df.columns.str[0] == 'P']
balance = X_df.iloc[:,X_df.columns.str[0] == 'B']
risk = X_df.iloc[:,X_df.columns.str[0] == 'R']

**Spend variables**

- 22 total columns

- S_2: date *needs to be converted* **done**

- All others: float

- S_2, S_5, S_6, S_8, S_11:S_13, S_15:S_20 : no missing values

- S_22:S_26 : missing < 1% of values

- S_3, S_7, S_27 : missing 1-25% of values

- S_9, S_27 : missing 25-75% of values

In [12]:
spend.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 22 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   S_2     200000 non-null  object 
 1   S_3     162335 non-null  float64
 2   S_5     200000 non-null  float64
 3   S_6     200000 non-null  float64
 4   S_7     162335 non-null  float64
 5   S_8     200000 non-null  float64
 6   S_9     93402 non-null   float64
 7   S_11    200000 non-null  float64
 8   S_12    200000 non-null  float64
 9   S_13    200000 non-null  float64
 10  S_15    200000 non-null  float64
 11  S_16    200000 non-null  float64
 12  S_17    200000 non-null  float64
 13  S_18    200000 non-null  float64
 14  S_19    200000 non-null  float64
 15  S_20    200000 non-null  float64
 16  S_22    199313 non-null  float64
 17  S_23    199991 non-null  float64
 18  S_24    199330 non-null  float64
 19  S_25    199516 non-null  float64
 20  S_26    199966 non-null  float64
 21  S_27    14

**Delinquency Variables**

- 96 total columns

- D_63: Object

- D_64: Object

- All others: float

- D_39, D_47, D_51, D_58, D_60, D_63, D_65, D_71, D_75, D_86, D_92, D_93, D_94, D_96, D_127 : no missing values

- D_42, D_49, D_66, D_73, D_76, D_87, D_88, D_106, D_108, D_110, D_111, D_132, D_134:D_138, D_142 : missing > 75% of values.

- D_41, D_44:D_46, D_48, D_52, D_54:D_55, D_59, D_61, D_62, D_64, D_68:D_70, D_72, D_74, D_78:D_81, D_83, D_84, D_89, D_91, D_102:D_104, D_107, D_109, D_112:D_126, D_128:D_131, D_133, D_139:D_145: missing < 25%

- D_43, D_50, D_53 D_56, D_77, D_82, D_105 : 25-75% missing



In [31]:
delinq.D_63.value_counts()

CO    148773
CR     33751
CL     15991
XZ       925
XM       309
XL       251
Name: D_63, dtype: int64

In [32]:
delinq.D_64.value_counts()

O     105870
U      55177
R      29601
-1      1312
Name: D_64, dtype: int64

In [13]:
delinq.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 96 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   D_39    200000 non-null  float64
 1   D_41    199928 non-null  float64
 2   D_42    29193 non-null   float64
 3   D_43    139942 non-null  float64
 4   D_44    190071 non-null  float64
 5   D_45    199928 non-null  float64
 6   D_46    156105 non-null  float64
 7   D_47    200000 non-null  float64
 8   D_48    173989 non-null  float64
 9   D_49    20462 non-null   float64
 10  D_50    86464 non-null   float64
 11  D_51    200000 non-null  float64
 12  D_52    198991 non-null  float64
 13  D_53    52597 non-null   float64
 14  D_54    199928 non-null  float64
 15  D_55    193476 non-null  float64
 16  D_56    91119 non-null   float64
 17  D_58    200000 non-null  float64
 18  D_59    196233 non-null  float64
 19  D_60    200000 non-null  float64
 20  D_61    178557 non-null  float64
 21  D_62    17

**Payment Variables**

- 3 total columns (P_2, P_3, P_4)

- all: float

- P_4 : no missing values

- P_2 & P_3 : missing < 1%

In [16]:
pay.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   P_2     198458 non-null  float64
 1   P_3     189113 non-null  float64
 2   P_4     200000 non-null  float64
dtypes: float64(3)
memory usage: 4.6 MB


In [17]:
pay.describe()

Unnamed: 0,P_2,P_3,P_4
count,198458.0,189113.0,200000.0
mean,0.653527,0.600592,0.1466027
std,0.246135,0.171354,0.3409799
min,-0.383019,-1.055714,6.66793e-08
25%,0.476334,0.539831,0.002925073
50%,0.691541,0.618742,0.005853182
75%,0.863455,0.68402,0.008792215
max,1.009998,2.027742,1.198705


**Balance Variables**

- 40 variables

- B_31: int (0, 1)

- all others: float

- B_29, B_39, and B_42 are majority null

- B_17 is missing 

- B_1, B_4, B_5, B_7, B_9, B_10, B_11, B_12, B_14, B_18, B_21, B_23, B_24, B_28, B_31, B_32, B_36 have no missing values. 

- B_2, B_3, B_6, B_8, B_13, B_15, B_16, B_19, B_20, B_25, B_26, B_27, B_30, B_33, B_37, B_38, B_40, B_41 are missing < 1% 


In [33]:
balance.B_31.value_counts()

1    199407
0       593
Name: B_31, dtype: int64

In [18]:
balance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 40 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   B_1     200000 non-null  float64
 1   B_2     199928 non-null  float64
 2   B_3     199928 non-null  float64
 3   B_4     200000 non-null  float64
 4   B_5     200000 non-null  float64
 5   B_6     199993 non-null  float64
 6   B_7     200000 non-null  float64
 7   B_8     199152 non-null  float64
 8   B_9     200000 non-null  float64
 9   B_10    200000 non-null  float64
 10  B_11    200000 non-null  float64
 11  B_12    200000 non-null  float64
 12  B_13    198216 non-null  float64
 13  B_14    200000 non-null  float64
 14  B_15    199794 non-null  float64
 15  B_16    199928 non-null  float64
 16  B_17    87541 non-null   float64
 17  B_18    200000 non-null  float64
 18  B_19    199928 non-null  float64
 19  B_20    199928 non-null  float64
 20  B_21    200000 non-null  float64
 21  B_22    19

In [None]:
balance.describe().T

**Risk Variables**

- 28 Columns

- All: float

- R_9, R_26: missing > 90% of values. 

- R_12, R_20, and R_27 are missing < 1%

- R_1:R_8, R_10:R_11, R13:R19, R21:R26, R28 :  no missing values

In [21]:
risk.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 28 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   R_1     200000 non-null  float64
 1   R_2     200000 non-null  float64
 2   R_3     200000 non-null  float64
 3   R_4     200000 non-null  float64
 4   R_5     200000 non-null  float64
 5   R_6     200000 non-null  float64
 6   R_7     200000 non-null  float64
 7   R_8     200000 non-null  float64
 8   R_9     11895 non-null   float64
 9   R_10    200000 non-null  float64
 10  R_11    200000 non-null  float64
 11  R_12    199998 non-null  float64
 12  R_13    200000 non-null  float64
 13  R_14    200000 non-null  float64
 14  R_15    200000 non-null  float64
 15  R_16    200000 non-null  float64
 16  R_17    200000 non-null  float64
 17  R_18    200000 non-null  float64
 18  R_19    200000 non-null  float64
 19  R_20    199994 non-null  float64
 20  R_21    200000 non-null  float64
 21  R_22    20

In [None]:
with pd.option_context('display.max_rows', None,):
    print(null_df.sort_values('total_nulls'))

In [None]:
null_df.groupby('feature_category').percent_nulls.agg(['mean', 'median', 'max', 'min']).sort_values('mean', ascending=False)

In [None]:
delinq = X_df.iloc[:,X_df.columns.str[0] == 'D']
pay = X_df.iloc[:,X_df.columns.str[0] == 'P']
balance = X_df.iloc[:,X_df.columns.str[0] == 'B']
risk = X_df.iloc[:,X_df.columns.str[0] == 'R']

In [None]:
X_df.head()

In [None]:
y_train, y_validate_test = train_test_split(y_df, train_size=0.20, random_state=13)
y_validate, y_test = train_test_split(y_validate_test, test_size=0.49, random_state=13)

print('Train: %d rows, %d cols' % y_train.shape)
print('Validate: %d rows, %d cols' % y_validate.shape)
print('Test: %d rows, %d cols' % y_validate.shape)

In [None]:
X_train = X_df[X_df.customer_ID.isin(y_train.customer_ID.unique())]
X_validate = X_df[X_df.customer_ID.isin(y_validate.customer_ID.unique())]
X_test = X_df[X_df.customer_ID.isin(y_test.customer_ID.unique())]

print('Train: %d rows, %d cols' % X_train.shape)
print('Validate: %d rows, %d cols' % X_validate.shape)
print('Test: %d rows, %d cols' % X_test.shape)

In [None]:
y_df.target.value_counts(normalize=True)

In [None]:
y_train.target.value_counts(normalize=True)