In [1]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 

# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder

# File system management
import os

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.preprocessing import MinMaxScaler, Imputer
from sklearn.linear_model import LogisticRegression
import sys
sys.path.insert(0, '../scripts/')
import utils, pipeline_utils

IndentationError: unindent does not match any outer indentation level (pipeline_utils.py, line 86)

In [None]:
# Input locations, one (raw CSV, pickle cache) pair per table.
application_path, application_path_pkl = (
    '../data/application_train.csv',
    '../data/application.pkl',
)
bureau_path, bureau_path_pkl = (
    '../data/bureau.csv',
    '../data/bureau.pkl',
)
bureau_balance_path, bureau_balance_path_pkl = (
    '../data/bureau_balance.csv',
    '../data/bureau_balance.pkl',
)

In [None]:
# Load the application table from its CSV path.
# NOTE(review): `load_data_csv` is called unqualified, but the import cell only
# runs `import utils, pipeline_utils` -- presumably this needs a `utils.` prefix
# or a from-import; confirm against ../scripts/.
application_data = load_data_csv(application_path)

In [None]:
# Write the loaded application frame to its pickle path.
application_data.to_pickle(application_path_pkl)

In [None]:
# Rebind application_data to the frame stored at the pickle path.
# Only unpickle files you produced yourself: loading a pickle can execute code.
application_data = pd.read_pickle(application_path_pkl)

In [None]:
# Load the bureau table from its CSV path.
# NOTE(review): `load_data_csv` is not imported by name in the import cell --
# presumably it lives in `utils`; confirm.
bureau_data = load_data_csv(bureau_path)

In [None]:
# Load the bureau_balance table from its CSV path.
# NOTE(review): `load_data_csv` is not imported by name in the import cell --
# presumably it lives in `utils`; confirm.
bureau_balance_data = load_data_csv(bureau_balance_path)

### Application Data Analysis

In [None]:
# Preview the first rows of the application frame.
application_data.head()

In [None]:
# List the column names of the application frame.
application_data.columns

### Target

- 1 - Client with payment difficulties: he/she had late payment more than X days on at least one of the first Y installments of the loan in our sample (will have difficulty repaying loan)
- 0 - All other cases (will repay loan on time)

In [None]:
# Class balance: number of rows for each TARGET value.
application_data.TARGET.value_counts()

In [None]:
# (rows, columns) of the application frame.
application_data.shape

In [None]:
# Summary statistics of the application frame as reported by describe().
application_data.describe()

### Feature - Target Split

In [None]:
# Separate the application frame into features X and target y.
# NOTE(review): `extract_X_y` is not defined or imported by name in the visible
# cells -- presumably a helper from `utils`; confirm what it drops from X.
X, y = extract_X_y(application_data)

### Train - Test Split

In [None]:
# Split features and target into training and validation parts.
# NOTE(review): `splitting` is not defined in the visible cells; val_split=0.2
# presumably holds out 20% of the rows -- confirm, and check that it fixes a
# random seed so the split is reproducible.
X_train, X_val, y_train, y_val = splitting(X, y, val_split=0.2)

In [None]:
# Shapes of the training features and training target after the split.
X_train.shape, y_train.shape

### Numerical - Categorical features

In [None]:
# Columns with a numeric dtype.
numeric_columns = X_train.select_dtypes(include='number').columns
# Every remaining column, kept in the frame's own column order. A set
# difference would return them in an arbitrary, hash-dependent order that can
# change between kernel restarts.
categorical_columns = [col for col in X_train.columns if col not in numeric_columns]

### Pipeline Implementation

In [None]:
# Plain Python list of every training feature name, in column order.
x_cols = X_train.columns.values.tolist()

In [None]:
# Numeric branch: keep numeric columns, fill gaps with the median, standardize.
numeric_branch = make_pipeline(
    TypeSelector(np.number),
    Imputer(strategy="median"),
    StandardScaler(),
)

# Categorical branch: keep object columns and turn them into dummy columns.
categorical_branch = make_pipeline(
    TypeSelector("object"),
    ToDummiesTransformer(),
)

# Select the feature columns, then run both branches and concatenate their
# outputs side by side.
feature_union = FeatureUnion(transformer_list=[
    ("numeric_features", numeric_branch),
    ("categorical_features", categorical_branch),
])
preprocess_pipeline = make_pipeline(ColumnSelector(columns=x_cols), feature_union)

In [None]:
# Logistic regression with a fixed seed, chained after the preprocessing steps
# so that fit/predict run both stages in one call.
logistic_model = LogisticRegression(random_state=42)
classifier_pipeline = make_pipeline(preprocess_pipeline, logistic_model)

In [None]:
# Fit the preprocessing steps and the logistic regression on the training split.
classifier_pipeline.fit(X_train, y_train)

In [None]:
# Shapes of the training features and target (same check as after the split).
X_train.shape, y_train.shape

In [None]:
# Score the validation rows, keeping column 1 of predict_proba (the probability
# of the second entry in the classifier's classes_ -- presumably TARGET == 1;
# confirm).
y_pred = classifier_pipeline.predict_proba(X_val)[:, 1]
# ROC AUC of those scores against the validation labels (shown as cell output).
roc_auc_score(y_val, y_pred)

In [None]:
# Load the test applications.
# NOTE(review): this path is hard-coded here instead of being defined next to
# the other data paths.
application_data_test = pd.read_csv('../data/application_test.csv')
# Scoring the test set is currently disabled:
# y_pred_test = classifier_pipeline.predict_proba(application_data_test)[:, 1]

In [None]:
# Names of the training columns stored with object dtype.
is_object = X_train.dtypes == object
object_columns = X_train.columns[is_object].tolist()

# Same columns from train and test, with missing values replaced by 'NAN'.
train_object_values = X_train[object_columns].fillna('NAN')
test_object_values = application_data_test[object_columns].fillna('NAN')

# union1d flattens both inputs and returns their sorted unique values, i.e.
# every distinct value seen in any object column of either frame.
object_levels = np.union1d(train_object_values, test_object_values)

In [None]:
# Compare the shapes of the test frame and the training features.
application_data_test.shape, X_train.shape

### Manual Feature Engineering

In [None]:
# Preview the first rows of the bureau frame.
bureau_data.head()

### Create new names for each of these columns

In [None]:
def agg_numeric(df, group_var, df_name):
    """Aggregates the numeric values in a dataframe. This can
    be used to create features for each instance of the grouping variable.
    
    Parameters
    --------
        df (dataframe): 
            the dataframe to calculate the statistics on (not modified)
        group_var (string): 
            the variable by which to group df
        df_name (string): 
            the variable used to rename the columns
        
    Return
    --------
        agg (dataframe): 
            a dataframe with one row per unique value of `group_var`. The
            first column is `group_var`; every other column is named
            `<df_name>_<column>_<stat>` for each numeric column of `df` and
            each stat in (count, mean, max, min, sum). Columns other than
            `group_var` whose name contains 'SK_ID' are dropped before
            aggregating.
    
    """
    stats = ['count', 'mean', 'max', 'min', 'sum']

    # Remove id variables other than grouping variable
    other_ids = [col for col in df.columns if col != group_var and 'SK_ID' in col]
    features = df.drop(columns=other_ids)

    # Copy before adding the grouping column so the assignment never targets
    # a view of the caller's frame. The grouping column is re-attached
    # explicitly because it may not be numeric itself.
    numeric_df = features.select_dtypes('number').copy()
    numeric_df[group_var] = features[group_var]

    # Group by the specified variable and calculate the statistics
    agg = numeric_df.groupby(group_var).agg(stats)

    # Build the flat names from the actual (column, stat) pairs. Rebuilding
    # them from MultiIndex.levels relies on the level order matching the
    # column order, which pandas does not guarantee.
    agg.columns = ['%s_%s_%s' % (df_name, var, stat) for var, stat in agg.columns]

    return agg.reset_index()

### Function to Handle Categorical Variables

In [None]:
def count_categorical(df, group_var, df_name):
    """Computes counts and normalized counts for each observation
    of `group_var` of each unique category in every categorical variable
    
    Parameters
    --------
    df : dataframe 
        The dataframe to calculate the value counts for.
        
    group_var : string
        The variable by which to group the dataframe. For each unique
        value of this variable, the final dataframe will have one row.
        It is never treated as a categorical feature itself, even when it
        has object dtype.
        
    df_name : string
        Variable added to the front of column names to keep track of columns

    
    Return
    --------
    categorical : dataframe
        A dataframe indexed by `group_var` with, for every dummy column
        `<variable>_<category>` of the object-dtype columns, two columns:
        `<df_name>_<variable>_<category>_count` (sum of the indicator) and
        `<df_name>_<variable>_<category>_count_norm` (mean of the indicator).
        
    """
    # Output suffix for each aggregation applied to the 0/1 indicators
    stat_labels = {'sum': 'count', 'mean': 'count_norm'}

    # Select the categorical columns. The grouping key is excluded so that an
    # object-dtype key is not one-hot encoded as if it were a feature.
    object_features = df.select_dtypes('object').drop(columns=[group_var], errors='ignore')
    categorical = pd.get_dummies(object_features)

    # Make sure to put the identifying id on the column
    categorical[group_var] = df[group_var]

    # Groupby the group var and calculate the sum and mean
    categorical = categorical.groupby(group_var).agg(list(stat_labels))

    # Name each column from its actual (dummy column, stat) pair instead of
    # assuming the order of MultiIndex.levels matches the column order.
    categorical.columns = [
        '%s_%s_%s' % (df_name, var, stat_labels[stat])
        for var, stat in categorical.columns
    ]

    return categorical

To recap, for the `bureau_balance` dataframe we:

1. Calculated numeric stats grouping by each loan
2. Made value counts of each categorical variable grouping by loan
3. Merged the stats and the value counts on the loans
4. Calculated numeric stats for the resulting dataframe grouping by the client id

The final resulting dataframe has one row for each client, with statistics calculated for all of their loans with monthly balance information. 

Some of these variables are a little confusing, so let's try to explain a few:

* `client_bureau_balance_MONTHS_BALANCE_mean_mean`: For each loan calculate the mean value of `MONTHS_BALANCE`. Then for each client, calculate the mean of this value for all of their loans. 
* `client_bureau_balance_STATUS_X_count_norm_sum`: For each loan, calculate the number of occurrences of `STATUS` == X divided by the number of total `STATUS` values for the loan. Then, for each client, add up the values for each loan. 

## Putting the functions together

### Counts of Bureau Dataframe

In [None]:
# Per client (SK_ID_CURR): count and normalized count of every category found
# in the object-dtype columns of the bureau frame.
bureau_counts = count_categorical(bureau_data, group_var = 'SK_ID_CURR', df_name = 'bureau')
bureau_counts.head()

### Aggregated Stats of Bureau Dataframe

In [None]:
# Per client (SK_ID_CURR): count/mean/max/min/sum of every numeric bureau column.
# The explicit drop of SK_ID_BUREAU duplicates what agg_numeric already does for
# any non-grouping column whose name contains 'SK_ID'.
bureau_agg = agg_numeric(bureau_data.drop(columns = ['SK_ID_BUREAU']), group_var = 'SK_ID_CURR', df_name = 'bureau')
bureau_agg.head()

### Value counts of Bureau Balance dataframe by loan

In [None]:
# Per loan (SK_ID_BUREAU): count and normalized count of every category found
# in the object-dtype columns of the bureau_balance frame.
bureau_balance_counts = count_categorical(bureau_balance_data, group_var = 'SK_ID_BUREAU', df_name = 'bureau_balance')
bureau_balance_counts.head()

### Aggregated stats of Bureau Balance dataframe by loan

In [None]:
# Per loan (SK_ID_BUREAU): count/mean/max/min/sum of every numeric
# bureau_balance column.
bureau_balance_agg = agg_numeric(bureau_balance_data, group_var = 'SK_ID_BUREAU', df_name = 'bureau_balance')
bureau_balance_agg.head()

### Aggregated Stats of Bureau Balance by Client

In [None]:
# Dataframe grouped by the loan
bureau_by_loan = bureau_balance_agg.merge(bureau_balance_counts, right_index = True, left_on = 'SK_ID_BUREAU', how = 'outer')

# Merge to include the SK_ID_CURR
bureau_by_loan = bureau_data[['SK_ID_BUREAU', 'SK_ID_CURR']].merge(bureau_by_loan, on = 'SK_ID_BUREAU', how = 'left')

# Aggregate the stats for each client
bureau_balance_by_client = agg_numeric(bureau_by_loan.drop(columns = ['SK_ID_BUREAU']), group_var = 'SK_ID_CURR', df_name = 'client')

## Insert Computed Features into Training Data

In [None]:
# Snapshot of the application column names before any features are merged in.
original_features = list(application_data.columns)
# `train` is a second name for the same DataFrame object; no copy is made here.
train = application_data
print('Original Number of Features: ', len(original_features))

In [None]:
# Merge with the value counts of bureau
train = train.merge(bureau_counts, on = 'SK_ID_CURR', how = 'left')

# Merge with the stats of bureau
train = train.merge(bureau_agg, on = 'SK_ID_CURR', how = 'left')

# Merge with the monthly information grouped by client
train = train.merge(bureau_balance_by_client, on = 'SK_ID_CURR', how = 'left')

In [None]:
# Names of the columns selected for removal, as a flat list of strings.
columns_to_delete = []
# extend (not append) so each column name becomes its own list element;
# append would store the whole array as one nested item.
# NOTE(review): 96:116 is a positional slice of the application columns --
# confirm it still matches the intended columns if the column order changes.
columns_to_delete.extend(application_data.columns.values[96:116])

In [None]:
# Show the block of columns from APARTMENTS_AVG through EMERGENCYSTATE_MODE.
# The leading ':' keeps all rows; without it .loc would treat the labels as
# row-index labels rather than column names.
application_data.loc[:, 'APARTMENTS_AVG':'EMERGENCYSTATE_MODE']