In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
%matplotlib inline

# Pre-processing

In [None]:
# Load the raw training and test splits; paths are relative to the notebook's working directory
train_data_df = pd.read_csv(filepath_or_buffer='../data/train.csv')
test_data_df = pd.read_csv(filepath_or_buffer='../data/test.csv')

### Print DataFrame head

In [None]:
# Preview the first rows of the training set as a quick sanity check of the load
train_data_df.head()

In [None]:
# Preview the first rows of the test set as a quick sanity check of the load
test_data_df.head()

## Original data shape

In [None]:
# (rows, columns) of the training set as loaded; displayed only, not stored in a variable
train_data_df.shape

In [None]:
# (rows, columns) of the test set as loaded; displayed only, not stored in a variable
test_data_df.shape

## Column Names

In [None]:
# Column labels of the training set
train_data_df.columns

## Drop duplicates

In [None]:
# Remove fully duplicated rows from each split, keeping the first occurrence; frames are mutated in place
train_data_df.drop_duplicates(keep='first', inplace=True)
test_data_df.drop_duplicates(keep='first', inplace=True)

## Replace missing values: `-1` with `np.nan`

In [None]:
# -1 is the missing-value sentinel in this data; swap it for NaN so pandas' null handling sees it
train_data_df.replace(to_replace=-1, value=np.nan, inplace=True)
test_data_df.replace(to_replace=-1, value=np.nan, inplace=True)

## Initial count of missing values by column (or feature)

In [None]:
# Per-column count of missing (NaN) entries in the training set
train_data_df.isna().sum()

In [None]:
# Per-column count of missing (NaN) entries in the test set
test_data_df.isna().sum()

## Drop features in both training and testing sets

In [None]:
# Features removed from both splits
# NOTE(review): presumably chosen for their high missing-value counts; confirm against the counts above
drop_labels = ['ps_reg_03', 'ps_car_03_cat', 'ps_car_05_cat']
# `columns=` is the keyword spelling of `labels=..., axis=1`; both frames are mutated in place
train_data_df.drop(columns=drop_labels, inplace=True)
test_data_df.drop(columns=drop_labels, inplace=True)

## Visualize distribution of columns with missing values

In [None]:
# Histogram (20 bins) of the `ps_ind_05_cat` column in the training set
train_data_df['ps_ind_05_cat'].plot.hist(bins=20)
plt.show()

In [None]:
# IMPORTANT: verify this list against the missing-value counts of BOTH the training and test sets
# Columns that still need their missing values filled in
missing_labels = [
    # "ind" columns
    'ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat',
    # "car" categorical columns
    'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_07_cat', 'ps_car_09_cat',
    # "car" non-categorical columns
    'ps_car_11',
    'ps_car_12',  # only in training set
    'ps_car_14',
]

### Fill in missing values 

In [None]:
# Fill in missing values based on individual column(feature) distributions



### Final count of missing values by column (or feature)

In [None]:
# Every per-column count shown here should be zero once the training set has been filled in
train_data_df.isna().sum()

In [None]:
# Every per-column count shown here should be zero once the test set has been filled in
test_data_df.isna().sum()

### Drop ID column for training and test set

In [None]:
# Sum of the `target` column; equals the number of positive rows assuming target is 0/1 (TODO confirm)
count_target_ones = train_data_df['target'].sum()
count_target_ones

In [None]:
# Percentage of training rows whose target sums into `count_target_ones`.
# The row count is read straight from the DataFrame so it reflects the current
# (de-duplicated) frame and does not depend on a separately stored shape variable.
100 * count_target_ones / train_data_df.shape[0]

In [None]:
# Feature inventory, grouped by the naming convention of the column labels.
# The "- 2" presumably discounts the id and target columns; verify against the column list.
print('Total no. of features in training set: {}'.format(len(train_data_df.columns) - 2))

# Suffix-based groups: variable type
categorical_vars = [col for col in train_data_df.columns if col.endswith('cat')]
print('No. of categorical variables in training set: {}'.format(len(categorical_vars)))

binary_vars = [col for col in train_data_df.columns if col.endswith('bin')]
print('No. of binary variables in training set: {}'.format(len(binary_vars)))

# Substring-based groups: variable family
individual_vars = [col for col in train_data_df.columns if 'ind' in col]
print('No. of individual variables in training set: {}'.format(len(individual_vars)))

car_vars = [col for col in train_data_df.columns if 'car' in col]
print('No. of car variables in training set: {}'.format(len(car_vars)))

regional_vars = [col for col in train_data_df.columns if 'reg' in col]
print('No. of regional variables in training set: {}'.format(len(regional_vars)))

calc_vars = [col for col in train_data_df.columns if 'calc' in col]
print('No. of calculated variables in training set: {}'.format(len(calc_vars)))

In [None]:
# Per-column dtype and non-null count summary of the training set
train_data_df.info()

In [None]:
# Descriptive statistics for the training set columns
train_data_df.describe()

## PCA

### One hot encoding for categorical variables

### Feature scaling

## MODEL

In [None]:
# Binary classification - y_hat ={0;1}

### Set training targets and training data

In [None]:
# Model inputs are still unset placeholders. When filled in they are meant to be
# transposed to (no. of features x no. of observations):
#   train_y - use the 'target' column
#   train_x - omit the 'target' column
#   test_x  - must match the columns used for train_x
train_y = train_x = test_x = None

### Data Shapes

In [None]:
# Data previously transposed, so axis 0 holds features and axis 1 holds observations.
# NOTE: train_x / test_x / train_y must be real arrays by this point; `.shape` fails on None.
train_x_shape = train_x.shape
m_train = train_x_shape[1]
test_x_shape = test_x.shape
m_test = test_x_shape[1]
train_y_shape = train_y.shape

# Training and test data must expose the same number of features
assert train_x_shape[0] == test_x_shape[0]

print('Training data shape: {} features x {} observations'.format(*train_x_shape))
print('Test data shape: {} features x {} observations'.format(*test_x_shape))
print('No of training examples: {}'.format(m_train))
print('No of test examples: {}'.format(m_test))

### Randomize training data

### Define the neural network structure

In [None]:
# NOTE(review): no plotting call precedes this in the cell, so there may be nothing
# for the legend to label. TODO: confirm which figure this belongs to.
plt.legend()

### Initialize the model's parameters

### Fix Imbalanced dataset

# Tensorflow Implementation

### Hyperparameters

In [None]:
# Hyperparameters are all unset placeholders; assign real values before building or training the model.
# Optimisation loop settings
learning_rate = epochs = batch_size = None
# Dropout keep probability (presumably; confirm once the model is defined)
keep_probability = None
# Network size
n_hidden_layers = n_nodes = None
# Logging interval
display_step = None

### Reset graph

In [2]:
# Start from a fresh default graph, discarding anything built by earlier runs of the cells below
tf.reset_default_graph()

### Define placeholders

In [3]:
def inputs(x_input, y_labels, learn_rate):
    '''
    Create the graph placeholders for the features, labels and learning rate.

    Inputs:
    x_input, y_labels, learn_rate = ignored on entry; each name is rebound to a
        newly created placeholder (parameters kept so existing calls still work)

    Returns:
    (x_input, y_labels, learn_rate) = the three placeholders, in that order
    '''
    # Shapes are read from the module-level `train_x_shape` and `m_train`.
    # NOTE(review): the trailing `m_train` dimension fixes the number of training
    # observations into the feed shape. TODO: confirm this layout is intended.
    x_input = tf.placeholder(dtype=tf.float32,
                             shape=(None, train_x_shape[0], m_train),
                             name='x_input'
                            )
    y_labels = tf.placeholder(dtype=tf.float32,
                              shape=(None, 1, m_train),
                              name='y_labels'
                             )
    learn_rate = tf.placeholder(dtype=tf.float32,
                                shape=(1),
                                name='learn_rate')

    # Hand the placeholders back; without this they are unreachable from the caller
    return x_input, y_labels, learn_rate
                              
    

### Activation Functions

In [None]:
# Use leaky relu for intermediate layers
def leaky_relu(z, alpha):
    '''
    Leaky ReLU activation function
    Inputs:
    z = result of wa + b
    alpha = slope applied to z; the larger of alpha * z and z is kept

    Returns:
    a = elementwise maximum of alpha * z and z
    '''
    # The builtin max() has to reduce `alpha * z > z` to a single True/False, which
    # a tensor cannot provide; tf.maximum performs the comparison elementwise.
    a = tf.maximum(alpha * z, z)
    return a

# Use sigmoid for binary classification
def sigmoid(z):
    '''
    Sigmoid activation
    Inputs:
    z = value passed straight to tf.sigmoid

    Returns the result of tf.sigmoid(z)
    '''
    return tf.sigmoid(z)

### Model Architecture

### Model Predictions on Test Set

In [None]:
# Use saved model to generate predictions based on test data set


### Normalized Gini Coefficient Implementation

In [None]:
# Implement normalized Gini coefficient functions

# https://www.kaggle.com/c/ClaimPredictionChallenge/discussion/703
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    '''
    Un-normalized Gini coefficient of `pred` as a ranking of `actual`.

    actual: ground truth target values
    pred: our model predictions based on test set
    cmpcol, sortcol: unused; kept so existing calls keep working

    Returns a float. The result is nan when `actual` sums to zero, because the
    cumulative share is divided by the total of `actual`.
    Raises AssertionError if `actual` and `pred` differ in length.
    '''
    assert(len(actual) == len(pred))
    n_obs = len(actual)
    # Columns: 0 = actual, 1 = prediction, 2 = original position (tie-breaker).
    # np.float64 replaces the `np.float` alias, which newer NumPy no longer provides.
    table = np.asarray(np.c_[actual, pred, np.arange(n_obs)], dtype=np.float64)
    # lexsort treats the LAST key as primary: prediction descending, then original order
    order = np.lexsort((table[:, 2], -1 * table[:, 1]))
    table = table[order]
    total_losses = table[:, 0].sum()
    gini_sum = table[:, 0].cumsum().sum() / total_losses

    gini_sum -= (n_obs + 1) / 2.0
    return gini_sum / n_obs
 
def gini_normalized(actual, pred):
    '''
    Gini of the predictions divided by the Gini of a perfect ranking
    (the ground truth scored against itself).
    '''
    model_gini = gini(actual, pred)
    perfect_gini = gini(actual, actual)
    return model_gini / perfect_gini
 

### Calculate Normalized Gini Coefficient

### Write model predictions to submission file

In [None]:
# Read sample submission csv file
sample_submission_data_df = pd.read_csv('../results/sample_submission.csv')

print("Sample submission file: {} x {}".format(*sample_submission_data_df.shape))

# NOTE(review): `actual` and `pred` are not assigned in any cell shown in this notebook.
# Define them (equal-length ground-truth values and model scores) before running this cell.
gini_outputs = gini_normalized(actual, pred)
# gini_normalized returns a single number, so report it as a value rather than a shape
print('Normalized gini coefficient: {}'.format(gini_outputs))

# Submission file has two columns: id, target

In [None]:
def test_gini():
    '''
    Check gini() and gini_normalized() against fixed reference values.

    Raises AssertionError on the first case that is off by 1e-6 or more.
    NOTE(review): not called anywhere in the cells shown; invoke test_gini() to run it.
    '''
    def fequ(a,b):
        # Float comparison with an absolute tolerance of 1e-6
        return abs(a - b) < 1e-6
    def T(a, p, g, n):
        # a = actual values, p = predictions, g = expected gini, n = expected normalized gini
        assert(fequ(gini(a,p), g) )
        assert(fequ(gini_normalized(a,p), n) )
    # Perfectly ordered and perfectly reversed predictions
    T([1, 2, 3], [10, 20, 30], 0.111111, 1)
    T([1, 2, 3], [30, 20, 10], -0.111111, -1)
    # All-equal predictions: gini() breaks ties by original position
    T([1, 2, 3], [0, 0, 0], -0.111111, -1)
    T([3, 2, 1], [0, 0, 0], 0.111111, 1)
    T([1, 2, 4, 3], [0, 0, 0, 0], -0.1, -0.8)
    # Partially tied predictions
    T([2, 1, 4, 3], [0, 0, 2, 1], 0.125, 1)
    T([0, 20, 40, 0, 10], [40, 40, 10, 5, 5], 0, 0)
    T([40, 0, 20, 0, 10], [1000000, 40, 40, 5, 5], 0.171428,
       0.6)
    # Predictions identical to the actual values
    T([40, 20, 10, 0, 0], [40, 20, 10, 0, 0], 0.285714, 1)
    # Binary targets with probability-like scores
    T([1, 1, 0, 1], [0.86, 0.26, 0.52, 0.32], -0.041666,
       -0.333333)