# NUS Datathon Singlife
Submission by Team 72: **Nicholas Russell Saerang**

## Installation

In [1]:
#%pip install pandas
#%pip install pyarrow
#%pip install numpy
#%pip install scikit-learn
#%pip install imbalanced-learn
#%pip install matplotlib
#%pip install tensorflow

## Imports

In [2]:
import pandas as pd
import warnings

warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

from collections import Counter
from imblearn.over_sampling import SMOTE
from datetime import datetime, timedelta
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix
from pandas.api.types import is_string_dtype

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import concatenate
from tensorflow.keras.utils import plot_model

## Load Data

In [3]:
filepath = os.path.join('data', 'catB_train.parquet')
df = pd.read_parquet(filepath)
df.shape

(17992, 304)

In [4]:
df.head()

Unnamed: 0,clntnum,race_desc,ctrycode_desc,clttype,stat_flag,min_occ_date,cltdob_fix,cltsex_fix,flg_substandard,flg_is_borderline_standard,...,recency_giclaim,giclaim_cnt_success,recency_giclaim_success,giclaim_cnt_unsuccess,recency_giclaim_unsuccess,flg_gi_claim_29d435_ever,flg_gi_claim_058815_ever,flg_gi_claim_42e115_ever,flg_gi_claim_856320_ever,f_purchase_lh
19550,91b546e924,Chinese,Singapore,P,ACTIVE,2017-10-31,1974-05-09,Female,0.0,0.0,...,,,,,,,,,,
4600,896bae548c,Chinese,Singapore,P,ACTIVE,2007-05-23,1979-11-11,Male,0.0,0.0,...,,,,,,,,,,
13337,f364439ae6,Others,Singapore,P,ACTIVE,2019-08-31,1976-01-28,Male,0.0,0.0,...,,,,,,,,,,
15074,70f319cfe1,Chinese,Singapore,P,ACTIVE,2021-10-18,1976-03-19,Female,0.0,0.0,...,,,,,,,,,,
19724,2647a81328,Chinese,Singapore,P,ACTIVE,2018-07-20,1995-07-31,Female,0.0,0.0,...,,,,,,,,,,


In [5]:
def minmax_encode(df, col):
    """
    Return dataset including the minmax encoded column and excluding the original column

    Constraints:
    - col must be a String
    - df must be a Pandas Dataframe
    - df[col] must be numeric
    """

    maxx = df[col].max()
    minx = df[col].min()
    out = list(map(lambda x: (x-minx)/(maxx-minx), df[col]))
    new_colname = col + "_minmax"
    df[new_colname] = out
    return df.drop(
        [col],
        axis = 1
    )

def one_hot_encode(df, col):
    """
    Returns the dataset including the one hot encoded columns and excluding the original column

    Constraints:
    - col must be a String
    - df must be a Pandas Dataframe
    - df[col] must be a Series that represents a categorical variable
    """
    ohe_cols = pd.get_dummies(df[col], prefix = col)
    output = pd.concat(
        [df, ohe_cols],
        axis = 1,
    ).drop(
        [col],
        axis = 1
    )
    return output

## Dataset Analysis

In [6]:
def preprocess(df, log=0):
    all_diff = []
    all_same = []

    # impute with 0
    for col in [
        'f_ever_declined_la', 'f_purchase_lh', 'tot_cancel_pols',
        'recency_lapse', 'recency_cancel', 'flg_affconnect_show_interest_ever',
        'flg_affconnect_ready_to_buy_ever', 'flg_hlthclaim_839f8a_ever',
        'flg_hlthclaim_14cb37_ever', 'affcon_visit_days',
        'clmcon_visit_days',
        'recency_clmcon_regis',
        'recency_hlthclaim',
        'hlthclaim_cnt_success',
        'recency_hlthclaim_success',
        'hlthclaim_cnt_unsuccess',
        'recency_hlthclaim_unsuccess',
        'recency_hlthclaim_839f8a',
        'recency_hlthclaim_14cb37',
        'recency_giclaim'
    ]:
        if col in df.columns: # to deal with testing hidden data so f_purchase_lh gets bypassed in already dropped
            df[col] = df[col].fillna(0)

    # impute with 0 + type conversion
    for col in df.columns:
        if 'lapse' in col or 'n_months' in col:
            df[col] = df[col].fillna(0).apply(int)
        elif 'prempaid' in col or 'sumins' in col or col.startswith('ape'):
            df[col] = df[col].fillna(0).apply(float)


    # impute with median
    for col in [
        'n_months_since_visit_affcon', 'recency_clmcon',
    ]:
        df[col] = df[col].fillna(df[col].median())

    # impute with mean
    for col in [
        'hlthclaim_amt', 'giclaim_amt'
    ]:
        df[col] = df[col].fillna(df[col].mean())
        df[col] = df[col].apply(float)

    # impute with 1
    df['is_dependent_in_at_least_1_policy'] = df['is_dependent_in_at_least_1_policy'].fillna(1)

    # impute with Others
    df['race_desc'] = df['race_desc'].fillna('Others')

    # type conversion + filter all diff columns + all same columns
    numerical_columns = {
        'hh_20': int,
        'pop_20': int,
        'hh_size': float,
    }
    date_columns = ['min_occ_date', 'cltdob_fix']
    ok_cols = []
    for col in df.columns:
        if len(df[col].unique()) == 1:
            all_same.append(col)
        elif len(df[col].unique()) == df.shape[0]:
            all_diff.append(col)
        elif col in numerical_columns:
            df = df.dropna(subset=[col])
            df[col] = df[col].apply(numerical_columns[col])
            ok_cols.append(col)
        elif col in date_columns:
            pass
        else:
            df = df.dropna(subset=[col])
            ok_cols.append(col)

    # Take all the OK columns
    df = df[ok_cols]
    if log: print('Before:', df.shape)

    for col in df.columns:
        if is_string_dtype(df[col]):
            df = one_hot_encode(df, col)
        elif col != 'f_purchase_lh':
            df = minmax_encode(df, col)

    if log: print('After:', df.shape)

    return df

In [7]:
df = preprocess(df, log=1)

Before: (14667, 238)
After: (14667, 256)


In [8]:
y = df['f_purchase_lh']
train_data = df.drop(['f_purchase_lh'], axis=1)

In [9]:
Counter(y)

Counter({0.0: 13986, 1.0: 681})

In [10]:
# Deal with imbalance before training
train_data, y = SMOTE().fit_resample(train_data, y)

In [11]:
Counter(y)

Counter({0.0: 13986, 1.0: 13986})

In [12]:
X_train, X_valid, y_train, y_valid = train_test_split(train_data, y, test_size=0.2, random_state=42)

In [13]:
Counter(y_train), Counter(y_valid)

(Counter({1.0: 11191, 0.0: 11186}), Counter({0.0: 2800, 1.0: 2795}))

## Neural Network
Because I only have an hour to finish this so feature engineering + further feature extraction is automatically done here :')

In [14]:
def create_mlp(dim):
	np.random.seed(42)
	model = Sequential()
	model.add(Dense(256, input_dim=dim, activation="relu"))
	model.add(Dropout(0.1))
	model.add(Dense(512, activation="relu"))
	model.add(Dropout(0.1))
	model.add(Dense(1024, activation="relu"))
	model.add(Dropout(0.1))
	model.add(Dense(512, activation="relu"))
	return model

In [15]:
mlp = create_mlp(X_train.shape[1])
mlp.add(Dense(256, activation="relu"))
mlp.add(Dense(512, activation="relu"))
mlp.add(Dense(1, activation="sigmoid"))
model = mlp

In [16]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print("[INFO] training model...")
history = model.fit(
	x=X_train, y=y_train,
	validation_data=(X_valid, y_valid),
	epochs=20, batch_size=512)

[INFO] training model...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [17]:
model.evaluate(X_train, y_train), model.evaluate(X_valid, y_valid)



([0.027342647314071655, 0.9904813170433044],
 [0.117123082280159, 0.970866858959198])

## The cell below is **NOT** to be removed
##### The function is to be amended so that it accepts the given input (dataframe) and returns the required output (list).
##### It is recommended to test the function out prior to submission
-------------------------------------------------------------------------------------------------------------------------------
##### The hidden_data parsed into the function below will have the same layout columns wise as the dataset *SENT* to you
##### Thus, ensure that steps taken to modify the initial dataset to fit into the model are also carried out in the function below

In [18]:
def testing_hidden_data(hidden_data: pd.DataFrame) -> list:
    '''DO NOT REMOVE THIS FUNCTION.

The function accepts a dataframe as input and return an iterable (list)
of binary classes as output.

The function should be coded to test on hidden data
and should include any preprocessing functions needed for your model to perform.

All relevant code MUST be included in this function.'''
    result = np.round(model.predict(preprocess(hidden_data))).ravel()
    return result

##### Cell to check testing_hidden_data function

In [19]:
# This cell should output a list of predictions.
test_df = pd.read_parquet(filepath)
test_df = test_df.drop(columns=["f_purchase_lh"])
print(testing_hidden_data(test_df))

[0. 0. 0. ... 0. 0. 0.]
