# FMTC2019 - Neural Networks - Variable Annuities

#### paper : https://arxiv.org/pdf/1606.07831

### Importing Packages 

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import keras 
import tensorflow
import theano
import sys
import itertools

from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder
from sklearn.metrics import confusion_matrix
from keras.models import Sequential
from keras.layers import Dense,Flatten
from keras.wrappers.scikit_learn import KerasRegressor
from keras.layers import Dropout
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression


Using TensorFlow backend.


### Importing Data

In [153]:
# We will use two databases selected by K-means.
# Dimension of sample1.csv = 340 observations (rep_contracts).
# Dimension of sample2.csv = 680 observations (train_contracts).
# sample2 will be our data that will be splitted up into train, test and validation data.
# sample1 will be our representative contracts.

# Importing:
rep_contracts = pd.read_csv("x2.csv")
train_contracts = pd.read_csv("x1.csv")

# First Cleaning
rep_contracts = rep_contracts.iloc[:,2:]
train_contracts = train_contracts.iloc[:,2:]

In [154]:
# Importing all contracts:
all_contracts = pd.read_csv("inforce2.csv")
# First Cleaning
all_contracts = all_contracts.iloc[:,2:]

In [155]:
aux = pd.get_dummies(train_contracts['productType'])
train_contracts = train_contracts.merge(aux,right_index=True,left_index=True).drop(['productType'],axis=1)
aux = pd.get_dummies(train_contracts['gender'])
train_contracts = train_contracts.merge(aux,right_index=True,left_index=True).drop(['gender'],axis=1)
train_contracts = train_contracts.drop(['F','WBSU'],axis=1)

In [156]:
aux = pd.get_dummies(all_contracts['productType'])
all_contracts = all_contracts.merge(aux,right_index=True,left_index=True).drop(['productType'],axis=1)
aux = pd.get_dummies(all_contracts['gender'])
all_contracts = all_contracts.merge(aux,right_index=True,left_index=True).drop(['gender'],axis=1)
all_contracts = all_contracts.drop(['F','WBSU'],axis=1)

In [157]:
all_contracts.head()

Unnamed: 0,survivorShip,issueDate,matDate,birthDate,currentDate,baseFee,riderFee,rollUpRate,gbAmt,gmwbBalance,...,DBWB,IBRP,IBRU,IBSU,MBRP,MBRU,MBSU,WBRP,WBRU,M
0,1,38596,48823,24504,41791,0.02,0.005,0.0,87657.368596,0.0,...,0,0,0,0,0,0,0,0,0,0
1,1,41122,48427,20699,41791,0.02,0.005,0.0,161534.095807,0.0,...,0,0,0,0,0,0,0,0,0,1
2,1,41122,48427,22402,41791,0.02,0.005,0.0,407190.045231,0.0,...,0,0,0,0,0,0,0,0,0,1
3,1,38384,47150,26146,41791,0.02,0.005,0.0,307425.136759,0.0,...,0,0,0,0,0,0,0,0,0,1
4,1,36739,45870,18902,41791,0.02,0.005,0.0,356480.863596,0.0,...,0,0,0,0,0,0,0,0,0,0


In [166]:
train_contracts = train_contracts.replace([np.inf, -np.inf], np.nan)
train_contracts = train_contracts.dropna()
train_contracts.head()

Unnamed: 0,survivorShip,issueDate,matDate,birthDate,currentDate,baseFee,riderFee,rollUpRate,gbAmt,gmwbBalance,...,DBWB,IBRP,IBRU,IBSU,MBRP,MBRU,MBSU,WBRP,WBRU,M
0,1,36982,45017,22555,41791,0.02,0.005,0.0,273636.46379,0.0,...,0,0,0,0,0,0,0,0,0,1
1,1,40603,49735,20363,41791,0.02,0.005,0.0,436788.276659,0.0,...,0,0,0,0,0,0,0,0,0,1
2,1,36678,45078,26390,41791,0.02,0.005,0.0,393770.42971,0.0,...,0,0,0,0,0,0,0,0,0,1
3,1,39448,49310,21155,41791,0.02,0.005,0.0,195454.226448,0.0,...,0,0,0,0,0,0,0,0,0,1
4,1,37438,45474,19511,41791,0.02,0.005,0.0,302727.942538,0.0,...,0,0,0,0,0,0,0,0,0,1


In [167]:
train_contracts.shape

(340, 77)

### Auxiliary Functions

In [39]:
# Function that cleans Data:

def cleaningData(x):
    
    # Building Account Value (AV) variable 
    x['AV'] = x[['FundValue' + str(i) for i in np.arange(1, 11, 1)]].sum(axis=1)

    
    # Filtering only importants variables
    #x = x[['gender','productType','ttm','age', 'AV','gbAmt','withdrawal','wbWithdrawalRate','fmv']]
    
    # Selecting only two categories

    # TRASH:
    x.loc[:, 'productType'] = ["trash" if value in ['ABRP','ABRU','ABSU','IBRP','IBRU','IBSU','MBRP','MBRU','MBSU','DBAB','DBIB','DBMB','DBWB'] else value for value in list(x.productType)]

    # GMDB:
    x.loc[:, 'productType'] = ["GMDB" if value in ['DBRP','DBRU','DBSU'] else value for value in list(x.productType)]

    # GMWB:
    x.loc[:, 'productType'] = ["GMWB" if value in ['WBRP','WBRU','WBSU'] else value for value in list(x.productType)]
    
    # Building variables:
    x.withdrawal = x.withdrawal/x.AV
    x.gbAmt = x.gbAmt/x.AV
    
    # Categories into numbers:
    auxiliar_data = pd.DataFrame(x[::])
    auxiliar_data['male'] = [1 if value == 'M' else 0 for value in auxiliar_data.gender]
    auxiliar_data['GMDB'] = [1 if value == 'GMDB' else 0 for value in auxiliar_data.productType]
    auxiliar_data['GMWB'] = [1 if value == 'GMWB' else 0 for value in auxiliar_data.productType]

    # Dropping old category variables:
    auxiliar_data = auxiliar_data.drop(['gender', 'productType'], axis=1)

    # Dropping Acount Value == 0:
    auxiliar_data = auxiliar_data[auxiliar_data.AV != 0]
    
    y = pd.DataFrame(auxiliar_data['fmv'])
    x = auxiliar_data.drop(['fmv'], axis = 1)
    
    return [x,y]

In [40]:
# Standard Function:
def standardNow(df):
    m = []
    s = []
    dfaux = pd.DataFrame(np.array(df))
    for i in range(0,dfaux.shape[1]):
        mean = dfaux.iloc[:,i].mean()
        std = dfaux.iloc[:,i].std()
        dfaux.iloc[:,i] = (dfaux.iloc[:,i]-mean)/std
        m.append(mean)
        s.append(std)
    return [dfaux,m,s]

In [41]:
def RFR(x_train,y_train,x_test,y_test,NE,RE):
    # Train
    xtrain = np.array(standardNow(x_train)[0])
    ytrain = np.array(standardNow(y_train)[0])
    # Test:
    xtest = np.array(standardNow(x_test)[0])
    ytest = np.array(standardNow(y_test)[0])
    #initialize:
    rfr = RandomForestRegressor(n_estimators = NE, random_state = RE)
    rfr.fit(xtrain, ytrain)
    # Use the forest's predict method on the test data
    predictions = rfr.predict(xtest)
    predictions = np.array([predictions]).T
    # Calculate the absolute errors
    errors = abs(predictions - ytest)
    errors = round(np.mean(errors), 2)
    # Calculate mean absolute percentage error (MAPE)
    mape = 100 * (errors / ytest)
    return [errors,mape]

In [42]:
def extrapol(x_train,y_train,x_test,y_test,NE,RE,xall,yall):
    xall = np.array(xall)
    yall = np.array(yall)
    # Train
    xtrain = np.array(standardNow(x_train)[0])
    ytrain = np.array(standardNow(y_train)[0])
    # Test:
    xtest = np.array(standardNow(x_test)[0])
    ytest = np.array(standardNow(y_test)[0])
    # Metrics
    xmean = standardNow(xall)[1]
    xstd = standardNow(xall)[2]
    ymean = standardNow(y_train)[1]
    ystd = standardNow(y_train)[2]
    xall = (xall-xmean)/xstd
    #initialize:
    rfr = RandomForestRegressor(n_estimators = NE, random_state = RE)
    rfr.fit(xtrain, ytrain)
    # Use the forest's predict method on the test data
    predictions = rfr.predict(xall)
    predictions = np.array([predictions]).T
    predictions = (predictions*ystd)+ymean
    # Calculate the absolute errors
    errorsMAE = round(np.mean(abs(yall - predictions)), 2)
    errorsR2 = 1 - (sum(np.power((yall - predictions),2))/np.power(sum(yall-ymean),2))
    errorsPE = abs(yall - predictions)/yall
    #errors = round(np.mean(errors), 2)
    return [print('MAE: '+str(errorsMAE)), print('R2: '+str(errorsR2)),print('PE: '+str(errorsPE))]

### Data 

In [43]:
# Cleaning Data:
rep = cleaningData(rep_contracts)
train = cleaningData(train_contracts)
allC = cleaningData(all_contracts)

### Random Forest

In [168]:
# Cleaning Data:
# rep = cleaningData(rep_contracts)
# train = cleaningData(train_contracts)
# Splitting Data
x_train, x_test, y_train, y_test = train_test_split(train_contracts,train_contracts['fmv'],test_size=0.20,random_state=0)
# Saving feature names for later use
cols = list(train[0].columns)

In [169]:
x_train.shape

(272, 77)

In [171]:
x_train = x_train.dropna()

In [184]:
rfr = RandomForestRegressor(n_estimators = 1000, random_state = 0)
rfr.fit(x_train, y_train)
# Use the forest's predict method on the test data
predictions = rfr.predict(x_test)
predictions = np.array([predictions]).T
# Calculate the absolute errors
errors = abs(predictions - np.array(y_test))
errors = round(np.mean(errors), 2)

In [185]:
errors

141923.79

In [173]:
# Shapes:
print('y test shape: ' + str(y_test.shape))
print('y train shape: ' + str(y_train.shape))
print('x test shape: ' + str(x_test.shape))
print('x train shape: ' + str(x_train.shape))

y test shape: (68,)
y train shape: (272,)
x test shape: (68, 77)
x train shape: (272, 77)


In [174]:
RFR(x_train,y_train,x_test,y_test,10,0)[0]

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

### Extrapolation 

In [98]:
xall = allC[0]
yall = allC[1]

In [29]:
extrapol(xtrain,ytrain,xtest,ytest,10,0,xall,yall)

NameError: name 'xtrain' is not defined

In [30]:
yall

NameError: name 'yall' is not defined

In [164]:
# Get numerical feature importances
importances = list(rf.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: ttm                  Importance: 1.0
