# FMTC2019 - Neural Networks - Variable Annuities

#### paper : https://arxiv.org/pdf/1606.07831

### Importing Packages 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import keras 
import tensorflow
import theano
import sys
import itertools

from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder
from sklearn.metrics import confusion_matrix
from keras.models import Sequential
from keras.layers import Dense,Flatten
from keras.wrappers.scikit_learn import KerasRegressor
from keras.layers import Dropout
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression


Using TensorFlow backend.


### Importing Data

In [77]:
# We will use two databases selected by K-means.
# Dimension of sample1.csv = 340 observations (rep_contracts).
# Dimension of sample2.csv = 680 observations (train_contracts).
# sample2 will be our data that will be splitted up into train, test and validation data.
# sample1 will be our representative contracts.

# Importing:
rep_contracts = pd.read_csv("sample1.csv")
train_contracts = pd.read_csv("sample2.csv")

# First Cleaning
rep_contracts = rep_contracts.iloc[:,2:]
train_contracts = train_contracts.iloc[:,2:]

In [78]:
# Importing all contracts:
all_contracts = pd.read_csv("inforce2.csv")
# First Cleaning
all_contracts = all_contracts.iloc[:,2:]

### Auxiliary Functions

In [79]:
# Function that cleans Data:

def cleaningData(x):
    
    # Building Account Value (AV) variable 
    x['AV'] = x[['FundValue' + str(i) for i in np.arange(1, 11, 1)]].sum(axis=1)

    
    # Filtering only importants variables
    x = x[['gender','productType','ttm','age', 'AV','gbAmt','withdrawal','wbWithdrawalRate','fmv']]
    
    # Selecting only two categories

    # TRASH:
    x.loc[:, 'productType'] = ["trash" if value in ['ABRP','ABRU','ABSU','IBRP','IBRU','IBSU','MBRP','MBRU','MBSU','DBAB','DBIB','DBMB','DBWB'] else value for value in list(x.productType)]

    # GMDB:
    x.loc[:, 'productType'] = ["GMDB" if value in ['DBRP','DBRU','DBSU'] else value for value in list(x.productType)]

    # GMWB:
    x.loc[:, 'productType'] = ["GMWB" if value in ['WBRP','WBRU','WBSU'] else value for value in list(x.productType)]
    
    # Building variables:
    x.withdrawal = x.withdrawal/x.AV
    x.gbAmt = x.gbAmt/x.AV
    
    # Categories into numbers:
    auxiliar_data = pd.DataFrame(x[::])
    auxiliar_data['male'] = [1 if value == 'M' else 0 for value in auxiliar_data.gender]
    auxiliar_data['GMDB'] = [1 if value == 'GMDB' else 0 for value in auxiliar_data.productType]
    auxiliar_data['GMWB'] = [1 if value == 'GMWB' else 0 for value in auxiliar_data.productType]

    # Dropping old category variables:
    auxiliar_data = auxiliar_data.drop(['gender', 'productType'], axis=1)

    # Dropping Acount Value == 0:
    auxiliar_data = auxiliar_data[auxiliar_data.AV != 0]
    
    y = pd.DataFrame(auxiliar_data['fmv'])
    x = auxiliar_data.drop(['fmv'], axis = 1)
    
    return [x,y]

In [80]:
# Standard Function:
def standardNow(df):
    m = []
    s = []
    dfaux = pd.DataFrame(np.array(df))
    for i in range(0,dfaux.shape[1]):
        mean = dfaux.iloc[:,i].mean()
        std = dfaux.iloc[:,i].std()
        dfaux.iloc[:,i] = (dfaux.iloc[:,i]-mean)/std
        m.append(mean)
        s.append(std)
    return [dfaux,m,s]

In [81]:
def RFR(x_train,y_train,x_test,y_test,NE,RE):
    # Train
    xtrain = np.array(standardNow(x_train)[0])
    ytrain = np.array(standardNow(y_train)[0])
    # Test:
    xtest = np.array(standardNow(x_test)[0])
    ytest = np.array(standardNow(y_test)[0])
    #initialize:
    rfr = RandomForestRegressor(n_estimators = NE, random_state = RE)
    rfr.fit(xtrain, ytrain)
    # Use the forest's predict method on the test data
    predictions = rfr.predict(xtest)
    predictions = np.array([predictions]).T
    # Calculate the absolute errors
    errors = abs(predictions - ytest)
    errors = round(np.mean(errors), 2)
    # Calculate mean absolute percentage error (MAPE)
    mape = 100 * (errors / ytest)
    return [errors,mape]

In [128]:
def extrapol(x_train,y_train,x_test,y_test,NE,RE,xall,yall):
    xall = np.array(xall)
    yall = np.array(yall)
    # Train
    xtrain = np.array(standardNow(x_train)[0])
    ytrain = np.array(standardNow(y_train)[0])
    # Test:
    xtest = np.array(standardNow(x_test)[0])
    ytest = np.array(standardNow(y_test)[0])
    # Metrics
    xmean = standardNow(xall)[1]
    xstd = standardNow(xall)[2]
    ymean = standardNow(y_train)[1]
    ystd = standardNow(y_train)[2]
    xall = (xall-xmean)/xstd
    #initialize:
    rfr = RandomForestRegressor(n_estimators = NE, random_state = RE)
    rfr.fit(xtrain, ytrain)
    # Use the forest's predict method on the test data
    predictions = rfr.predict(xall)
    predictions = np.array([predictions]).T
    predictions = (predictions*ystd)+ymean
    # Calculate the absolute errors
    errorsMAE = round(np.mean(abs(yall - predictions)), 2)
    errorsR2 = 1 - (sum(np.power((yall - predictions),2))/np.power(sum(yall-ymean),2))
    errorsPE = abs(yall - predictions)/yall
    #errors = round(np.mean(errors), 2)
    return [print('MAE: '+str(errorsMAE)), print('R2: '+str(errorsR2)),print('PE: '+str(errorsPE))]

In [118]:
a = np.array([1,2,3])
b = np.array([2,3,4])
a-b
np.power(a,2)


array([1, 4, 9])

### Data 

In [94]:
# Cleaning Data:
rep = cleaningData(rep_contracts)
train = cleaningData(train_contracts)
allC = cleaningData(all_contracts)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


### Random Forest

In [95]:
# Cleaning Data:
rep = cleaningData(rep_contracts)
train = cleaningData(train_contracts)
# Splitting Data
x_train, x_test, y_train, y_test = train_test_split(train[0],train[1],test_size=0.20,random_state=0)
# Saving feature names for later use
cols = list(train[0].columns)

In [96]:
# Shapes:
print('y test shape: ' + str(y_test.shape))
print('y train shape: ' + str(y_train.shape))
print('x test shape: ' + str(x_test.shape))
print('x train shape: ' + str(x_train.shape))

y test shape: (136, 1)
y train shape: (540, 1)
x test shape: (136, 9)
x train shape: (540, 9)


In [97]:
RFR(x_train,y_train,x_test,y_test,10,0)[0]

  # Remove the CWD from sys.path while we load stuff.


0.26

### Extrapolation 

In [98]:
xall = allC[0]
yall = allC[1]

In [129]:
extrapol(xtrain,ytrain,xtest,ytest,10,0,xall,yall)



MAE: 102018.19
R2: [0.9999783]
PE: [[ 0.99999526]
 [-1.00008616]
 [-1.000091  ]
 ...
 [ 0.99999951]
 [ 0.9999895 ]
 [ 1.00000262]]


[None, None, None]

In [104]:
yall

Unnamed: 0,fmv
0,16763.294834
1,-4803.217415
2,-36038.487254
3,45419.126711
4,97013.359907
5,-1736.302447
6,-4342.756002
7,81436.272913
8,6466.021499
9,-4604.842590


In [164]:
# Get numerical feature importances
importances = list(rf.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: ttm                  Importance: 1.0
