# Python HANA ML API

<div class="alert alert-block alert-info">
<b>Building a Predictive Model for Insurance Fraud Detection.</b> <br>
</div>

## Learn from historical Insurance Claims

### Create a HANA DataFrame for the training data

In [13]:
# Connect using the HANA secure user store.
# The connection is identified by the user-store key 'MLMDA_KEY', so no
# host, user name or password appears in the notebook itself.
from hana_ml import dataframe as hd
conn = hd.ConnectionContext(userkey='MLMDA_KEY')
# Get Training Data: a hana_ml DataFrame defined by a SELECT over the
# historical claims table, ordered by CLAIM_ID.
sql_cmd = 'SELECT * FROM "APL_SAMPLES"."AUTO_CLAIMS_FRAUD" ORDER BY CLAIM_ID'
training_data = hd.DataFrame(conn, sql_cmd)

### Put a subset of the data in a pandas DataFrame and display it

In [14]:
# Restrict the training data to its first 8 rows, then collect() the result
# so it is displayed as the cell output.
training_data.head(8).collect()

Unnamed: 0,CLAIM_ID,DAYS_TO_REPORT,BODILY_INJURY_AMOUNT,PROPERTY_DAMAGE,PREVIOUS_CLAIMS,PAYMENT_METHOD,IS_REAR_END_COLLISION,PREM_AMOUNT,AGE,GENDER,MARITAL_STATUS,INCOME_ESTIMATE,INCOME_CATEGORY,POLICY_HOLDER,IS_FRAUD
0,CL_0000765,8,0,1957,0,CC,No,Safedriving_discount,52,Male,Single,15906.0,15,Y,No
1,CL_0000832,30,2541,3843,0,CC,No,Safedriving_discount,85,Female,Single,91100.6,75,N,No
2,CL_0002015,4,0,25719,0,CC,No,Standard,45,Male,Married,67085.4,50,N,No
3,CL_0002854,0,0,83,1,Auto,No,Standard,75,Male,Married,49561.0,35,N,No
4,CL_0002869,22,0,1264,0,CC,Yes,Standard,48,Female,Married,39598.7,35,N,No
5,CL_0003400,3,9903,7333,0,Auto,No,Safedriving_discount,41,Male,Single,48271.8,35,N,Yes
6,CL_0005084,14,0,1882,0,CC,No,Safedriving_discount,26,Female,Single,76074.1,75,Y,No
7,CL_0005346,12,15399,8864,0,Auto,No,Standard,73,Male,Married,61438.2,50,N,No


### Build a Classification model with APL Gradient Boosting

In [15]:
# Create the model: an APL gradient boosting binary classifier bound to the
# HANA connection opened earlier in the notebook.
from hana_ml.algorithms.apl.gradient_boosting_classification import GradientBoostingBinaryClassifier
model = GradientBoostingBinaryClassifier(conn_context=conn)
# Train the model: IS_FRAUD is the target to predict and CLAIM_ID is passed
# as the key column of the training data.
model.fit(training_data, label='IS_FRAUD', key='CLAIM_ID')

##### Model Summary

In [16]:
# Fetch the model summary as a KEY/VALUE table; `summary_df` is reused by the
# "Partitions" cell further down.
summary_df = model.get_summary().collect()
# Keep only the headline properties of the trained model.
df = summary_df[summary_df.KEY.isin(['ModelVariableCount', 'ModelSelectedVariableCount', 'ModelRecordCount',
                                     'ModelBuildDate'])].copy()
# Turn the technical keys into readable labels,
# e.g. 'ModelSelectedVariableCount' -> 'Selected Variable Count'.
df['KEY'] = df['KEY'].str.replace('Model', '').str.replace('Selected', 'Selected ')
df['KEY'] = df['KEY'].str.replace('Count', ' Count').str.replace('Date', ' Date')
df = df[['KEY','VALUE']]
df.columns = ['Property', 'Value']
# Styler.hide_index() was deprecated in pandas 1.4 and removed in pandas 2.0.
# Prefer Styler.hide(axis='index') when it exists and fall back to the legacy
# method so the cell runs on both old and current pandas versions.
styler = df.style
styler.hide(axis='index') if hasattr(styler, 'hide') else styler.hide_index()

Property,Value
Variable Count,15
Selected Variable Count,13
Record Count,2000
Build Date,2020-01-08 08:16:42


##### Partitions

In [17]:
# Per-partition record counts: every 'ModelRecord...' key except the overall
# 'ModelRecordCount', ignoring partitions whose count is the string '0'.
df = summary_df[(summary_df.KEY.str.match('ModelRecord')) & (summary_df.KEY!='ModelRecordCount')
                 & (summary_df.VALUE!='0')].copy()
# Reduce e.g. 'ModelRecordEstimationCount' to the bare partition name 'Estimation'.
df['KEY'] = df['KEY'].str.replace('ModelRecord','').str.replace('Count','')
# Summary values are compared as strings above; convert them before doing arithmetic.
df['VALUE'] = df['VALUE'].astype(float)
df = df[['KEY','VALUE']]
# Share of each partition in the total number of rows (a fraction, formatted as % below).
df['In %'] = df.VALUE / df.VALUE.sum()
df.columns = ['Partition', 'Rows', 'In %']
# Styler.hide_index() was deprecated in pandas 1.4 and removed in pandas 2.0.
# Prefer Styler.hide(axis='index') when it exists and fall back to the legacy
# method so the cell runs on both old and current pandas versions.
styler = df.style.format({'In %': '{:,.1%}'.format})
styler.hide(axis='index') if hasattr(styler, 'hide') else styler.hide_index()

Partition,Rows,In %
Estimation,1448,72.4%
Validation,552,27.6%


##### Target Frequency

In [18]:
# Fetch the model indicators once; `indicators_df` is reused by the
# "Descriptive Statistics" cell further down.
indicators_df = model.get_indicators().collect()
# Frequency of each category of the target variable IS_FRAUD.
df = indicators_df[(indicators_df.KEY=='CategoryFrequency') & (indicators_df.VARIABLE=='IS_FRAUD')].copy()
df['VALUE'] = df['VALUE'].astype(float).round(4)
df = df[['VARIABLE','DETAIL','VALUE']]
df.columns = ['Target', 'Value', 'Frequency']
# Styler.hide_index() was deprecated in pandas 1.4 and removed in pandas 2.0.
# Prefer Styler.hide(axis='index') when it exists and fall back to the legacy
# method so the cell runs on both old and current pandas versions.
styler = df.style.format({'Frequency': '{:,.2%}'.format})
styler.hide(axis='index') if hasattr(styler, 'hide') else styler.hide_index()

Target,Value,Frequency
IS_FRAUD,No,88.33%
IS_FRAUD,Yes,11.67%


##### Descriptive Statistics

In [19]:
# Build a long table (variable, statistic, value) restricted to the four
# summary statistics, then reshape it to one row per variable.
df = (
    indicators_df
    .loc[indicators_df['KEY'].isin(['Min', 'Max', 'Mean', 'StandardDeviation']),
         ['VARIABLE', 'KEY', 'VALUE']]
    .assign(VALUE=lambda frame: frame['VALUE'].astype(float).round(4))
    .rename(columns={'VARIABLE': 'Numeric Variable', 'KEY': 'Statistic', 'VALUE': 'Value'})
)
# One column per statistic; a statistic that is not reported for a variable shows up as NaN.
df.pivot(index='Numeric Variable', columns='Statistic', values='Value')

Statistic,Max,Mean,Min,StandardDeviation
Numeric Variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AGE,97.0,58.5318,18.0,22.5107
BODILY_INJURY_AMOUNT,29853.0,2268.8805,0.0,6303.6687
DAYS_TO_REPORT,30.0,13.5656,0.0,9.3085
INCOME_CATEGORY,75.0,,14.0,
INCOME_ESTIMATE,99619.9,49202.9862,110.284,28506.7089
PREVIOUS_CLAIMS,4.0,,0.0,
PROPERTY_DAMAGE,45062.0,5231.5573,28.0,5698.0787


##### Model Performance

In [20]:
import pandas as pd
# Performance metrics come back as a mapping {metric name: value}.
d = model.get_performance_metrics()
df = pd.DataFrame(list(d.items()), columns=["Metric", "Value"])
# Report only the AUC and the best boosting iteration.
df = df.loc[df['Metric'].isin(['AUC','BestIteration'])]
# Styler.hide_index() was deprecated in pandas 1.4 and removed in pandas 2.0.
# Prefer Styler.hide(axis='index') when it exists and fall back to the legacy
# method so the cell runs on both old and current pandas versions.
styler = df.style
styler.hide(axis='index') if hasattr(styler, 'hide') else styler.hide_index()

Metric,Value
AUC,0.9566
BestIteration,73.0


##### Variable Importance

In [21]:
# Importing hvplot.pandas provides the `.hvplot` accessor used on the DataFrames below.
import hvplot.pandas
# ExactSHAP importances as a mapping {variable name: contribution}.
d = model.get_feature_importances()['ExactSHAP']
df = pd.DataFrame(list(d.items()), columns=["Variable", "Contribution"])
df['Contribution'] = df['Contribution'].astype(float)
# Rank the variables BEFORE accumulating, so the running total follows the
# order shown in the table instead of depending on the order of the returned mapping.
# mergesort is stable, which keeps tied variables in a deterministic order.
df = df.sort_values(by='Contribution', ascending=False, kind='mergesort').reset_index(drop=True)
df['Cumulative'] = df['Contribution'].cumsum()
# Convert to percent first and round last: rounding and then multiplying by 100
# re-introduces floating-point noise such as 12.340000000000002.
df['Contribution'] = (df['Contribution'] * 100).round(2)
df['Cumulative'] = (df['Cumulative'] * 100).round(2)
# Drop variables whose contribution rounds to zero.
non_zero = df['Contribution'] != 0
# `df` is already in descending order: use it as is for the table and reversed
# (ascending) for the bar chart, as in the original cell.
dfd = df[non_zero]
dfa = dfd.iloc[::-1]
(
    dfa.hvplot.bar('Variable', 'Contribution', width=550, invert=True)
    + dfd.hvplot.table(list(df.columns), width=350, height=350, sortable=True)
)

## Make Predictions on New Claims

In [22]:
# Get New Claims
new_data = conn.table('AUTO_CLAIMS_NEW', schema='APL_SAMPLES')
# Apply the trained model
df = model.predict(new_data).collect()
# The four prediction columns are renamed by position.
# NOTE(review): this assumes predict() returns exactly key, actual label,
# predicted label and probability, in that order - confirm against the hana_ml version in use.
df.columns = ['Claim Id', 'Actual', 'Prediction', 'Probability']
# Styler.hide_index() was deprecated in pandas 1.4 and removed in pandas 2.0.
# Prefer Styler.hide(axis='index') when it exists and fall back to the legacy
# method so the cell runs on both old and current pandas versions.
styler = df.head(8).style.format({'Probability': '{:,.2%}'.format})
styler.hide(axis='index') if hasattr(styler, 'hide') else styler.hide_index()

Claim Id,Actual,Prediction,Probability
CL_0959524,,No,77.82%
CL_0959946,,No,97.99%
CL_0960121,,No,81.14%
CL_0960195,,No,67.95%
CL_0960294,,Yes,51.30%
CL_0960379,,Yes,73.25%
CL_0960411,,No,94.98%
CL_0960946,,No,86.94%
