## Урок 7. Кейс 2. Типы моделей для задачи тарификации

Построить обобщенную линейную модель (GLM) для прогнозирования наступления страховых случаев на рассмотренных в ноутбуке данных. Подобрать необходимое распределение и тип связи, при необходимости ознакомиться с документацией H20. Придумать и использовать дополнительные факторы при построении модели (например, пересечения признаков или функции от них и т.д.). Оценить результаты построенной модели при помощи различных метрик (можно использовать и другие метрики помимо представленных в ноутбуке), проанализировать вероятные проблемы. Предложить способы их решения и/или попробовать их решить, улучшив результат.

In [1]:
import numpy as np
import pandas as pd

import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

In [2]:
# Загрузим набор данных

df = pd.read_csv('E:\GB\MachineLearning\MPL_R.csv', low_memory=False)
df = df.loc[df.Dataset.isin([5, 6, 7, 8, 9])]
df.drop('Dataset', axis=1, inplace=True)
df.dropna(axis=1, how='all', inplace=True)
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115155 entries, 0 to 115154
Data columns (total 20 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Exposure           115155 non-null  float64
 1   LicAge             115155 non-null  int64  
 2   RecordBeg          115155 non-null  object 
 3   RecordEnd          59455 non-null   object 
 4   Gender             115155 non-null  object 
 5   MariStat           115155 non-null  object 
 6   SocioCateg         115155 non-null  object 
 7   VehUsage           115155 non-null  object 
 8   DrivAge            115155 non-null  int64  
 9   HasKmLimit         115155 non-null  int64  
 10  BonusMalus         115155 non-null  int64  
 11  ClaimAmount        115155 non-null  float64
 12  ClaimInd           115155 non-null  int64  
 13  ClaimNbResp        115155 non-null  float64
 14  ClaimNbNonResp     115155 non-null  float64
 15  ClaimNbParking     115155 non-null  float64
 16  Cl

In [3]:
NegClaimAmount = df.loc[df.ClaimAmount < 0, ['ClaimAmount','ClaimInd']]
print('Unique values of ClaimInd:', NegClaimAmount.ClaimInd.unique())
NegClaimAmount.head()

Unique values of ClaimInd: [0]


Unnamed: 0,ClaimAmount,ClaimInd
82,-74.206042,0
175,-1222.585196,0
177,-316.288822,0
363,-666.75861,0
375,-1201.600604,0


In [4]:
df.loc[df.ClaimAmount < 0, 'ClaimAmount'] = 0

In [5]:
def SeriesFactorizer(series):
    series, unique = pd.factorize(series)
    reference = {x: i for x, i in enumerate(unique)}
    print(reference)
    return series, reference

In [6]:
df.Gender, GenderRef = SeriesFactorizer(df.Gender)

{0: 'Male', 1: 'Female'}


In [7]:
df.MariStat, MariStatRef = SeriesFactorizer(df.MariStat)

{0: 'Other', 1: 'Alone'}


In [8]:
list(df.VehUsage.unique())

['Professional', 'Private+trip to office', 'Private', 'Professional run']

In [9]:
VU_dummies = pd.get_dummies(df.VehUsage, prefix='VehUsg', drop_first=False)
VU_dummies.head()

Unnamed: 0,VehUsg_Private,VehUsg_Private+trip to office,VehUsg_Professional,VehUsg_Professional run
0,0,0,1,0
1,0,0,1,0
2,0,1,0,0
3,0,1,0,0
4,1,0,0,0


In [10]:
df['SocioCateg'] = df.SocioCateg.str.slice(0,4)
pd.DataFrame(df.SocioCateg.value_counts().sort_values()).rename({'SocioCateg': 'Frequency'}, axis=1)

Unnamed: 0,Frequency
CSP7,14
CSP3,1210
CSP1,2740
CSP2,3254
CSP4,7648
CSP6,24833
CSP5,75456


In [11]:
df = pd.get_dummies(df, columns=['VehUsage','SocioCateg'])
df = df.select_dtypes(exclude=['object'])
df['DrivAgeSq'] = df.DrivAge.apply(lambda x: x**2)
df['ClaimsCount'] = df.ClaimInd + df.ClaimNbResp + df.ClaimNbNonResp + df.ClaimNbParking + df.ClaimNbFireTheft + df.ClaimNbWindscreen
df.loc[df.ClaimAmount == 0, 'ClaimsCount'] = 0
df.drop(["ClaimNbResp", "ClaimNbNonResp", "ClaimNbParking", "ClaimNbFireTheft", "ClaimNbWindscreen"], axis=1, inplace=True)
pd.DataFrame(df.ClaimsCount.value_counts()).rename({'ClaimsCount': 'Policies'}, axis=1)

Unnamed: 0,Policies
0.0,104286
2.0,3529
1.0,3339
3.0,2310
4.0,1101
5.0,428
6.0,127
7.0,26
8.0,6
9.0,2


In [12]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
; OpenJDK 64-Bit Server VM (build 11.0.6+8-b765.1, mixed mode)
  Starting server from E:\Anaconda\lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\alxz\AppData\Local\Temp\tmpi0068ddt
  JVM stdout: C:\Users\alxz\AppData\Local\Temp\tmpi0068ddt\h2o_alxz_started_from_python.out
  JVM stderr: C:\Users\alxz\AppData\Local\Temp\tmpi0068ddt\h2o_alxz_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,03 secs
H2O_cluster_timezone:,Europe/Moscow
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.3
H2O_cluster_version_age:,17 days
H2O_cluster_name:,H2O_from_python_alxz_cj7gzn
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.975 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [13]:
# Разбиение датасета на train/val/test

x_train_ind, x_test_ind, y_train_ind, y_test_ind = train_test_split(df.drop(['ClaimInd', 'ClaimAmount', 'ClaimsCount'], axis=1), df.ClaimInd, test_size=0.3, random_state=1)
x_valid_ind, x_test_ind, y_valid_ind, y_test_ind = train_test_split(x_test_ind, y_test_ind, test_size=0.5, random_state=1)
# Преобразование в H2O-Frame

h2o_train = h2o.H2OFrame(pd.concat([x_train_ind, y_train_ind], axis=1))
h2o_valid = h2o.H2OFrame(pd.concat([x_valid_ind, y_valid_ind], axis=1))
h2o_test = h2o.H2OFrame(pd.concat([x_test_ind, y_test_ind], axis=1))

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [14]:
# Преобразуем целевую переменную ClaimInd в категориальную при помощи метода asfactor во всех наборах данных

h2o_train['ClaimInd'] = h2o_train['ClaimInd'].asfactor()
h2o_valid['ClaimInd'] = h2o_valid['ClaimInd'].asfactor()
h2o_test['ClaimInd'] = h2o_test['ClaimInd'].asfactor()

# Инициализируем и обучим GLM модель c кросс-валидацией

glm_bin = H2OGeneralizedLinearEstimator(family='binomial', nfolds=5)

glm_bin.train(y='ClaimInd', 
              x=h2o_train.names[1:-1], 
              training_frame=h2o_train, 
              validation_frame=h2o_valid)

glm Model Build progress: |███████████████████████████████████████████████| 100%


In [15]:
# Параметры модели: распределение, функция связи, гиперпараметры регуляризации, количество использованных объясняющих переменных

glm_bin.summary()


GLM Model: summary


Unnamed: 0,Unnamed: 1,family,link,regularization,number_of_predictors_total,number_of_active_predictors,number_of_iterations,training_frame
0,,binomial,logit,"Elastic Net (alpha = 0.5, lambda = 2.368E-5 )",20,20,3,py_1_sid_a4fa




In [16]:
# Метрики качества модели - по всем данным и на кросс-валидации

glm_bin.cross_validation_metrics_summary().as_data_frame()

Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,accuracy,0.51300895,0.13439387,0.56817615,0.63552207,0.5613762,0.51467115,0.28529903
1,auc,0.56239814,0.0067942142,0.55610406,0.572971,0.5634278,0.556736,0.5627518
2,aucpr,0.11379059,0.005137532,0.10859945,0.120875135,0.110164665,0.11203099,0.11728267
3,err,0.48699108,0.13439387,0.43182385,0.3644779,0.4386238,0.48532885,0.714701
4,err_count,7852.2,2178.2942,6942.0,5906.0,7050.0,7807.0,11556.0
5,f0point5,0.13419105,0.008903488,0.13072228,0.14958471,0.13342626,0.13022842,0.12699355
6,f1,0.18769105,0.0075686187,0.18175389,0.20081191,0.18572418,0.18362439,0.1865409
7,f2,0.31429285,0.021167107,0.29814386,0.30540007,0.3054479,0.31123716,0.3512353
8,lift_top_group,1.3781933,0.3471435,0.87588537,1.6296892,1.1971895,1.4518408,1.7363617
9,logloss,0.31130546,0.004544429,0.30644086,0.31783515,0.30815113,0.3104304,0.31366983


In [17]:
# Таблица коэффициентов модели (в зависимости от модели могут выводиться также стандартная ошибка, z-score и p-value)

glm_bin._model_json['output']['coefficients_table'].as_data_frame()

Unnamed: 0,names,coefficients,standardized_coefficients
0,Intercept,-2.534238,-2.279643
1,LicAge,-0.000312,-0.049906
2,Gender,0.010311,0.004998
3,MariStat,-0.066143,-0.023827
4,DrivAge,-0.001085,-0.016265
5,HasKmLimit,-0.366059,-0.114316
6,BonusMalus,0.006341,0.097335
7,OutUseNb,0.061289,0.042639
8,RiskArea,0.009015,0.019975
9,VehUsage_Private,-0.146273,-0.06917


In [18]:
# Таблица нормированных коэффициентов по всем данным и на кросс-валидации

pmodels = {}
pmodels['overall'] = glm_bin.coef_norm()

for x in range(len(glm_bin.cross_validation_models())):
    pmodels[x] = glm_bin.cross_validation_models()[x].coef_norm()
    
pd.DataFrame.from_dict(pmodels).round(5)

Unnamed: 0,overall,0,1,2,3,4
Intercept,-2.27964,-2.27446,-2.28772,-2.27544,-2.27983,-2.28274
LicAge,-0.04991,-0.02591,-0.05764,-0.03492,-0.05893,-0.07073
Gender,0.005,-0.00431,-0.00104,0.00823,0.01002,0.0136
MariStat,-0.02383,-0.0259,-0.00304,-0.02877,-0.03449,-0.02938
DrivAge,-0.01626,-0.01973,0.0011,-0.01113,-0.12085,-0.02091
HasKmLimit,-0.11432,-0.11831,-0.11326,-0.10168,-0.13393,-0.10649
BonusMalus,0.09734,0.10697,0.0842,0.11024,0.08678,0.09405
OutUseNb,0.04264,0.04461,0.04279,0.04037,0.04639,0.03951
RiskArea,0.01997,0.02485,0.00903,0.02089,0.01975,0.02601
VehUsage_Private,-0.06917,-0.06751,-0.07527,-0.05733,-0.0551,-0.07622


In [19]:
# Построение прогнозных значений для обучающей, валидационной и тестовой выборок

train_pred = glm_bin.predict(h2o_train).as_data_frame()
valid_pred = glm_bin.predict(h2o_valid).as_data_frame()
test_pred = glm_bin.predict(h2o_test).as_data_frame()

glm prediction progress: |████████████████████████████████████████████████| 100%
glm prediction progress: |████████████████████████████████████████████████| 100%
glm prediction progress: |████████████████████████████████████████████████| 100%


In [20]:
# Выведем импортированные выше метрики классификации для обучающей, валидационной и тестовой выборок

def show_scores(y_true, y_pred):
    print(f'Accuracy: {accuracy_score(y_true, y_pred)}\n')
    print(f'F1-score: {accuracy_score(y_true, y_pred)}\n')
    print(f'Confusion matrix:\n{confusion_matrix(y_true, y_pred)}')
    
show_scores(y_train_ind, train_pred['predict'])

Accuracy: 0.5789623858674077

F1-score: 0.5789623858674077

Confusion matrix:
[[42807 30166]
 [ 3773  3862]]


In [21]:
show_scores(y_valid_ind, valid_pred['predict'])

Accuracy: 0.5829329010594569

F1-score: 0.5829329010594569

Confusion matrix:
[[9243 6421]
 [ 783  826]]


In [22]:
show_scores(y_test_ind, test_pred['predict'])

Accuracy: 0.5762996410790784

F1-score: 0.5762996410790784

Confusion matrix:
[[9126 6523]
 [ 796  829]]


Можно попробовать избавиться от дизбаланса классов. Возможно, лучше почистить данные. Также можно попробовать другие модели.