## Практическое задание

Построить обобщенную линейную модель (GLM) для прогнозирования наступления страховых случаев на рассмотренных в ноутбуке данных. Подобрать необходимое распределение и тип связи, при необходимости ознакомиться с документацией H2O. Придумать и использовать дополнительные факторы при построении модели (например, пересечения признаков или функции от них и т.д.). Оценить результаты построенной модели при помощи различных метрик (можно использовать и другие метрики помимо представленных в ноутбуке), проанализировать вероятные проблемы. Предложить способы их решения и/или попробовать их решить, улучшив результат.

## Предобработка данных

In [104]:
import numpy as np
import pandas as pd

In [105]:
import matplotlib

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [106]:
# Default size (width, height in inches) for every figure created in this notebook.
matplotlib.rcParams['figure.figsize'] =  [15.0, 8.0]


Цель - смоделировать факт наступления страхового случая

In [107]:
# Load the dataset, keep only the rows whose Dataset id is 5-9, then clean up:
# drop the Dataset column, drop columns that are entirely empty, remove
# duplicate rows and rebuild a contiguous index.
path = 'C:/Users/user/Documents/mydocs/gb_timeseries/insurance/data/'
df = (
    pd.read_csv(path + 'freMPL-R.csv', low_memory=False)
    .loc[lambda frame: frame.Dataset.isin([5, 6, 7, 8, 9])]
    .drop(columns='Dataset')
    .dropna(axis=1, how='all')
    .drop_duplicates()
    .reset_index(drop=True)
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115155 entries, 0 to 115154
Data columns (total 20 columns):
Exposure             115155 non-null float64
LicAge               115155 non-null int64
RecordBeg            115155 non-null object
RecordEnd            59455 non-null object
Gender               115155 non-null object
MariStat             115155 non-null object
SocioCateg           115155 non-null object
VehUsage             115155 non-null object
DrivAge              115155 non-null int64
HasKmLimit           115155 non-null int64
BonusMalus           115155 non-null int64
ClaimAmount          115155 non-null float64
ClaimInd             115155 non-null int64
ClaimNbResp          115155 non-null float64
ClaimNbNonResp       115155 non-null float64
ClaimNbParking       115155 non-null float64
ClaimNbFireTheft     115155 non-null float64
ClaimNbWindscreen    115155 non-null float64
OutUseNb             115155 non-null float64
RiskArea             115155 non-null float64
dtypes

Exposure - доля года, в течение которой действовал полис

LicAge - стаж вождения (возраст водительского удостоверения), в месяцах

RecordBeg - стартовая дата полиса

RecordEnd  - дата окончания полиса

DrivAge - возраст водителя

SocioCateg - социальная категория 

Gender - пол 

MariStat - семейный статус 

VehUsage - тип использования транспортного средства 

ClaimInd  - случилось страховое событие или нет

ClaimAmount - величина убытка 

Виды ущербов -  ("ClaimNbResp", "ClaimNbNonResp", "ClaimNbParking", "ClaimNbFireTheft", "ClaimNbWindscreen")

In [108]:
df.head()

Unnamed: 0,Exposure,LicAge,RecordBeg,RecordEnd,Gender,MariStat,SocioCateg,VehUsage,DrivAge,HasKmLimit,BonusMalus,ClaimAmount,ClaimInd,ClaimNbResp,ClaimNbNonResp,ClaimNbParking,ClaimNbFireTheft,ClaimNbWindscreen,OutUseNb,RiskArea
0,0.083,332,2004-01-01,2004-02-01,Male,Other,CSP50,Professional,46,0,50,0.0,0,0.0,1.0,0.0,0.0,0.0,0.0,9.0
1,0.916,333,2004-02-01,,Male,Other,CSP50,Professional,46,0,50,0.0,0,0.0,1.0,0.0,0.0,0.0,0.0,9.0
2,0.55,173,2004-05-15,2004-12-03,Male,Other,CSP50,Private+trip to office,32,0,68,0.0,0,0.0,2.0,0.0,0.0,0.0,0.0,7.0
3,0.089,364,2004-11-29,,Female,Other,CSP55,Private+trip to office,52,0,50,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,8.0
4,0.233,426,2004-02-07,2004-05-01,Male,Other,CSP60,Private,57,0,50,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,7.0


## первые наблюдения

1 ) Есть строковые переменные (тип `object`), которые нельзя использовать в модели напрямую - их нужно перекодировать

2 ) В предыдущем уроке мы заметили отрицательную величину убытка для некоторых наблюдений. Заметим, что для всех таких полисов переменная "ClaimInd" принимает только значение 0.

In [109]:
# Select the rows with a negative claim amount and check which values the
# claim indicator takes for them.
NegClaimAmount = df[df['ClaimAmount'] < 0][['ClaimAmount', 'ClaimInd']]
print('Unique values of ClaimInd:', NegClaimAmount['ClaimInd'].unique())
NegClaimAmount.head()

Unique values of ClaimInd: [0]


Unnamed: 0,ClaimAmount,ClaimInd
82,-74.206042,0
175,-1222.585196,0
177,-316.288822,0
363,-666.75861,0
375,-1201.600604,0


In [110]:
# Replace every negative "ClaimAmount" value with zero.
df['ClaimAmount'] = df['ClaimAmount'].mask(df['ClaimAmount'] < 0, 0)

## перекодируем бинарные  переменные типа `object` с помощью числовых значений

In [111]:
def SeriesFactorizer(series):
    """Encode a categorical series as integer codes.

    The codes are assigned by ``pd.factorize``, i.e. in order of first
    appearance. The code-to-label mapping is printed and also returned.

    Parameters:
        series: the values to encode.

    Returns:
        A tuple ``(codes, reference)`` where ``codes`` is the array of integer
        codes and ``reference`` is a dict mapping each code to its original label.
    """
    codes, labels = pd.factorize(series)
    reference = dict(enumerate(labels))
    print(reference)
    return codes, reference

In [112]:
# Replace the Gender labels with integer codes and keep the code -> label mapping.
df['Gender'], GenderRef = SeriesFactorizer(df['Gender'])

{0: 'Male', 1: 'Female'}


In [113]:
# Replace the MariStat labels with integer codes and keep the code -> label mapping.
df['MariStat'], MariStatRef = SeriesFactorizer(df['MariStat'])

{0: 'Other', 1: 'Alone'}


## преобразуем дату 

In [114]:
df.head()

Unnamed: 0,Exposure,LicAge,RecordBeg,RecordEnd,Gender,MariStat,SocioCateg,VehUsage,DrivAge,HasKmLimit,BonusMalus,ClaimAmount,ClaimInd,ClaimNbResp,ClaimNbNonResp,ClaimNbParking,ClaimNbFireTheft,ClaimNbWindscreen,OutUseNb,RiskArea
0,0.083,332,2004-01-01,2004-02-01,0,0,CSP50,Professional,46,0,50,0.0,0,0.0,1.0,0.0,0.0,0.0,0.0,9.0
1,0.916,333,2004-02-01,,0,0,CSP50,Professional,46,0,50,0.0,0,0.0,1.0,0.0,0.0,0.0,0.0,9.0
2,0.55,173,2004-05-15,2004-12-03,0,0,CSP50,Private+trip to office,32,0,68,0.0,0,0.0,2.0,0.0,0.0,0.0,0.0,7.0
3,0.089,364,2004-11-29,,1,0,CSP55,Private+trip to office,52,0,50,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,8.0
4,0.233,426,2004-02-07,2004-05-01,0,0,CSP60,Private,57,0,50,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,7.0


In [115]:
# Parse the record start/end columns into datetime values, one column at a time.
for date_column in ('RecordBeg', 'RecordEnd'):
    df[date_column] = pd.to_datetime(df[date_column])

In [116]:
# Calendar features taken from the record start date: month and day of month
# (the start year is not extracted).
df = df.assign(
    month_start=df['RecordBeg'].dt.month,
    dayofmonth_start=df['RecordBeg'].dt.day,
)

In [117]:
df.head()

Unnamed: 0,Exposure,LicAge,RecordBeg,RecordEnd,Gender,MariStat,SocioCateg,VehUsage,DrivAge,HasKmLimit,...,ClaimInd,ClaimNbResp,ClaimNbNonResp,ClaimNbParking,ClaimNbFireTheft,ClaimNbWindscreen,OutUseNb,RiskArea,month_start,dayofmonth_start
0,0.083,332,2004-01-01,2004-02-01,0,0,CSP50,Professional,46,0,...,0,0.0,1.0,0.0,0.0,0.0,0.0,9.0,1,1
1,0.916,333,2004-02-01,NaT,0,0,CSP50,Professional,46,0,...,0,0.0,1.0,0.0,0.0,0.0,0.0,9.0,2,1
2,0.55,173,2004-05-15,2004-12-03,0,0,CSP50,Private+trip to office,32,0,...,0,0.0,2.0,0.0,0.0,0.0,0.0,7.0,5,15
3,0.089,364,2004-11-29,NaT,1,0,CSP55,Private+trip to office,52,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,11,29
4,0.233,426,2004-02-07,2004-05-01,0,0,CSP60,Private,57,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,2,7


## перекодируем небинарные категориальные признаки с помощью числовых значений

Для переменных, содержащих более 2 значений, различия между которыми не могут быть упорядочены, используем фиктивные переменные (one-hot encoding).

**NB**: В H2O не рекомендуется использовать one-hot encoding, поскольку данный фреймворк корректно работает с категориальными признаками, тогда как применение one-hot encoding приводит к неэффективности. Тем не менее, используем здесь фиктивные переменные, чтобы в дальнейшем сохранить возможность сравнения результатов построенных моделей.

In [118]:
list(df.VehUsage.unique())

['Professional', 'Private+trip to office', 'Private', 'Professional run']

In [119]:
# Preview of the one-hot encoding of VehUsage: one indicator column per level,
# no level dropped.
VU_dummies = pd.get_dummies(df['VehUsage'], prefix='VehUsg')
VU_dummies.head()

Unnamed: 0,VehUsg_Private,VehUsg_Private+trip to office,VehUsg_Professional,VehUsg_Professional run
0,0,0,1,0
1,0,0,1,0
2,0,1,0,0
3,0,1,0,0
4,1,0,0,0


Фактор "SocioCateg" содержит информацию о социальной категории в виде кодов классификации CSP. Агрегируем имеющиеся коды до 1 знака, а затем закодируем их с помощью one-hot encoding.

[Wiki](https://fr.wikipedia.org/wiki/Professions_et_cat%C3%A9gories_socioprofessionnelles_en_France#Cr%C3%A9ation_de_la_nomenclature_des_PCS)

[Более подробный классификатор](https://www.ast74.fr/upload/administratif/liste-des-codes-csp-copie.pdf)

In [120]:
df['SocioCateg'].unique()

array(['CSP50', 'CSP55', 'CSP60', 'CSP48', 'CSP6', 'CSP66', 'CSP1',
       'CSP46', 'CSP21', 'CSP47', 'CSP42', 'CSP37', 'CSP22', 'CSP3',
       'CSP49', 'CSP20', 'CSP2', 'CSP40', 'CSP7', 'CSP26', 'CSP65',
       'CSP41', 'CSP17', 'CSP57', 'CSP56', 'CSP38', 'CSP51', 'CSP59',
       'CSP30', 'CSP44', 'CSP61', 'CSP63', 'CSP45', 'CSP16', 'CSP43',
       'CSP39', 'CSP5', 'CSP32', 'CSP35', 'CSP73', 'CSP62', 'CSP52',
       'CSP27', 'CSP24', 'CSP19', 'CSP70'], dtype=object)

In [121]:
#выделим только первые 4 знака, чтобы сократить количество социальных категорий

In [122]:
# Keep the first 4 characters of each code: the "CSP" prefix plus the first
# digit (e.g. 'CSP50' -> 'CSP5'), which reduces the number of categories.
df['SocioCateg'] = df['SocioCateg'].str[:4]

In [123]:
# Frequency of each aggregated social category, in ascending order.
# to_frame('Frequency') names the column explicitly, so the result does not
# depend on how value_counts() names its output (which differs between pandas
# versions and would make a rename keyed on 'SocioCateg' a silent no-op).
df['SocioCateg'].value_counts().sort_values().to_frame('Frequency')

Unnamed: 0,Frequency
CSP7,14
CSP3,1210
CSP1,2740
CSP2,3254
CSP4,7648
CSP6,24833
CSP5,75456


In [124]:
# One-hot encode the multi-level categorical columns and the start month / day.
onehot_columns = ['VehUsage', 'SocioCateg', 'month_start', 'dayofmonth_start']
df = pd.get_dummies(data=df, columns=onehot_columns)

Теперь, когда большинство переменных типа `object` обработаны, исключим их из набора данных за ненадобностью.

In [125]:
# Drop every column whose dtype is object or datetime64[ns]; all other columns are kept.
df = df.select_dtypes(exclude=['object', 'datetime64[ns]'])

## Добавим  признаки

Создадим дополнительные признаки на основе возраста водителя и стажа вождения (LicAge) при помощи натурального логарифма и возведения в квадрат

In [126]:
# Derived features: squares and natural logarithms of DrivAge and LicAge.
# Vectorized column arithmetic replaces the per-element apply(lambda ...)
# calls; the resulting values are the same.
df['DrivAgeSq'] = df['DrivAge'] ** 2
df['LicAgeSq'] = df['LicAge'] ** 2
df['DrivAgeLog'] = np.log(df['DrivAge'])
df['LicAgeLog'] = np.log(df['LicAge'])

In [127]:
df.head()

Unnamed: 0,Exposure,LicAge,Gender,MariStat,DrivAge,HasKmLimit,BonusMalus,ClaimAmount,ClaimInd,ClaimNbResp,...,dayofmonth_start_26,dayofmonth_start_27,dayofmonth_start_28,dayofmonth_start_29,dayofmonth_start_30,dayofmonth_start_31,DrivAgeSq,LicAgeSq,DrivAgeLog,LicAgeLog
0,0.083,332,0,0,46,0,50,0.0,0,0.0,...,0,0,0,0,0,0,2116,110224,3.828641,5.805135
1,0.916,333,0,0,46,0,50,0.0,0,0.0,...,0,0,0,0,0,0,2116,110889,3.828641,5.808142
2,0.55,173,0,0,32,0,68,0.0,0,0.0,...,0,0,0,0,0,0,1024,29929,3.465736,5.153292
3,0.089,364,1,0,52,0,50,0.0,0,0.0,...,0,0,0,1,0,0,2704,132496,3.951244,5.897154
4,0.233,426,0,0,57,0,50,0.0,0,0.0,...,0,0,0,0,0,0,3249,181476,4.043051,6.054439


In [128]:
# Exclude the per-type claim count columns.
claim_type_columns = [
    "ClaimNbResp",
    "ClaimNbNonResp",
    "ClaimNbParking",
    "ClaimNbFireTheft",
    "ClaimNbWindscreen",
]
df = df.drop(columns=claim_type_columns)

## Разделение набора данных на обучающую, валидационную и тестовую выборки

In [129]:
from sklearn.model_selection import train_test_split

In [130]:
# Split the data into train / validation / test sets (70% / 15% / 15%).
# The target is ClaimInd; ClaimAmount is removed from the features as well.
features = df.drop(columns=['ClaimInd', 'ClaimAmount'])
target = df['ClaimInd']

# First hold out 30% of the rows, then halve the hold-out into validation and test.
x_train, x_holdout, y_train, y_holdout = train_test_split(
    features, target, test_size=0.3, random_state=1
)
x_valid, x_test, y_valid, y_test = train_test_split(
    x_holdout, y_holdout, test_size=0.5, random_state=1
)

### Установка H2O и инициализация

In [131]:
#!apt-get install default-jre

In [132]:
import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,37 mins 03 secs
H2O_cluster_timezone:,Europe/Moscow
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.3
H2O_cluster_version_age:,18 days
H2O_cluster_name:,H2O_from_python_user_k5nh9y
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.973 Gb
H2O_cluster_total_cores:,6
H2O_cluster_allowed_cores:,6


### Построение GLM для частоты страховых случаев

суть модели

https://medium.com/@StepUpAnalytics/h20-package-classification-using-logistic-regression-712b0179b926

In [133]:

# Convert each split to an H2OFrame, with the target appended as the last column.
h2o_train, h2o_valid, h2o_test = (
    h2o.H2OFrame(pd.concat([split_x, split_y], axis=1))
    for split_x, split_y in ((x_train, y_train), (x_valid, y_valid), (x_test, y_test))
)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [134]:
# Convert the ClaimInd target to a categorical column (asfactor) in every frame.
for h2o_frame in (h2o_train, h2o_valid, h2o_test):
    h2o_frame['ClaimInd'] = h2o_frame['ClaimInd'].asfactor()

In [135]:
# Initialise and train a GLM with 5-fold cross-validation.
# Family: binomial, because the target has only two outcomes (a claim either
# occurred or it did not). Link: logit, i.e. logistic regression.
# Predictors are selected by name rather than by position, so the call does not
# depend on the target being the last column of the frame.
glm_predictors = [name for name in h2o_train.names if name != "ClaimInd"]

glm = H2OGeneralizedLinearEstimator(family="binomial", link="logit", nfolds=5)
glm.train(y="ClaimInd", x=glm_predictors, training_frame=h2o_train, validation_frame=h2o_valid)

glm Model Build progress: |███████████████████████████████████████████████| 100%


In [None]:
# Инициализируем и обучим 

## справка по видам семейства:

https://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/algo-params/family.html
    
binomial: The data must be categorical 2 levels/classes or binary (Enum or Int).

## справка по функции связи:
    
https://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/algo-params/link.html
    
с учетом биномиального распределения нам подходит логит-функция связи (логистическая регрессия)

In [136]:
# Параметры модели: распределение, функция связи, гиперпараметры регуляризации, количество использованных объясняющих переменных

glm.summary()


GLM Model: summary


Unnamed: 0,Unnamed: 1,family,link,regularization,number_of_predictors_total,number_of_active_predictors,number_of_iterations,training_frame
0,,binomial,logit,"Elastic Net (alpha = 0.5, lambda = 1.029E-4 )",67,63,5,py_4_sid_8942




In [137]:
# Метрики качества модели - по всем данным и на кросс-валидации

glm.cross_validation_metrics_summary().as_data_frame()

Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,accuracy,0.71130764,0.028220413,0.7291731,0.687867,0.6949481,0.6922024,0.75234765
1,auc,0.684664,0.0068552014,0.690378,0.69080323,0.6756409,0.67923564,0.6872624
2,aucpr,0.16957854,0.007300265,0.17739083,0.16986564,0.1587592,0.16698222,0.17489478
3,err,0.28869236,0.028220413,0.27082685,0.312133,0.3050519,0.3077976,0.24765237
4,err_count,4652.6,437.7377,4353.0,5050.0,4879.0,4946.0,4035.0
5,f0point5,0.1937824,0.011673491,0.20403303,0.19022703,0.1779729,0.19009665,0.20658243
6,f1,0.25219968,0.009853925,0.2615776,0.25362104,0.2361046,0.25128672,0.25840837
7,f2,0.36217323,0.014464095,0.3643323,0.3803866,0.35063243,0.3705688,0.34494603
8,lift_top_group,2.71616,0.26369414,2.8760905,2.9217396,2.3358934,2.5424497,2.904627
9,logloss,0.29482648,0.0050621824,0.29848382,0.29000166,0.2887727,0.29718694,0.2996873


In [138]:
# Таблица коэффициентов модели (в зависимости от модели могут выводиться также стандартная ошибка, z-score и p-value)

glm._model_json['output']['coefficients_table'].as_data_frame()

Unnamed: 0,names,coefficients,standardized_coefficients
0,Intercept,-2.407298e+00,-2.470921
1,Exposure,1.802368e+00,0.521268
2,LicAge,6.159273e-04,0.098364
3,Gender,2.789987e-03,0.001352
4,MariStat,-1.003314e-01,-0.036143
...,...,...,...
63,dayofmonth_start_31,0.000000e+00,0.000000
64,DrivAgeSq,-3.721374e-05,-0.057330
65,LicAgeSq,1.671893e-07,0.018980
66,DrivAgeLog,1.388924e-01,0.044751


In [139]:
# Table of standardized coefficients: the model fitted on all training data
# ('overall') next to each cross-validation model (keyed by fold index).
pmodels = {'overall': glm.coef_norm()}
# The list of CV models is requested once instead of twice per loop iteration.
for fold, cv_model in enumerate(glm.cross_validation_models()):
    pmodels[fold] = cv_model.coef_norm()
pd.DataFrame.from_dict(pmodels).round(5)

Unnamed: 0,overall,0,1,2,3,4
Intercept,-2.47092,-2.47234,-2.46216,-2.46525,-2.47773,-2.48123
Exposure,0.52127,0.52149,0.51829,0.52720,0.52706,0.51290
LicAge,0.09836,0.08516,0.08128,0.10781,0.08247,0.11675
Gender,0.00135,0.00416,-0.00214,0.00572,0.00000,0.00053
MariStat,-0.03614,-0.03296,-0.02687,-0.03083,-0.05200,-0.03840
...,...,...,...,...,...,...
dayofmonth_start_31,0.00000,0.00056,-0.00182,0.00628,0.00185,-0.00920
DrivAgeSq,-0.05733,-0.07010,-0.06639,-0.05424,-0.02838,-0.00910
LicAgeSq,0.01898,0.01094,0.04520,0.00000,0.02679,0.00000
DrivAgeLog,0.04475,0.06594,0.04027,0.00761,0.03263,0.01404


In [140]:
# Score the training, validation and test frames and pull the predictions
# back into pandas DataFrames.
train_pred, valid_pred, test_pred = (
    glm.predict(h2o_frame).as_data_frame()
    for h2o_frame in (h2o_train, h2o_valid, h2o_test)
)

glm prediction progress: |████████████████████████████████████████████████| 100%
glm prediction progress: |████████████████████████████████████████████████| 100%
glm prediction progress: |████████████████████████████████████████████████| 100%


In [141]:
# Save the trained model under `path`; the returned value is the location of
# the saved model.
model_glm = h2o.save_model(glm, path=path, force=True)

In [142]:
model_glm

'C:\\Users\\user\\Documents\\mydocs\\gb_timeseries\\insurance\\data\\GLM_model_python_1591015052400_20'

In [143]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score, roc_auc_score, log_loss

In [144]:
# Print the classification metrics imported above for the training,
# validation and test sets.
# Each entry is (label, metric function, prediction column): class-based
# metrics read the predicted label ("predict"), while ROC AUC and log loss
# read the predicted probability of class 1 ("p1").
metric_specs = [
    ('Accuracy', accuracy_score, 'predict'),
    ('F1', f1_score, 'predict'),
    ('Precision', precision_score, 'predict'),
    ('Recall', recall_score, 'predict'),
    ('ROC AUC', roc_auc_score, 'p1'),
    ('Log Loss', log_loss, 'p1'),
]
evaluation_sets = [
    ('Train', y_train, train_pred),
    ('Valid', y_valid, valid_pred),
    ('Test', y_test, test_pred),
]

# One group of lines per metric, followed by a blank line, exactly as before.
for metric_label, metric_fn, pred_column in metric_specs:
    for set_label, y_true, predictions in evaluation_sets:
        score = metric_fn(y_true, predictions[pred_column].values)
        print(f'{set_label} {metric_label}: {np.round(score, 4)}')
    print()

Train Accuracy: 0.7391
Valid Accuracy: 0.7388
Test Accuracy: 0.7397

Train F1: 0.2518
Valid F1: 0.2488
Test F1: 0.2548

Train Precision: 0.1729
Valid Precision: 0.1699
Test Precision: 0.1744

Train Recall: 0.4637
Valid Recall: 0.4643
Test Recall: 0.4732

Train ROC AUC: 0.689
Valid ROC AUC: 0.6875
Test ROC AUC: 0.6914

Train Log Loss: 0.2938
Valid Log Loss: 0.291
Test Log Loss: 0.2919



## Какие проблемы вы здесь видите? Как можно улучшить данный результат?

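По всем показателям можно постараться улучшить результат, но более всего проседает Precision - доля действительно наступивших страховых случаев среди всех полисов, для которых модель предсказала страховой случай. Попробуем изменить настройку модели: исключим Exposure из числа признаков и используем её в качестве веса наблюдений (Exposure - доля года, в течение которой действовал полис)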

In [182]:
# Second model: same family and link, but Exposure is passed as the per-row
# observation weight and is therefore left out of the predictors.
# Both excluded columns are named explicitly, so the call no longer relies on
# Exposure being the first column and the target being the last one.
glm2_predictors = [
    name for name in h2o_train.names if name not in ("ClaimInd", "Exposure")
]

glm2 = H2OGeneralizedLinearEstimator(family="binomial", link="logit", nfolds=5)
glm2.train(
    y="ClaimInd",
    x=glm2_predictors,
    training_frame=h2o_train,
    validation_frame=h2o_valid,
    weights_column="Exposure",
)

glm Model Build progress: |███████████████████████████████████████████████| 100%


In [184]:

glm2.summary()


GLM Model: summary


Unnamed: 0,Unnamed: 1,family,link,regularization,number_of_predictors_total,number_of_active_predictors,number_of_iterations,training_frame
0,,binomial,logit,"Elastic Net (alpha = 0.5, lambda = 3.302E-5 )",66,64,3,py_4_sid_8942




In [185]:
# Метрики качества модели - по всем данным и на кросс-валидации

glm2.cross_validation_metrics_summary().as_data_frame()

Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,accuracy,0.5359518,0.048730142,0.5193786,0.45860222,0.57455367,0.55311054,0.57411397
1,auc,0.6023993,0.0049678762,0.60341203,0.59603465,0.5986688,0.6059344,0.60794646
2,aucpr,0.1713043,0.005774442,0.17296585,0.16561906,0.16856928,0.16885385,0.18051347
3,err,0.4640482,0.048730142,0.4806214,0.5413978,0.42544633,0.4468895,0.42588603
4,err_count,3308.83,346.73248,3407.177,3870.272,3062.855,3172.207,3031.639
5,f0point5,0.18949234,0.0049930057,0.18986218,0.18189593,0.19052929,0.18929729,0.19587708
6,f1,0.25592262,0.003391417,0.25855225,0.2523115,0.25340632,0.2550635,0.26027942
7,f2,0.39473197,0.013519329,0.4051205,0.4116818,0.37822548,0.3908558,0.38777626
8,lift_top_group,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,logloss,0.3750319,0.0026490802,0.3756686,0.37627146,0.37533587,0.37050232,0.3773812


In [187]:
# Таблица коэффициентов модели (в зависимости от модели могут выводиться также стандартная ошибка, z-score и p-value)

glm2._model_json['output']['coefficients_table'].as_data_frame()

Unnamed: 0,names,coefficients,standardized_coefficients
0,Intercept,-1.101700e+00,-1.991646
1,LicAge,9.364394e-04,0.149342
2,Gender,1.326592e-02,0.006424
3,MariStat,-7.733430e-02,-0.027653
4,DrivAge,3.935493e-03,0.058905
...,...,...,...
62,dayofmonth_start_31,-5.081046e-02,-0.003213
63,DrivAgeSq,-9.072369e-05,-0.140200
64,LicAgeSq,1.454269e-07,0.016607
65,DrivAgeLog,1.597456e-01,0.051089


In [189]:
# Table of standardized coefficients for the second model: the fit on all
# training data ('overall') next to each cross-validation model (keyed by fold index).
pmodels2 = {'overall': glm2.coef_norm()}
# The list of CV models is requested once instead of twice per loop iteration.
for fold, cv_model in enumerate(glm2.cross_validation_models()):
    pmodels2[fold] = cv_model.coef_norm()
pd.DataFrame.from_dict(pmodels2).round(5)

Unnamed: 0,overall,0,1,2,3,4
Intercept,-1.99165,-1.99446,-1.99416,-1.99275,-1.98641,-1.99518
LicAge,0.14934,0.10667,0.05479,0.10247,0.23374,0.06248
Gender,0.00642,0.02090,0.00354,0.00314,0.00480,0.00056
MariStat,-0.02765,-0.02305,-0.02396,-0.02997,-0.01861,-0.04287
DrivAge,0.05891,0.02998,0.01463,0.03957,0.04753,0.04682
...,...,...,...,...,...,...
dayofmonth_start_31,-0.00321,0.00934,-0.00099,-0.01366,-0.00035,-0.00333
DrivAgeSq,-0.14020,-0.16120,-0.07971,-0.12984,-0.09603,-0.07554
LicAgeSq,0.01661,0.07281,0.08066,0.01316,-0.05739,0.04955
DrivAgeLog,0.05109,0.08340,0.03790,0.07947,0.02178,0.00000


In [190]:
# Score the training, validation and test frames with the second model and
# pull the predictions back into pandas DataFrames.
train_pred2, valid_pred2, test_pred2 = (
    glm2.predict(h2o_frame).as_data_frame()
    for h2o_frame in (h2o_train, h2o_valid, h2o_test)
)

glm prediction progress: |████████████████████████████████████████████████| 100%
glm prediction progress: |████████████████████████████████████████████████| 100%
glm prediction progress: |████████████████████████████████████████████████| 100%


In [191]:
# Save the second trained model under `path`; the returned value is the
# location of the saved model.
model_glm2 = h2o.save_model(glm2, path=path, force=True)

In [192]:
model_glm2

'C:\\Users\\user\\Documents\\mydocs\\gb_timeseries\\insurance\\data\\GLM_model_python_1591015052400_39'

In [193]:
# Print the classification metrics for the second model on the training,
# validation and test sets.
# Each entry is (label, metric function, prediction column): class-based
# metrics read the predicted label ("predict"), while ROC AUC and log loss
# read the predicted probability of class 1 ("p1").
metric_specs = [
    ('Accuracy', accuracy_score, 'predict'),
    ('F1', f1_score, 'predict'),
    ('Precision', precision_score, 'predict'),
    ('Recall', recall_score, 'predict'),
    ('ROC AUC', roc_auc_score, 'p1'),
    ('Log Loss', log_loss, 'p1'),
]
evaluation_sets2 = [
    ('Train', y_train, train_pred2),
    ('Valid', y_valid, valid_pred2),
    ('Test', y_test, test_pred2),
]

# One group of lines per metric, followed by a blank line, exactly as before.
for metric_label, metric_fn, pred_column in metric_specs:
    for set_label, y_true, predictions in evaluation_sets2:
        score = metric_fn(y_true, predictions[pred_column].values)
        print(f'{set_label} {metric_label}: {np.round(score, 4)}')
    print()

Train Accuracy: 0.6508
Valid Accuracy: 0.6484
Test Accuracy: 0.6505

Train F1: 0.2134
Valid F1: 0.2108
Test F1: 0.2101

Train Precision: 0.1357
Valid Precision: 0.1332
Test Precision: 0.1334

Train Recall: 0.5002
Valid Recall: 0.504
Test Recall: 0.4942

Train ROC AUC: 0.6337
Valid ROC AUC: 0.6316
Test ROC AUC: 0.6277

Train Log Loss: 0.3043
Valid Log Loss: 0.3016
Test Log Loss: 0.3036



Резюме:
    
Вторая модель не показала лучшего результата: Accuracy, F1, Precision, ROC AUC и Log Loss ухудшились, хотя метрика Recall выросла примерно на 2-4 процентных пункта.