### 1. Data loading

In [1]:
!pip install -q pytorch-tabnet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.5/44.5 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import  pickle
import gc
import time
import re
from tqdm import tqdm

import optuna
from optuna import distributions
import logging
import warnings

import torch
from torch.utils.data import WeightedRandomSampler
from torch.utils.data import Dataset, DataLoader
from torch import nn
import torch.nn.functional as F
import torch.optim as optim

import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.metrics import (f1_score,
                             accuracy_score,
                             precision_score,
                             recall_score,
                             
                             r2_score,
                             mean_absolute_error)
from sklearn.preprocessing import( StandardScaler,
                                   MinMaxScaler,
                                   OneHotEncoder,
                                   OrdinalEncoder              
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer 

from collections import Counter
from category_encoders import HashingEncoder, TargetEncoder

RANDOM_STATE = 51

### 2. Data preprocessing

In [3]:

def missing(data):
    missing = pd.DataFrame(data.isna().sum(),columns=["missing_values"]).reset_index().query('missing_values != 0 ')
    missing.rename(columns={'index': 'column_name'}, inplace=True)
    return missing.reset_index(drop=True)
    

def score( y_true,y_pred):
    print('accuracy: ',accuracy_score( y_true, y_pred))
    print('precision: ', precision_score( y_true, y_pred))
    print( 'recall: ', recall_score( y_true, y_pred))
    print('f1: ',f1_score(y_true, y_pred))          


In [4]:
try:
    path_data_train= Path(Path.cwd()/'adengi-internship_1/train.csv')
    path_data_test= Path(Path.cwd()/'adengi-internship_1/test.csv')
   
    df_train = pd.read_csv(path_data_train)
    df_test = pd.read_csv(path_data_test)
    
    
except:

    path_data_train = Path('/kaggle/input/a-money/train.csv')
    path_data_test = Path('/kaggle/input/a-money/test.csv')

    df_train = pd.read_csv(path_data_train)
    df_test = pd.read_csv(path_data_test)
    
print(path_data_train, '\n',path_data_test)


/kaggle/input/a-money/train.csv 
 /kaggle/input/a-money/test.csv


In [5]:
replace_dict = {
    'чуваш': 'Чувашская',
    'мари': 'Марий Эл',
     'луган': 'ЛНР',
    'донец': 'ДНР',
    'якут': 'Якутия',
    'осет':'Северная Осетия',
    'саха' : 'Якутия'
    
}


replace_dict_2  = {
    'ямал':  'ао Ямало-Ненецкий',
    'ненец': 'ао Ненецкий',
    'ханты' : 'ао Ханты-Мансийский',
     
    'чукотск':'ао Чукотский', 
    'еврейск': 'аобл Еврейская'
    
}

replace_dict_3 = {
    'москв':'обл Московская',
    'севаст':'респ Крым',
    ' санкт':'обл Ленинградская'
}
def change(text):
    text =' '.join(text)
    string= False
    for i in replace_dict:
        if  re.search( rf'(?i)\w*{i}\w*', text):
            string = replace_dict.get(i)
    if  string:
        return 'респ '+string
    return 'респ ' + text   

def  change_2(text): 
    for i in replace_dict_2:
        if re.search( rf'(?i)\w*{i}\w*', text):
            return replace_dict_2.get(i) 

def  change_3(text): 
    for i in replace_dict_3:
        if re.search( rf'(?i)\w*{i}\w*', text):
            return replace_dict_3.get(i) 
      

def region_1(string):
    # string = string.astype('str')
    if re.search(r'(?i)\bобл\b|\bобласть\b', string):
        # print('обл')
        return 'обл '+set(re.findall(r'\b\w*кая\b', string)).pop().capitalize()
    #elif re.search(r'\bг\b|\bрайон\b', string):
        # print('г')
        #return 'г '+ set(re.sub(r'(?i)\bг\b|\bрайон\b' ,'', string).split()).pop()
    elif re.search(r'\bг\b|\bрайон\b', string):
        #print('г')
        return change_3(string)
    
    elif  re.search(r'(?i)респ', string):
        # print('респ')
        resp = list(set(re.sub(r'(?i)\b\w*респ\w*\b', '', string).split()))
        return change(resp) 
    elif re.search(r'(?i)край',string) and not re.search(r'(?i)область', string):
        return 'край '+set(re.findall(r'\b\w*ский\b', string)).pop().capitalize()

    elif re.search(r'(?i)\bао\b|\bавтономный\b|\bеврейская\b', string):
        # print('AO',string, 'sss')
        return change_2(string)
    return 'unknown ' + string

In [6]:
%%time
df_train['region_new']= df_train['region'].map(region_1)
df_test['region_new'] = df_test['region'].map(region_1)

CPU times: user 33.8 s, sys: 364 ms, total: 34.2 s
Wall time: 34.2 s


In [7]:
region_train= df_train['region_new'].unique()
region_test = df_test['region_new'].unique()
print(f'Число регионов в test: {len(region_test)},', f'Число регионов в train: {len(region_train)}')

print('дельта test^train: ',set(df_test['region_new'])^set( df_train['region_new']))

Число регионов в test: 86, Число регионов в train: 86
дельта test^train:  set()


In [8]:
print('settlement unique(): ', len(df_train['settlement'].unique()))

N=7000 #  частота <10000 закодируем other
settlement= Counter(df_train['settlement'])
vocab_settelement = [i[0] for i in settlement.most_common(N)] 

def settlement_columns(data):
    #data['settlement_new'] =data['settlement'].apply(lambda x: x.lower() if x in vocab_settelement else 'others')   
    data['settlement_new'] =data['settlement'].where(data['settlement'].isin(vocab_settelement),'others')

settlement unique():  47357


In [9]:
%%time
settlement_columns(df_train)
settlement_columns(df_test)

CPU times: user 526 ms, sys: 15 µs, total: 526 ms
Wall time: 527 ms


In [10]:
def time_columns (data):
    data['created_at'] = pd.to_datetime(data['created_at'], errors='coerce')
    data['start_dt'] = pd.to_datetime(data['start_dt'], errors='coerce')
    
    data['start_dt_new'] = data['start_dt'].copy()  # для  реализации п. 2.1.3.1 в baseline не используется
     
    data['start_dt'] = data.groupby('client_id')['start_dt'].transform(
            lambda x: x.fillna(x.min()))
    
    data['start_dt'] = data['start_dt'].fillna(data['created_at'])

    data['month'] = data['created_at'].dt.month
    data['day'] = data['created_at'].dt.day
    data['week'] = data['created_at'].dt.dayofweek

    reference_date = pd.to_datetime('2022-01-01')

    # data['start_dt_xg'] = data['start_dt_new'].fillna(df_train['created_at'])
    # data['start_dt_xg'] =(data['start_dt_xg']-reference_date).dt.days
    
    data['created_at'] = (data['created_at'] - reference_date).dt.days
    data['start_dt'] = (data['start_dt'] - reference_date).dt.days
    
    data['days_diff'] = data['start_dt']-data['created_at']
    # data['days_diff_1'] = data['start_dt_new']-data['created_at']
    

In [11]:
%%time
time_columns(df_train)
time_columns(df_test)

CPU times: user 5min 36s, sys: 6.57 s, total: 5min 43s
Wall time: 5min 42s


In [12]:
df_train.select_dtypes(include='float').columns

Index(['monthly_income', 'work_experience', 'requested_sum',
       'main_agreement_amount', 'main_agreement_term', 'requested_period_days',
       'requested_amount', 'req_app_amount', 'approved_amount', 'period_days',
       'days_finish_loan', 'ag', 'cnt_ext', 'term', 'price', 'elecs_sum',
       'recurents_sum', 'tamount', 'issues', 'principal', 'interest',
       'overdue_interest', 'overdue_fee', 'nbki_score', 'contact_cases'],
      dtype='object')

- monthly_income - среднемесячный заработок клиента (зарплата)
- work_experience - кол-во лет стажа клиента
- requested_sum - запрашиваемая сумма клиента для займа, если interface - alfa
- requested_period_days - запрашиваемый срок по займу
- requested_amount - запрашиваемая сумма клиента по займу
- req_app_amount - разница между запрашиваемой суммой займа и одобренной
- cnt_ext - кол-во пролонгаций по займу
- term - срок пролонгации (список)
- price - цена пролонгации (список)
- elecs_sum - штрафы, пени
- recurents_sum - штрафы, пени (там вроде как различия в этапах их начисления)
- tamount - общий кэшфлоу клиента (общая сумма которая была на аккаунте клиента)
- nbki_score - скор клиента от рисков
- contact_cases - кол-во обращений клиента с коллекшн

In [13]:
df_test.loc[df_test['monthly_income']>10,'monthly_income'] =\
    df_test.loc[df_test['monthly_income']>10,'monthly_income']/1000

df_train.loc[df_train['monthly_income']>10,'monthly_income'] =\
    df_train.loc[df_train['monthly_income']>10,'monthly_income']/1000

In [14]:
def simpl_median(name):
    imp_median = SimpleImputer(strategy='median')
    df_train[name] = imp_median.fit_transform(df_train[[name]]) 
    df_test[name] = imp_median.transform(df_test[[name]])

In [15]:
simpl_median('monthly_income')

In [16]:
def min_simple(name):
    df_train[name]=df_train[name].fillna(df_train[name].min())
    df_test[name]=df_test[name].fillna(df_test[name].min())

In [17]:
df_train['cnt_ext'].min(), df_test['cnt_ext'].min()

(-0.7206081182984045, -0.7206081182984045)

In [18]:
min_simple('cnt_ext')

In [19]:
def zerro_simple(name):
    df_train[name]=df_train[name].fillna(0)
    df_test[name]=df_test[name].fillna(0)

In [20]:
zerro_simple('contact_cases')

In [21]:
min_simple('elecs_sum')
min_simple('recurents_sum')

In [22]:
simpl_median('nbki_score')

In [23]:
min_simple('price')

In [24]:
df_train['requested_sum'] = df_train['requested_sum'].fillna(
            df_train['approved_amount'] * df_train['requested_sum'].mean() / df_train['approved_amount'].mean())

df_test['requested_sum'] = df_test['requested_sum'].fillna(
            df_test['approved_amount'] * df_test['requested_sum'].mean() / df_test['approved_amount'].mean())

In [25]:
df_train['requested_amount'] = df_train['requested_amount'].fillna(df_train['requested_sum'])
df_test['requested_amount'] =  df_test['requested_amount'].fillna(df_test['requested_sum'])

In [26]:
df_train['req_app_amount'] = df_train['req_app_amount'].fillna(df_train['requested_amount'] - df_train['approved_amount'])
df_test['req_app_amount'] =df_test['req_app_amount'].fillna( df_test['requested_amount'] - df_test['approved_amount'])

In [27]:
simpl_median('requested_period_days')

In [28]:
def simple_moda(name):
    imp_mode = SimpleImputer(strategy="most_frequent")
    df_train[name]=imp_mode.fit_transform(df_train[[name]])
    df_test[name]=imp_mode.transform(df_test[[name]])

In [29]:
simple_moda('tamount')

In [30]:
zerro_simple('work_experience')

In [31]:
min_simple('term')

In [32]:
columns_flat_no_na=['ag',
'approved_amount',
'days_finish_loan',
'interest',
'issues',
'main_agreement_amount',
'main_agreement_term',
'overdue_fee',
'overdue_interest',
'period_days',
'principal']

In [33]:
columns_int = df_train.select_dtypes(include=['int64', 'int32']).columns
columns_int

Index(['payment_frequency', 'status', 'loan_id', 'client_id', 'source',
       'first_source', 'interface', 'type', 'repayment_type', 'client_type',
       'loan_order', 'have_extension', 'created_at', 'start_dt', 'churn',
       'month', 'day', 'week'],
      dtype='object')

- payment_frequency - частота получения зарплаты (month - 1 раз в месяц, 2 weeks - раз в две недели, и тд)
- status - статус клиента (самозанятый, рабочий, и тд)
- source - канал привлечения клиента
- first_source - первый канал привлечения клиента
- interface - интерфейс, откуда пришла заявка - (site, mobile)
- type - тип займа (тип продукта)
- repayment_type - Тип комиссии по займу (с 2.5% - with_comission, 5% - with_big_comission, 0% - no_comission)
- client_type - тип клиента (новый, повторный)
- have_extension - имеется ли пролонгация по данному займу

In [34]:
list_int= ['payment_frequency', 'status','source', 'first_source',
         'interface', 'type', 'repayment_type', 'client_type','have_extension']

In [35]:
client_count=dict(Counter(pd.concat([df_train['client_id'],df_test['client_id']])))

In [36]:
df_train['client_id_new'] = df_train['client_id'].transform(lambda x: client_count.get(x)) 
df_test['client_id_new'] = df_test['client_id'].transform(lambda x: client_count.get(x)) 

In [37]:
client_count=dict(Counter(pd.concat([df_train['loan_order'],df_test['loan_order']])))

In [38]:
df_train['loan_order_new'] = df_train['loan_order'].transform(lambda x: client_count.get(x)) 
df_test['loan_order_new'] = df_test['loan_order'].transform(lambda x: client_count.get(x)) 

### 3. New features

In [39]:
df_train['early_repayment'] = df_train['period_days'] - df_train['days_finish_loan']
df_test['early_repayment'] = df_test['period_days'] - df_test['days_finish_loan']

In [40]:
 df_train['approval_ratio'] = df_train['approved_amount'] / df_train['requested_amount']
 df_test['approval_ratio'] = df_test['approved_amount'] / df_test['requested_amount']

In [41]:
df_train['requsted_income_ratio'] = df_train['requested_amount']/df_train['monthly_income']
df_test['requsted_income_ratio'] = df_test['requested_amount']/df_test['monthly_income']

In [42]:
df_train['total_debt_burden'] = (df_train['elecs_sum'] + df_train['recurents_sum']) / df_train['monthly_income']
df_test['total_debt_burden'] = (df_test['elecs_sum'] + df_test['recurents_sum']) / df_test['monthly_income']

In [43]:
df_train['dti'] = df_train['principal'] / df_train['monthly_income']
df_test['dti'] = df_test['principal']/ df_test['monthly_income']

In [44]:
df_train['requested_approved_ratio'] = df_train['requested_period_days'] / df_train['main_agreement_term']
df_test['requested_approved_ratio'] =  df_test['requested_period_days'] / df_test['main_agreement_term']

In [45]:
df_train[df_train.select_dtypes(include='int64').columns]=df_train.select_dtypes(include='int64').astype('int32')
df_test[df_test.select_dtypes(include='int64').columns]=df_test.select_dtypes(include='int64').astype('int32')

### 4. Data preparation

In [46]:
list_id_total = ['client_id_new', 'loan_order_new']
list_id = ['loan_id']
list_client= ['monthly_income','nbki_score','tamount','work_experience','ag','payment_frequency',
              'status','client_type','settlement_new','region_new','gender','client_type.1' ]

list_loan= ['cnt_ext','elecs_sum','recurents_sum','price','requested_amount',
            'req_app_amount','term','approved_amount','interest',
            'issues','main_agreement_amount','overdue_fee','overdue_interest','principal','type', 
             'repayment_type','have_extension','requested_sum']	
list_new = ['approval_ratio','early_repayment','requested_approved_ratio','requsted_income_ratio','total_debt_burden','dti']

list_support =[ 'contact_cases','source','first_source','interface'] 
list_time = [ 'month','day','week','days_finish_loan', 'period_days', 
             'requested_period_days','main_agreement_term','created_at','days_diff','start_dt']    
# 'closed_at','prolong','period', 'start_dt'->del

# проверка на дубли
total_list =list_id_total +list_loan+list_client+list_support+list_time + list_new
print('Дубли True-нет: ',len(total_list) == len (set(total_list)))
      
# проверка на идентичность структуры train test
print('Идентичность train, test: ',set(df_train.columns) ^set(df_test.columns))
# проверка на полноту  включения столбцов  в списки
print('Полнота столбцов в списке(:',set(df_train.columns) ^set(total_list))     

Дубли True-нет:  True
Идентичность train, test:  {'churn'}
Полнота столбцов в списке(: {'settlement', 'closed_at', 'loan_order', 'client_id', 'start_dt_new', 'loan_id', 'churn', 'region'}


In [47]:
display(missing (df_train[total_list]),missing (df_test[total_list]))

Unnamed: 0,column_name,missing_values


Unnamed: 0,column_name,missing_values


Проведем поиск  столбцов, для которых множества значений в test и train  равны.
Такие  признаки  можно рассматривать как категориальные

In [48]:
lenth_1, lenth_2 ={}, {}
for i in df_test[total_list].columns:
    lenth_1[i]=len(df_train[i].unique())
    lenth_2[i]=len(df_test[i].unique())

discrete_features ={key : value for key, value in lenth_1.items() if lenth_1.get(key) == lenth_2.get(key)}
discrete_features

{'type': 4,
 'repayment_type': 4,
 'have_extension': 2,
 'work_experience': 6,
 'payment_frequency': 8,
 'status': 10,
 'client_type': 2,
 'region_new': 86,
 'gender': 2,
 'client_type.1': 2,
 'source': 20,
 'first_source': 19,
 'interface': 3,
 'month': 12,
 'day': 31,
 'week': 7,
 'period_days': 34,
 'requested_period_days': 26,
 'main_agreement_term': 34}

In [49]:
ordinal_list = list(df_train[total_list].select_dtypes(include='object').columns) 

scale_list=list(set(df_train[total_list].select_dtypes(include=['float','int32']).columns)- set(discrete_features))

passthrough_list = list(set(discrete_features) - set(ordinal_list))

In [50]:
df_train['main_agreement_term']

0         -0.340659
1         -1.145865
2          0.176974
3         -1.145865
4          0.176974
             ...   
4038163   -0.340659
4038164   -0.340659
4038165   -0.340659
4038166   -0.340659
4038167   -0.340659
Name: main_agreement_term, Length: 4038168, dtype: float64

In [51]:
set(total_list)^set(ordinal_list+scale_list+passthrough_list) #  проверка на полноту списков 

{'days_diff'}

In [52]:
len(set(ordinal_list+scale_list+passthrough_list)) == len(ordinal_list+scale_list+passthrough_list)# проверка  на  дубли (True -дублей нет)

True

### 4.  TabNet

In [53]:
from pytorch_tabnet.tab_model import TabNetClassifier

In [54]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cuda device


In [55]:
preprocessing =ColumnTransformer([
    ('ord' , OrdinalEncoder( handle_unknown='use_encoded_value', unknown_value=-1),ordinal_list+ passthrough_list),
    # ('pass', 'passthrough', passthrough_list),
    ('scaler', StandardScaler(),scale_list)
    
])

In [56]:
X_train, X_valid, y_train, y_valid = train_test_split(df_train[total_list],df_train['churn'], test_size=0.20, random_state=RANDOM_STATE, stratify=df_train['churn'])

In [57]:
X_train = preprocessing.fit_transform(X_train)
X_valid  = preprocessing.transform(X_valid)
y_train = y_train.to_numpy()
y_valid = y_valid.to_numpy()                          

In [58]:
cat_idxs= list(range (len (ordinal_list+passthrough_list)))
cat_dims = [len(np.unique(np.hstack((X_train[:,i],X_valid[:,i]))))for i in cat_idxs]
print(cat_idxs,'\n',cat_dims)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19] 
 [7001, 86, 2, 2, 26, 6, 4, 19, 31, 34, 10, 7, 34, 3, 20, 2, 2, 4, 8, 12]


In [59]:
cat_idxs= list(range (len (ordinal_list+passthrough_list)))
cat_dims = [len(np.unique(X_valid[:,i]))for i in cat_idxs]
print(cat_idxs,'\n',cat_dims)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19] 
 [7001, 86, 2, 2, 26, 6, 4, 19, 31, 34, 10, 7, 34, 3, 20, 2, 2, 4, 8, 12]


In [60]:
cat_idxs= list(range (len (ordinal_list+passthrough_list)))
cat_dims = [len(np.unique(X_train[:,i]))for i in cat_idxs]
print(cat_idxs,'\n',cat_dims)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19] 
 [7001, 86, 2, 2, 26, 6, 4, 19, 31, 34, 10, 7, 34, 3, 20, 2, 2, 4, 8, 12]


In [61]:
#          [7001, 86, 2, 2, 2, 19, 8, 2, 12, 34, 31, 6, 26, 4, 10, 4, 34, 7, 20, 3] 

emb_size = [500, 40, 2, 2, 2, 10, 4, 2, 6,   17, 15, 3, 13, 2, 5, 2, 17, 3, 10, 2]

In [62]:
[(i.split(sep ='__')[-1] , num) for num, i in enumerate( preprocessing.get_feature_names_out())]

[('settlement_new', 0),
 ('region_new', 1),
 ('gender', 2),
 ('client_type.1', 3),
 ('requested_period_days', 4),
 ('work_experience', 5),
 ('repayment_type', 6),
 ('first_source', 7),
 ('day', 8),
 ('period_days', 9),
 ('status', 10),
 ('week', 11),
 ('main_agreement_term', 12),
 ('interface', 13),
 ('source', 14),
 ('client_type', 15),
 ('have_extension', 16),
 ('type', 17),
 ('payment_frequency', 18),
 ('month', 19),
 ('dti', 20),
 ('elecs_sum', 21),
 ('total_debt_burden', 22),
 ('principal', 23),
 ('req_app_amount', 24),
 ('days_finish_loan', 25),
 ('requsted_income_ratio', 26),
 ('start_dt', 27),
 ('recurents_sum', 28),
 ('client_id_new', 29),
 ('tamount', 30),
 ('cnt_ext', 31),
 ('approved_amount', 32),
 ('requested_amount', 33),
 ('overdue_interest', 34),
 ('interest', 35),
 ('ag', 36),
 ('issues', 37),
 ('overdue_fee', 38),
 ('early_repayment', 39),
 ('requested_approved_ratio', 40),
 ('term', 41),
 ('price', 42),
 ('main_agreement_amount', 43),
 ('created_at', 44),
 ('nbki_sco

In [63]:
 grouped_features =[ [0,1],[6,9,13,14,17,22,25,32,41,46]]

In [64]:
from pytorch_tabnet.metrics import Metric
from sklearn.metrics import f1_score

class F1(Metric):
    def __init__(self):
        self._name = "f1"
        self._maximize = True  

    def __call__(self, y_true, y_score):
        y_pred = (y_score[:, 1] >= 0.5).astype(int)  # Предсказание 0 или 1
        return f1_score(y_true, y_pred)


In [65]:
tabnet_params = {"cat_idxs":cat_idxs,
                 "cat_dims":cat_dims,
                 "cat_emb_dim":emb_size,
                 "optimizer_fn":torch.optim.Adam,
                 "optimizer_params":dict(lr=1e-3),
                 "scheduler_params":{"step_size":15, # how to use learning rate scheduler
                                 "gamma":0.9},
                 "scheduler_fn":torch.optim.lr_scheduler.StepLR,
                 "mask_type":'entmax' # "sparsemax"
                 #"grouped_features" : grouped_features
                }

clf = TabNetClassifier(**tabnet_params
                      )



In [66]:
%%time

# Fitting the model
clf.fit(
    X_train=X_train, y_train=y_train,
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    eval_name=['train', 'valid'],
    eval_metric=[F1,'auc', 'accuracy'],
    max_epochs=45 , patience=20,
    batch_size=2048, virtual_batch_size=256,
    num_workers=0,
    weights= 1,
    drop_last=False,
    augmentations=None, #aug, None
)

epoch 0  | loss: 0.55592 | train_f1: 0.54835 | train_auc: 0.85923 | train_accuracy: 0.78772 | valid_f1: 0.55004 | valid_auc: 0.85975 | valid_accuracy: 0.78829 |  0:02:17s
epoch 1  | loss: 0.37231 | train_f1: 0.73819 | train_auc: 0.87849 | train_accuracy: 0.85945 | valid_f1: 0.73925 | valid_auc: 0.87886 | valid_accuracy: 0.85987 |  0:04:38s
epoch 2  | loss: 0.35579 | train_f1: 0.64471 | train_auc: 0.88009 | train_accuracy: 0.82465 | valid_f1: 0.64595 | valid_auc: 0.88034 | valid_accuracy: 0.82509 |  0:06:58s
epoch 3  | loss: 0.34697 | train_f1: 0.22548 | train_auc: 0.7859  | train_accuracy: 0.7079  | valid_f1: 0.2253  | valid_auc: 0.78601 | valid_accuracy: 0.70784 |  0:09:17s
epoch 4  | loss: 0.34131 | train_f1: 0.34257 | train_auc: 0.79384 | train_accuracy: 0.73466 | valid_f1: 0.34358 | valid_auc: 0.79479 | valid_accuracy: 0.73495 |  0:11:36s
epoch 5  | loss: 0.33886 | train_f1: 0.11627 | train_auc: 0.76451 | train_accuracy: 0.68604 | valid_f1: 0.11538 | valid_auc: 0.76458 | valid_accu



CPU times: user 1h 41min 35s, sys: 9min 7s, total: 1h 50min 42s
Wall time: 1h 23min 5s


In [67]:
 clf.save_model("tabnet_model_1")

Successfully saved model at tabnet_model_1.zip


'tabnet_model_1.zip'

In [68]:
pred_1 = clf.predict(X_valid)
score(y_valid,pred_1)

accuracy:  0.8682001995953612
precision:  0.9880670124805931
recall:  0.6111673577559574
f1:  0.7552042609167595
