In [1]:
import numpy as np  # For matrix operations and numerical processing
import pandas as pd  # For munging tabular data
import sklearn as sk  # For access to a variety of machine learning models
import matplotlib.pyplot as plt  # For charts and visualizations
from time import gmtime, strftime  # For labeling SageMaker models, endpoints, etc.
import sys  # For writing outputs to notebook

In [2]:
import catboost as cb
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split

In [3]:
def first_look(df):
    """Display basic information about the dataframe"""
    df.columns = map(str.lower, df.columns)
    
    display(df.info())
    display(df.head(5))
    display(df.describe(include='all'))
    display('Доля пустых строк:', df.isna().mean()[df.isna().mean().values > 0])
    display('Количество дублирующихся строк:', df.duplicated().sum())
    

In [4]:
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [5]:
train = pd.read_csv("train_dataset_hackathon_mkb.csv",encoding='cp1251', sep=';')

In [6]:
first_look(train)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17891 entries, 0 to 17890
Columns: 124 entries, id_contract to citizenship_name
dtypes: float64(108), int64(4), object(12)
memory usage: 16.9+ MB


None

Unnamed: 0,id_contract,id_client,sign_date,ip_flag,target,f1100,f1110,f1150,f1160,f1170,...,winnernumber_95_ever,signednumber_95_ever,sum_95_ever,flag_disqualification,count_change_year,count_change_ever,birthdate,age,sex_name,citizenship_name
0,1,1847,01JAN2018:00:00:00,0,0,1298961000.0,2154000.0,1125573000.0,,150010000.0,...,,,,,,,,,,
1,2,4650,01JAN2018:00:00:00,1,0,,,,,,...,,,,,,,,,,
2,3,4770,01JAN2018:00:00:00,0,0,73374000.0,,73374000.0,,,...,169.0,168.0,18351739.0,,,1.0,,,,
3,4,12237,01JAN2018:00:00:00,0,0,1937488000.0,122828000.0,610328000.0,,809426000.0,...,,,,,,,,,,
4,5,9988,01JAN2018:00:00:00,1,0,,,,,,...,,,,,,,,,,


Unnamed: 0,id_contract,id_client,sign_date,ip_flag,target,f1100,f1110,f1150,f1160,f1170,...,winnernumber_95_ever,signednumber_95_ever,sum_95_ever,flag_disqualification,count_change_year,count_change_ever,birthdate,age,sex_name,citizenship_name
count,17891.0,17891.0,17891,17891.0,17891.0,6936.0,1420.0,6341.0,346.0,1499.0,...,6784.0,6784.0,6784.0,5.0,239.0,1509.0,1419,1419.0,2168,2128
unique,,,439,,,,,,,,...,,,,,,,844,,2,4
top,,,09JAN2019:00:00:00,,,,,,,,...,,,,,,,05MAR1987:00:00:00,,мужской,Российская Федерация
freq,,,198,,,,,,,,...,,,,,,,125,,1265,2124
mean,8946.0,6630.652786,,0.195629,0.413001,3184582000.0,113635700.0,2287078000.0,49817800.0,3389027000.0,...,354.557341,323.424823,254479000.0,1.0,1.502092,2.325381,,41.916843,,
std,5164.831169,3659.132885,,0.396695,0.492387,30621840000.0,941383200.0,20162910000.0,256143300.0,20767660000.0,...,1339.773863,1258.815709,1140965000.0,0.0,1.511509,2.316937,,10.769157,,
min,1.0,1.0,,0.0,0.0,-182000.0,1000.0,1000.0,-1175000.0,1000.0,...,0.0,0.0,0.0,1.0,1.0,1.0,,21.0,,
25%,4473.5,3452.0,,0.0,0.0,433500.0,73000.0,718000.0,3694000.0,113000.0,...,17.0,12.0,4296130.0,1.0,1.0,1.0,,32.0,,
50%,8946.0,6560.0,,0.0,0.0,4308000.0,276000.0,4984000.0,3694000.0,7239000.0,...,75.0,58.0,18657070.0,1.0,1.0,2.0,,40.0,,
75%,13418.5,10080.0,,0.0,1.0,66616500.0,4409500.0,55944000.0,11956500.0,148500000.0,...,219.25,176.25,93590970.0,1.0,2.0,2.0,,49.0,,


'Доля пустых строк:'

f1100                0.612319
f1110                0.920630
f1150                0.645576
f1160                0.980661
f1170                0.916215
                       ...   
count_change_ever    0.915656
birthdate            0.920686
age                  0.920686
sex_name             0.878822
citizenship_name     0.881058
Length: 119, dtype: float64

'Количество дублирующихся строк:'

0

In [7]:
train.isna().mean()[train.isna().mean().values > 0.9]

f1110                    0.920630
f1160                    0.980661
f1170                    0.916215
f1190                    0.915041
f1320                    0.994578
f1350                    0.946454
f1360                    0.923481
f1450                    0.959141
f1530                    0.957968
f1550                    0.949639
f2310                    0.983176
flag_disqualification    0.999721
count_change_year        0.986641
count_change_ever        0.915656
birthdate                0.920686
age                      0.920686
dtype: float64

In [8]:
train.isna().mean()[train.isna().mean().values == 0]

id_contract    0.0
id_client      0.0
sign_date      0.0
ip_flag        0.0
target         0.0
dtype: float64

Столбцы birthdate, age, sex_name, citizenship_name удалим из за большого количества пропусков, остальные пропуски заполним 0.

In [9]:
train.drop(['birthdate', 'age', 'sex_name', 'citizenship_name','datefirstreg','taxreg_regdate', 'taxregpay_regdate'], axis = 1, inplace=True)

In [10]:
train['sign_date'] = pd.to_datetime(train['sign_date'], format='%d%b%Y:%H:%M:%S')

In [11]:
# train[['sign_date','datefirstreg','taxreg_regdate', 'taxregpay_regdate']] = train[['sign_date','datefirstreg','taxreg_regdate', 'taxregpay_regdate']].apply(pd.to_datetime, format='%d%b%Y:%H:%M:%S')

In [12]:
# train[['sign_date','datefirstreg','taxreg_regdate', 'taxregpay_regdate']].isna().sum()

In [13]:
train = train.fillna(0)

In [14]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17891 entries, 0 to 17890
Columns: 117 entries, id_contract to count_change_ever
dtypes: datetime64[ns](1), float64(107), int64(4), object(5)
memory usage: 16.0+ MB


In [15]:
test = pd.read_csv("test_dataset_hackathon_mkb.csv",encoding='cp1251', sep=';')

In [16]:
first_look(test)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7330 entries, 0 to 7329
Columns: 123 entries, id_contract to citizenship_name
dtypes: float64(108), int64(3), object(12)
memory usage: 6.9+ MB


None

Unnamed: 0,id_contract,id_client,sign_date,ip_flag,f1100,f1110,f1150,f1160,f1170,f1180,...,winnernumber_95_ever,signednumber_95_ever,sum_95_ever,flag_disqualification,count_change_year,count_change_ever,birthdate,age,sex_name,citizenship_name
0,17892,3620,08APR2019:00:00:00,1,,,,,,,...,,,,,,,,,,
1,17893,4101,08APR2019:00:00:00,0,1906000.0,,1906000.0,,,,...,7439.0,7100.0,1187411000.0,,,,,,,
2,17894,9589,08APR2019:00:00:00,0,147000.0,,147000.0,,,,...,362.0,344.0,98362600.0,,1.0,3.0,,,,
3,17895,11546,08APR2019:00:00:00,0,,,,,,,...,,,,,,,,,,
4,17896,12558,08APR2019:00:00:00,0,26000.0,,,,26000.0,,...,,,,,,1.0,,,,


Unnamed: 0,id_contract,id_client,sign_date,ip_flag,f1100,f1110,f1150,f1160,f1170,f1180,...,winnernumber_95_ever,signednumber_95_ever,sum_95_ever,flag_disqualification,count_change_year,count_change_ever,birthdate,age,sex_name,citizenship_name
count,7330.0,7330.0,7330,7330.0,2490.0,458.0,2321.0,79.0,486.0,679.0,...,2306.0,2306.0,2306.0,5.0,79.0,487.0,615,615.0,1434,1431
unique,,,248,,,,,,,,...,,,,,,,485,,2,4
top,,,27MAY2019:00:00:00,,,,,,,,...,,,,,,,01SEP1986:00:00:00,,мужской,Российская Федерация
freq,,,112,,,,,,,,...,,,,,,,21,,858,1427
mean,21556.5,6625.300819,,0.243247,1173428000.0,68750550.0,729051000.0,43164180.0,1604195000.0,95306860.0,...,232.701214,221.14961,228839900.0,1.0,1.139241,2.433265,,43.427642,,
std,2116.133069,3580.337029,,0.429072,17364080000.0,645343400.0,9419810000.0,263364100.0,13983500000.0,418146200.0,...,710.966033,691.959633,1430015000.0,0.0,0.63517,2.371635,,11.080438,,
min,17892.0,4.0,,0.0,474.0,1000.0,474.0,6000.0,1000.0,-183000.0,...,0.0,0.0,0.0,1.0,1.0,1.0,,23.0,,
25%,19724.25,3778.25,,0.0,365250.0,61500.0,426000.0,1167000.0,62000.0,172000.0,...,10.0,9.0,3528744.0,1.0,1.0,1.0,,34.0,,
50%,21556.5,6758.5,,0.0,3014000.0,436000.0,3039000.0,3592000.0,880500.0,1277000.0,...,50.0,44.0,18132080.0,1.0,1.0,1.0,,42.0,,
75%,23388.75,9801.25,,0.0,30515250.0,20154250.0,20428000.0,8238000.0,65002000.0,14851500.0,...,153.0,146.0,71836480.0,1.0,1.0,3.0,,51.0,,


'Доля пустых строк:'

f1100                0.660300
f1110                0.937517
f1150                0.683356
f1160                0.989222
f1170                0.933697
                       ...   
count_change_ever    0.933561
birthdate            0.916098
age                  0.916098
sex_name             0.804366
citizenship_name     0.804775
Length: 119, dtype: float64

'Количество дублирующихся строк:'

0

In [17]:
test.drop(['birthdate', 'age', 'sex_name', 'citizenship_name','datefirstreg','taxreg_regdate', 'taxregpay_regdate'], axis = 1, inplace=True)

In [18]:
test['sign_date'] = pd.to_datetime(test['sign_date'], format='%d%b%Y:%H:%M:%S')

In [19]:
# test[['sign_date','datefirstreg','taxreg_regdate', 'taxregpay_regdate']] = test[['sign_date','datefirstreg','taxreg_regdate', 'taxregpay_regdate']].apply(pd.to_datetime, format='%d%b%Y:%H:%M:%S')

In [20]:
test = test.fillna(0)

In [21]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7330 entries, 0 to 7329
Columns: 116 entries, id_contract to count_change_ever
dtypes: datetime64[ns](1), float64(107), int64(3), object(5)
memory usage: 6.5+ MB


In [22]:
X = train.drop(['target'], axis = 1)
y = train.loc[:, 'target']

print(X.shape, y.shape)

(17891, 116) (17891,)


In [23]:
# Splitting X,y into train and validation data 
x_train, x_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state = 42)

In [24]:
model = CatBoostClassifier(iterations = 5000,
                          eval_metric='Accuracy',
                          od_wait=500,
                          task_type='GPU',
                         )
model.fit(x_train, y_train,
         eval_set=(x_valid, y_valid),
         verbose_eval=100,
         use_best_model=True,
         plot=True
         )

CatBoostError: Bad value for num_feature[non_default_doc_idx=0,feature_idx=64]="Частная собственность": Cannot convert 'b'\xd0\xa7\xd0\xb0\xd1\x81\xd1\x82\xd0\xbd\xd0\xb0\xd1\x8f \xd1\x81\xd0\xbe\xd0\xb1\xd1\x81\xd1\x82\xd0\xb2\xd0\xb5\xd0\xbd\xd0\xbd\xd0\xbe\xd1\x81\xd1\x82\xd1\x8c'' to float