In [1]:
import numpy as np  # For matrix operations and numerical processing
import pandas as pd  # For munging tabular data
import sklearn as sk  # For access to a variety of machine learning models
import matplotlib.pyplot as plt  # For charts and visualizations
from time import gmtime, strftime  # For labeling SageMaker models, endpoints, etc.
import sys  # For writing outputs to notebook

In [11]:
def first_look(df):
    """Display basic information about the dataframe"""
    
    display(df.info())
    display(df.head(5))
    display(df.describe(include='all'))
    display('Доля пустых строк:', df.isna().mean()>0)
    display('Количество дублирующихся строк:', df.duplicated().sum())

In [3]:
# Fixed seed value, used right below to seed NumPy's global random generator.
RANDOM_SEED = 42
np.random.seed(seed=RANDOM_SEED)

In [6]:
# Load the training CSV: semicolon-delimited, Windows-1251 (cp1251) encoded.
train = pd.read_csv(
    "train_dataset_hackathon_mkb.csv",
    sep=';',
    encoding='cp1251',
)

In [12]:
# Show the first_look overview (info, head, describe, missing values, duplicates) for the training frame.
first_look(train)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17891 entries, 0 to 17890
Columns: 124 entries, id_contract to CITIZENSHIP_NAME
dtypes: float64(108), int64(4), object(12)
memory usage: 16.9+ MB


None

Unnamed: 0,id_contract,id_client,SIGN_DATE,IP_flag,TARGET,F1100,F1110,F1150,F1160,F1170,...,WINNERNUMBER_95_EVER,SIGNEDNUMBER_95_EVER,SUM_95_EVER,FLAG_DISQUALIFICATION,COUNT_CHANGE_YEAR,COUNT_CHANGE_EVER,BIRTHDATE,AGE,SEX_NAME,CITIZENSHIP_NAME
0,1,1847,01JAN2018:00:00:00,0,0,1298961000.0,2154000.0,1125573000.0,,150010000.0,...,,,,,,,,,,
1,2,4650,01JAN2018:00:00:00,1,0,,,,,,...,,,,,,,,,,
2,3,4770,01JAN2018:00:00:00,0,0,73374000.0,,73374000.0,,,...,169.0,168.0,18351739.0,,,1.0,,,,
3,4,12237,01JAN2018:00:00:00,0,0,1937488000.0,122828000.0,610328000.0,,809426000.0,...,,,,,,,,,,
4,5,9988,01JAN2018:00:00:00,1,0,,,,,,...,,,,,,,,,,


Unnamed: 0,id_contract,id_client,SIGN_DATE,IP_flag,TARGET,F1100,F1110,F1150,F1160,F1170,...,WINNERNUMBER_95_EVER,SIGNEDNUMBER_95_EVER,SUM_95_EVER,FLAG_DISQUALIFICATION,COUNT_CHANGE_YEAR,COUNT_CHANGE_EVER,BIRTHDATE,AGE,SEX_NAME,CITIZENSHIP_NAME
count,17891.0,17891.0,17891,17891.0,17891.0,6936.0,1420.0,6341.0,346.0,1499.0,...,6784.0,6784.0,6784.0,5.0,239.0,1509.0,1419,1419.0,2168,2128
unique,,,439,,,,,,,,...,,,,,,,844,,2,4
top,,,09JAN2019:00:00:00,,,,,,,,...,,,,,,,05MAR1987:00:00:00,,мужской,Российская Федерация
freq,,,198,,,,,,,,...,,,,,,,125,,1265,2124
mean,8946.0,6630.652786,,0.195629,0.413001,3184582000.0,113635700.0,2287078000.0,49817800.0,3389027000.0,...,354.557341,323.424823,254479000.0,1.0,1.502092,2.325381,,41.916843,,
std,5164.831169,3659.132885,,0.396695,0.492387,30621840000.0,941383200.0,20162910000.0,256143300.0,20767660000.0,...,1339.773863,1258.815709,1140965000.0,0.0,1.511509,2.316937,,10.769157,,
min,1.0,1.0,,0.0,0.0,-182000.0,1000.0,1000.0,-1175000.0,1000.0,...,0.0,0.0,0.0,1.0,1.0,1.0,,21.0,,
25%,4473.5,3452.0,,0.0,0.0,433500.0,73000.0,718000.0,3694000.0,113000.0,...,17.0,12.0,4296130.0,1.0,1.0,1.0,,32.0,,
50%,8946.0,6560.0,,0.0,0.0,4308000.0,276000.0,4984000.0,3694000.0,7239000.0,...,75.0,58.0,18657070.0,1.0,1.0,2.0,,40.0,,
75%,13418.5,10080.0,,0.0,1.0,66616500.0,4409500.0,55944000.0,11956500.0,148500000.0,...,219.25,176.25,93590970.0,1.0,2.0,2.0,,49.0,,


'Доля пустых строк:'

id_contract          False
id_client            False
SIGN_DATE            False
IP_flag              False
TARGET               False
                     ...  
COUNT_CHANGE_EVER     True
BIRTHDATE             True
AGE                   True
SEX_NAME              True
CITIZENSHIP_NAME      True
Length: 124, dtype: bool

'Количество дублирующихся строк:'

0

In [9]:
# Load the test CSV: semicolon-delimited, Windows-1251 (cp1251) encoded.
test = pd.read_csv(
    "test_dataset_hackathon_mkb.csv",
    sep=';',
    encoding='cp1251',
)

In [10]:
# Show the first_look overview (info, head, describe, missing values, duplicates) for the test frame.
first_look(test)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7330 entries, 0 to 7329
Columns: 123 entries, id_contract to CITIZENSHIP_NAME
dtypes: float64(108), int64(3), object(12)
memory usage: 6.9+ MB


None

Unnamed: 0,id_contract,id_client,SIGN_DATE,IP_flag,F1100,F1110,F1150,F1160,F1170,F1180,...,WINNERNUMBER_95_EVER,SIGNEDNUMBER_95_EVER,SUM_95_EVER,FLAG_DISQUALIFICATION,COUNT_CHANGE_YEAR,COUNT_CHANGE_EVER,BIRTHDATE,AGE,SEX_NAME,CITIZENSHIP_NAME
0,17892,3620,08APR2019:00:00:00,1,,,,,,,...,,,,,,,,,,
1,17893,4101,08APR2019:00:00:00,0,1906000.0,,1906000.0,,,,...,7439.0,7100.0,1187411000.0,,,,,,,
2,17894,9589,08APR2019:00:00:00,0,147000.0,,147000.0,,,,...,362.0,344.0,98362600.0,,1.0,3.0,,,,
3,17895,11546,08APR2019:00:00:00,0,,,,,,,...,,,,,,,,,,
4,17896,12558,08APR2019:00:00:00,0,26000.0,,,,26000.0,,...,,,,,,1.0,,,,


Unnamed: 0,id_contract,id_client,SIGN_DATE,IP_flag,F1100,F1110,F1150,F1160,F1170,F1180,...,WINNERNUMBER_95_EVER,SIGNEDNUMBER_95_EVER,SUM_95_EVER,FLAG_DISQUALIFICATION,COUNT_CHANGE_YEAR,COUNT_CHANGE_EVER,BIRTHDATE,AGE,SEX_NAME,CITIZENSHIP_NAME
count,7330.0,7330.0,7330,7330.0,2490.0,458.0,2321.0,79.0,486.0,679.0,...,2306.0,2306.0,2306.0,5.0,79.0,487.0,615,615.0,1434,1431
unique,,,248,,,,,,,,...,,,,,,,485,,2,4
top,,,27MAY2019:00:00:00,,,,,,,,...,,,,,,,01SEP1986:00:00:00,,мужской,Российская Федерация
freq,,,112,,,,,,,,...,,,,,,,21,,858,1427
mean,21556.5,6625.300819,,0.243247,1173428000.0,68750550.0,729051000.0,43164180.0,1604195000.0,95306860.0,...,232.701214,221.14961,228839900.0,1.0,1.139241,2.433265,,43.427642,,
std,2116.133069,3580.337029,,0.429072,17364080000.0,645343400.0,9419810000.0,263364100.0,13983500000.0,418146200.0,...,710.966033,691.959633,1430015000.0,0.0,0.63517,2.371635,,11.080438,,
min,17892.0,4.0,,0.0,474.0,1000.0,474.0,6000.0,1000.0,-183000.0,...,0.0,0.0,0.0,1.0,1.0,1.0,,23.0,,
25%,19724.25,3778.25,,0.0,365250.0,61500.0,426000.0,1167000.0,62000.0,172000.0,...,10.0,9.0,3528744.0,1.0,1.0,1.0,,34.0,,
50%,21556.5,6758.5,,0.0,3014000.0,436000.0,3039000.0,3592000.0,880500.0,1277000.0,...,50.0,44.0,18132080.0,1.0,1.0,1.0,,42.0,,
75%,23388.75,9801.25,,0.0,30515250.0,20154250.0,20428000.0,8238000.0,65002000.0,14851500.0,...,153.0,146.0,71836480.0,1.0,1.0,3.0,,51.0,,


'Доля пустых строк:'

id_contract          0.000000
id_client            0.000000
SIGN_DATE            0.000000
IP_flag              0.000000
F1100                0.660300
                       ...   
COUNT_CHANGE_EVER    0.933561
BIRTHDATE            0.916098
AGE                  0.916098
SEX_NAME             0.804366
CITIZENSHIP_NAME     0.804775
Length: 123, dtype: float64

'Количество дублирующихся строк:'

0