# 널 값에 대해 체크

## 모듈 임포트

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc

## 데이터 로드

In [2]:
path = './data/'

In [3]:
# Read the five competition tables from the data directory, in the same
# order as the names on the left-hand side.
train, test, history, new_history, merchant = (
    pd.read_csv(path + file_name)
    for file_name in (
        'train.csv',
        'test.csv',
        'historical_transactions.csv',
        'new_merchant_transactions.csv',
        'merchants.csv',
    )
)

## Null 확인

In [4]:
def check_null(data):
    """Summarize missing values for every column of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose columns are inspected.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``data.columns`` with two columns:
        ``is_null_cnt``  - number of null entries in the column,
        ``is_null_rate`` - that count divided by the number of rows
        (0.0 for every column when ``data`` has no rows).
    """
    # Null count: one vectorized pass over the frame instead of a
    # Python-level sum() over every element of every column.
    null_cnt = data.isnull().sum()

    # Null rate: reuse the counts rather than scanning the data again.
    n_rows = data.shape[0]
    if n_rows:
        null_rate = null_cnt / n_rows
    else:
        # No rows at all: report a rate of 0.0 instead of dividing by zero.
        null_rate = null_cnt.astype(float)

    # Plain arrays are passed so the result is aligned by position with
    # data.columns, exactly like the list-based construction it replaces.
    return pd.DataFrame(
        {
            'is_null_cnt': null_cnt.values,
            'is_null_rate': null_rate.values,
        },
        index=data.columns,
    )

In [69]:
print(check_null(train))

                    is_null_cnt  is_null_rate
first_active_month            0           0.0
card_id                       0           0.0
feature_1                     0           0.0
feature_2                     0           0.0
feature_3                     0           0.0
target                        0           0.0


In [70]:
print(check_null(test))

                    is_null_cnt  is_null_rate
first_active_month            1      0.000008
card_id                       0      0.000000
feature_1                     0      0.000000
feature_2                     0      0.000000
feature_3                     0      0.000000


In [71]:
print(check_null(history))

                      is_null_cnt  is_null_rate
authorized_flag                 0      0.000000
card_id                         0      0.000000
city_id                         0      0.000000
category_1                      0      0.000000
installments                    0      0.000000
category_3                 178159      0.006120
merchant_category_id            0      0.000000
merchant_id                138481      0.004757
month_lag                       0      0.000000
purchase_amount                 0      0.000000
purchase_date                   0      0.000000
category_2                2652864      0.091125
state_id                        0      0.000000
subsector_id                    0      0.000000


In [72]:
print(check_null(new_history))

                      is_null_cnt  is_null_rate
authorized_flag                 0      0.000000
card_id                         0      0.000000
city_id                         0      0.000000
category_1                      0      0.000000
installments                    0      0.000000
category_3                  55922      0.028488
merchant_category_id            0      0.000000
merchant_id                 26216      0.013355
month_lag                       0      0.000000
purchase_amount                 0      0.000000
purchase_date                   0      0.000000
category_2                 111745      0.056925
state_id                        0      0.000000
subsector_id                    0      0.000000


In [73]:
print(check_null(merchant))

                             is_null_cnt  is_null_rate
merchant_id                            0      0.000000
merchant_group_id                      0      0.000000
merchant_category_id                   0      0.000000
subsector_id                           0      0.000000
numerical_1                            0      0.000000
numerical_2                            0      0.000000
category_1                             0      0.000000
most_recent_sales_range                0      0.000000
most_recent_purchases_range            0      0.000000
avg_sales_lag3                        13      0.000039
avg_purchases_lag3                     0      0.000000
active_months_lag3                     0      0.000000
avg_sales_lag6                        13      0.000039
avg_purchases_lag6                     0      0.000000
active_months_lag6                     0      0.000000
avg_sales_lag12                       13      0.000039
avg_purchases_lag12                    0      0.000000
active_mon

### history에서 merchant_id가 null인 행을 제외했을 때 남는 card_id

In [112]:
# Unique card ids that still appear in history once rows with a missing
# merchant_id are excluded. A boolean mask with .loc is used instead of
# feeding index labels to the positional .iloc, which is only correct while
# the index is the default 0..n-1 range.
card_id = history.loc[history.merchant_id.notna(), 'card_id'].unique()

In [117]:
set(train.card_id.unique()).issubset(set(card_id))

True

In [118]:
set(test.card_id.unique()).issubset(set(card_id))

True

* history에서 merchant_id가 null인 행을 드랍해도 train/test의 모든 card_id가 history에 그대로 남아 있음 (누락되는 card_id 없음)

## First Active Month

### first active null checking

In [44]:
test[test.first_active_month.isna()]

Unnamed: 0,first_active_month,card_id,purchase_date,is_equal
11578,,C_ID_c27b4f80f7,2017-03,0


In [12]:
# Earliest purchase_date per card_id, returned as a two-column frame
# (card_id, purchase_date) so it can be merged on card_id.
first_purchase_date = (
    history
    .groupby('card_id')['purchase_date']
    .min()
    .reset_index()
)

In [23]:
# Left-join each card's earliest purchase date onto train/test and keep only
# the three columns needed for the comparison. Note that this rebinds train
# and test, so every other column is no longer available afterwards.
train = pd.merge(train, first_purchase_date, how='left', on='card_id')[
    ['first_active_month', 'card_id', 'purchase_date']
]
test = pd.merge(test, first_purchase_date, how='left', on='card_id')[
    ['first_active_month', 'card_id', 'purchase_date']
]

In [48]:
test[test.first_active_month.isna()]

Unnamed: 0,first_active_month,card_id,purchase_date,is_equal
11578,,C_ID_c27b4f80f7,2017-03,0


In [32]:
# Keep only the first seven characters of purchase_date (the values shown in
# the cell outputs look like 'YYYY-MM' afterwards).
test['purchase_date'] = test['purchase_date'].str.slice(0, 7)

In [36]:
def equal(data):
    """Return 1 if the row's first_active_month equals its purchase_date, else 0.

    ``data`` is any object exposing ``first_active_month`` and
    ``purchase_date`` attributes, e.g. a row handed over by
    ``DataFrame.apply(..., axis=1)``.
    """
    same_month = data.first_active_month == data.purchase_date
    return 1 if same_month else 0

In [38]:
# Flag rows whose first_active_month equals purchase_date (1) or not (0).
# A column-wise comparison gives the same flags as calling a row function
# through apply(axis=1), without one Python call per row; a missing value
# compares unequal either way, so such rows still get 0.
test['is_equal'] = (test['first_active_month'] == test['purchase_date']).astype(int)

In [40]:
test.is_equal.value_counts()

1    76524
0    47099
Name: is_equal, dtype: int64

In [42]:
test[test.is_equal == 0]

Unnamed: 0,first_active_month,card_id,purchase_date,is_equal
4,2015-12,C_ID_2b5e3df5c2,2017-01,0
9,2016-03,C_ID_6d8dba8475,2017-01,0
12,2016-04,C_ID_8375dfdaa2,2017-01,0
13,2016-12,C_ID_79d149c008,2017-01,0
14,2016-08,C_ID_7f1041e8e1,2017-01,0
18,2017-11,C_ID_e457bbd71d,2017-10,0
20,2016-10,C_ID_48243da9c4,2017-01,0
24,2016-05,C_ID_b54cfad8b2,2017-02,0
25,2013-03,C_ID_4b82e26faa,2017-01,0
26,2016-04,C_ID_60a453531a,2017-01,0


In [46]:
test[test.purchase_date == '2017-03']

Unnamed: 0,first_active_month,card_id,purchase_date,is_equal
10,2017-03,C_ID_4859ac9ed5,2017-03,1
34,2017-03,C_ID_b00860b6a5,2017-03,1
38,2017-03,C_ID_8eaa79db4f,2017-03,1
42,2017-03,C_ID_66715f7d2b,2017-03,1
46,2017-03,C_ID_57940f34ad,2017-03,1
72,2016-11,C_ID_562a791678,2017-03,0
102,2017-03,C_ID_403d457c52,2017-03,1
170,2017-03,C_ID_5ac4488f3c,2017-03,1
212,2017-03,C_ID_415fc459f1,2017-03,1
216,2016-08,C_ID_524851647a,2017-03,0


In [43]:
test.purchase_date.value_counts()

2017-01    42296
2017-09     8836
2017-02     8726
2017-08     8449
2017-10     8330
2017-11     7780
2017-07     7589
2017-03     6711
2017-06     6493
2017-04     6270
2017-05     6139
2017-12     6004
Name: purchase_date, dtype: int64

In [51]:
sorted(train.first_active_month.value_counts().index)

['2011-11',
 '2011-12',
 '2012-02',
 '2012-03',
 '2012-04',
 '2012-05',
 '2012-06',
 '2012-07',
 '2012-08',
 '2012-09',
 '2012-10',
 '2012-11',
 '2012-12',
 '2013-01',
 '2013-02',
 '2013-03',
 '2013-04',
 '2013-05',
 '2013-06',
 '2013-07',
 '2013-08',
 '2013-09',
 '2013-10',
 '2013-11',
 '2013-12',
 '2014-01',
 '2014-02',
 '2014-03',
 '2014-04',
 '2014-05',
 '2014-06',
 '2014-07',
 '2014-08',
 '2014-09',
 '2014-10',
 '2014-11',
 '2014-12',
 '2015-01',
 '2015-02',
 '2015-03',
 '2015-04',
 '2015-05',
 '2015-06',
 '2015-07',
 '2015-08',
 '2015-09',
 '2015-10',
 '2015-11',
 '2015-12',
 '2016-01',
 '2016-02',
 '2016-03',
 '2016-04',
 '2016-05',
 '2016-06',
 '2016-07',
 '2016-08',
 '2016-09',
 '2016-10',
 '2016-11',
 '2016-12',
 '2017-01',
 '2017-02',
 '2017-03',
 '2017-04',
 '2017-05',
 '2017-06',
 '2017-07',
 '2017-08',
 '2017-09',
 '2017-10',
 '2017-11',
 '2017-12',
 '2018-01',
 '2018-02']

In [57]:
sorted(new_history.purchase_date.str[:7].unique())

['2017-03',
 '2017-04',
 '2017-05',
 '2017-06',
 '2017-07',
 '2017-08',
 '2017-09',
 '2017-10',
 '2017-11',
 '2017-12',
 '2018-01',
 '2018-02',
 '2018-03',
 '2018-04']

## Null check new (-1, 999 같은 값을 NaN으로 치환한 뒤 history의 null 재확인)

In [9]:
# Treat placeholder codes as missing: -1 in the id columns, and both -1 and
# 999 in installments, are turned into NaN so that check_null counts them.
# Each column is assigned back explicitly: calling replace(..., inplace=True)
# on a column accessor is a chained in-place update, which newer pandas
# versions warn about and, with Copy-on-Write, do not propagate to the frame.
history['city_id'] = history['city_id'].replace(-1, np.nan)
history['installments'] = history['installments'].replace([-1, 999], np.nan)
history['merchant_category_id'] = history['merchant_category_id'].replace(-1, np.nan)
history['subsector_id'] = history['subsector_id'].replace(-1, np.nan)
history['state_id'] = history['state_id'].replace(-1, np.nan)

In [10]:
check_null(history)

Unnamed: 0,is_null_cnt,is_null_rate
authorized_flag,0,0.0
card_id,0,0.0
city_id,2478624,0.08514
category_1,0,0.0
installments,178347,0.006126
category_3,178159,0.00612
merchant_category_id,2252,7.7e-05
merchant_id,138481,0.004757
month_lag,0,0.0
purchase_amount,0,0.0


In [12]:
3 / 180

0.016666666666666666

In [13]:
2207 / len(train)

0.010930233709890698