In [1]:
# `defaultdict` lives in `collections`; `typing` does not document exporting it,
# so importing it from there only works by accident of typing's internals.
from collections import defaultdict

import numpy as np
import pandas as pd

In [2]:
# Root data directory, relative to the notebook; later cells build file paths
# by string concatenation, so the trailing separator is required.
# NOTE(review): backslash separators only resolve on Windows — confirm target OS.
data_path = '..\\data\\'

In [3]:
# Raw contributors table, read as-is from the training data folder.
contrib_data = pd.read_csv(f'{data_path}train_data_npo\\npo_cntrbtrs.csv')

In [4]:
def columns_report(data: pd.DataFrame) -> None:
    """Print a per-column data-quality summary of ``data``.

    The row count is printed first, followed by a table with one line per
    column of ``data``:

    * ``Unique``     - number of distinct non-null values (``Series.nunique``).
    * ``Duplicates`` - number of entries that repeat an earlier entry of the
      same column (``Series.duplicated().sum()``).
    * ``Missing``    - number of null entries.
    * ``Missing%``   - share of null entries, in percent.
    * ``HitRate%``   - share of non-null entries, in percent.

    Args:
        data: Frame to summarise. It is not modified.

    Returns:
        None. The report is written to standard output.
    """
    print('Rows:', data.shape[0], end='\n' * 2)

    # Explicit keys fix the column order and keep the header in the output even
    # when ``data`` has no columns at all.
    stats = {
        'Column': [],
        'Unique': [],
        'Duplicates': [],
        'Missing': [],
        'Missing%': [],
        'HitRate%': [],
    }

    for col in data.columns:
        values = data[col]
        # Scan for nulls once per column and reuse the mask for all three
        # missing-value figures.
        null_mask = values.isna()
        missing_share = null_mask.mean()

        stats['Column'].append(col)
        stats['Unique'].append(values.nunique())
        stats['Duplicates'].append(values.duplicated().sum())
        stats['Missing'].append(null_mask.sum())
        stats['Missing%'].append(missing_share * 100)
        stats['HitRate%'].append((1 - missing_share) * 100)

    print(pd.DataFrame(stats).to_string(index=False))

In [5]:
# Work on an independent deep copy so the raw frame stays untouched.
contrib_cleaned = contrib_data.copy(deep=True)

In [6]:
# Dtypes and non-null counts of the working copy, before any cleaning.
contrib_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 248947 entries, 0 to 248946
Data columns (total 12 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   npo_accnt_id           248947 non-null  object 
 1   clnt_id                248947 non-null  object 
 2   accnt_pnsn_schm        248866 non-null  float64
 3   slctn_nmbr             248947 non-null  int64  
 4   npo_accnt_status       248947 non-null  int64  
 5   npo_accnt_status_date  248947 non-null  object 
 6   npo_blnc               119305 non-null  float64
 7   npo_pmnts_sum          233181 non-null  float64
 8   npo_pmnts_nmbr         233181 non-null  float64
 9   npo_frst_pmnt_date     233181 non-null  object 
 10  npo_lst_pmnt_date      233181 non-null  object 
 11  npo_ttl_incm           235141 non-null  float64
dtypes: float64(5), int64(2), object(5)
memory usage: 22.8+ MB


In [7]:
# Baseline uniqueness / missing-value report on the still-uncleaned copy.
columns_report(contrib_cleaned)

Rows: 248947

               Column  Unique  Duplicates  Missing  Missing%   HitRate%
         npo_accnt_id  248947           0        0  0.000000 100.000000
              clnt_id  230426       18521        0  0.000000 100.000000
      accnt_pnsn_schm     357      248589       81  0.032537  99.967463
           slctn_nmbr       4      248943        0  0.000000 100.000000
     npo_accnt_status       2      248945        0  0.000000 100.000000
npo_accnt_status_date    8031      240916        0  0.000000 100.000000
             npo_blnc   94189      154757   129642 52.076145  47.923855
        npo_pmnts_sum  157074       91872    15766  6.333075  93.666925
       npo_pmnts_nmbr     233      248713    15766  6.333075  93.666925
   npo_frst_pmnt_date    5802      243144    15766  6.333075  93.666925
    npo_lst_pmnt_date    6178      242768    15766  6.333075  93.666925
         npo_ttl_incm  189873       59073    13806  5.545759  94.454241


In [8]:
# Replace missing pension-scheme codes with the most frequent code, then store
# the codes as integer-valued categories.
contrib_cleaned['accnt_pnsn_schm'] = (
    contrib_cleaned['accnt_pnsn_schm']
    .astype(float)
    .fillna(contrib_cleaned['accnt_pnsn_schm'].mode()[0])
    .astype(int)
    .astype('category')
)

In [9]:
# Pairwise correlation (pandas default method) between balance and payments sum.
# NOTE(review): presumably the justification for substituting the payments sum
# for a missing balance in the next cell — confirm.
contrib_cleaned[['npo_blnc', 'npo_pmnts_sum']].corr()

Unnamed: 0,npo_blnc,npo_pmnts_sum
npo_blnc,1.0,0.999829
npo_pmnts_sum,0.999829,1.0


In [10]:
# Blank negative payment sums FIRST. They are treated as invalid, and doing it
# before the balance imputation guarantees an invalid (negative) sum is never
# copied into `npo_blnc`, which would re-create the negative balances the
# imputation is meant to remove.
contrib_cleaned.loc[contrib_cleaned.npo_pmnts_sum < 0, 'npo_pmnts_sum'] = np.nan

# Rows with account status 1 whose balance is missing or negative get the
# payments sum as a stand-in balance (0 when the payments sum is missing too).
blnc_filter = (
    (contrib_cleaned.npo_blnc.isna() | (contrib_cleaned.npo_blnc < 0))
    & (contrib_cleaned.npo_accnt_status == 1)
)
contrib_cleaned.loc[blnc_filter, 'npo_blnc'] = (
    contrib_cleaned.loc[blnc_filter, 'npo_pmnts_sum'].fillna(0)
)

# Rows with account status 0 carry no balance at all.
# NOTE(review): presumably status 0 means a closed account — confirm.
contrib_cleaned.loc[contrib_cleaned.npo_accnt_status == 0, 'npo_blnc'] = np.nan

# Total income: negative values are floored at 0 and missing values become 0.
contrib_cleaned['npo_ttl_incm'] = contrib_cleaned.npo_ttl_incm.clip(lower=0).fillna(0)

In [11]:
# Keep only rows that have a payments sum, then renumber the index from 0.
contrib_cleaned = (
    contrib_cleaned
    .dropna(subset=['npo_pmnts_sum'])
    .reset_index(drop=True)
)

In [12]:
# Same report after cleaning, to check which columns still contain missing values.
columns_report(contrib_cleaned)

Rows: 233166

               Column  Unique  Duplicates  Missing  Missing%   HitRate%
         npo_accnt_id  233166           0        0  0.000000 100.000000
              clnt_id  220745       12421        0  0.000000 100.000000
      accnt_pnsn_schm     333      232833        0  0.000000 100.000000
           slctn_nmbr       4      233162        0  0.000000 100.000000
     npo_accnt_status       2      233164        0  0.000000 100.000000
npo_accnt_status_date    7954      225212        0  0.000000 100.000000
             npo_blnc   80045      153120   131566 56.425894  43.574106
        npo_pmnts_sum  157070       76096        0  0.000000 100.000000
       npo_pmnts_nmbr     233      232933        0  0.000000 100.000000
   npo_frst_pmnt_date    5802      227364        0  0.000000 100.000000
    npo_lst_pmnt_date    6178      226988        0  0.000000 100.000000
         npo_ttl_incm  184882       48284        0  0.000000 100.000000


In [13]:
# Summary statistics with extra tail percentiles (1%, 10%, 90%, 99%) on top of
# the quartiles.
contrib_cleaned.describe(percentiles=[.01, .1, .25, .5, .75, .9, .99])

Unnamed: 0,slctn_nmbr,npo_accnt_status,npo_blnc,npo_pmnts_sum,npo_pmnts_nmbr,npo_ttl_incm
count,233166.0,233166.0,101600.0,233166.0,233166.0,233166.0
mean,1.429136,0.435741,342705.7,171546.0,39.978543,34892.96
std,0.837221,0.495855,69126090.0,40554150.0,45.285748,5172632.0
min,0.0,0.0,0.0,-0.0,1.0,0.0
1%,0.0,0.0,49.27,20.42,1.0,0.0
10%,0.0,0.0,310.37,126.9,1.0,92.11
25%,1.0,0.0,1483.6,2607.762,4.0,609.8825
50%,2.0,0.0,14252.65,18821.5,24.0,3631.965
75%,2.0,1.0,105436.1,75304.84,62.0,15439.01
90%,2.0,1.0,284990.8,193581.7,103.0,44707.99


In [14]:
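# Export of the cleaned frame (disabled). `data_path` already ends with a
# separator, so the suffix must not start with another one.
# contrib_cleaned.to_feather(data_path + 'interim\\contrib.frt')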

  if _pandas_api.is_sparse(col):
