In [1]:
import pandas as pd
import numpy as np
from typing import defaultdict

In [2]:
data_path = '..\\data\\'

In [3]:
contrib_data = pd.read_csv(data_path + 'test_data_npo\\npo_cntrbtrs.csv')

In [4]:
def columns_report(data: pd.DataFrame) -> None:
    """Print the row count of ``data`` followed by a per-column quality table.

    The table has one line per column of ``data`` with these fields:

    * ``Column``     - the column label.
    * ``Unique``     - result of ``Series.nunique()`` (missing values are not counted).
    * ``Duplicates`` - number of entries flagged by ``Series.duplicated()``.
    * ``Missing``    - number of missing values.
    * ``Missing%``   - share of missing values, in percent.
    * ``HitRate%``   - share of non-missing values, in percent.

    Nothing is returned; the report is written to standard output.
    """
    print('Rows:', data.shape[0], end='\n'*2)

    records = []
    for name in data.columns:
        series = data[name]
        # The missing-value mask feeds three of the fields, so build it once.
        missing_mask = series.isna()
        missing_share = missing_mask.mean()
        records.append({
            'Column': name,
            'Unique': series.nunique(),
            'Duplicates': series.duplicated().sum(),
            'Missing': missing_mask.sum(),
            'Missing%': missing_share * 100,
            'HitRate%': (1 - missing_share) * 100,
        })

    summary = pd.DataFrame(records)
    print(summary.to_string(index=False))

In [5]:
# Clean a copy so the frame read from CSV (contrib_data) is not modified by the steps below.
contrib_cleaned = contrib_data.copy()

In [6]:
# Show dtypes and non-null counts per column before any cleaning is applied.
contrib_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106692 entries, 0 to 106691
Data columns (total 12 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   npo_accnt_id           106692 non-null  object 
 1   clnt_id                106692 non-null  object 
 2   accnt_pnsn_schm        106655 non-null  float64
 3   slctn_nmbr             106692 non-null  int64  
 4   npo_accnt_status       106692 non-null  int64  
 5   npo_accnt_status_date  106692 non-null  object 
 6   npo_blnc               51287 non-null   float64
 7   npo_pmnts_sum          100009 non-null  float64
 8   npo_pmnts_nmbr         100009 non-null  float64
 9   npo_frst_pmnt_date     100009 non-null  object 
 10  npo_lst_pmnt_date      100009 non-null  object 
 11  npo_ttl_incm           100888 non-null  float64
dtypes: float64(5), int64(2), object(5)
memory usage: 9.8+ MB


In [7]:
# Per-column unique / duplicate / missing counts on the uncleaned copy.
columns_report(contrib_cleaned)

Rows: 106692

               Column  Unique  Duplicates  Missing  Missing%   HitRate%
         npo_accnt_id  106692           0        0  0.000000 100.000000
              clnt_id  102898        3794        0  0.000000 100.000000
      accnt_pnsn_schm     292      106399       37  0.034679  99.965321
           slctn_nmbr       4      106688        0  0.000000 100.000000
     npo_accnt_status       2      106690        0  0.000000 100.000000
npo_accnt_status_date    7088       99604        0  0.000000 100.000000
             npo_blnc   42136       64555    55405 51.929854  48.070146
        npo_pmnts_sum   70135       36556     6683  6.263825  93.736175
       npo_pmnts_nmbr     227      106464     6683  6.263825  93.736175
   npo_frst_pmnt_date    5039      101652     6683  6.263825  93.736175
    npo_lst_pmnt_date    5245      101446     6683  6.263825  93.736175
         npo_ttl_incm   87356       19335     5804  5.439958  94.560042


In [8]:
# Replace missing pension-scheme codes with the most frequent code, then store
# the column as integer-valued categories.
contrib_cleaned['accnt_pnsn_schm'] = (
    contrib_cleaned['accnt_pnsn_schm']
    .astype(float)
    .fillna(contrib_cleaned['accnt_pnsn_schm'].mode()[0])
    .astype(int)
    .astype('category')
)

In [9]:
# Pairwise correlation (pandas default method) between the balance and payments-sum columns.
contrib_cleaned[['npo_blnc', 'npo_pmnts_sum']].corr()

Unnamed: 0,npo_blnc,npo_pmnts_sum
npo_blnc,1.0,0.998212
npo_pmnts_sum,0.998212,1.0


In [10]:
# Rows with account status 1 whose balance is missing or negative.
blnc_filter = (
    (contrib_cleaned['npo_blnc'].isna() | contrib_cleaned['npo_blnc'].lt(0))
    & contrib_cleaned['npo_accnt_status'].eq(1)
)
# Those rows take the payments sum as their balance (0 when the sum is missing too).
# This must run before negative payment sums are blanked further down.
contrib_cleaned['npo_blnc'] = contrib_cleaned['npo_blnc'].mask(
    blnc_filter, contrib_cleaned['npo_pmnts_sum'].fillna(0)
)
# Rows with account status 0 get no balance at all.
contrib_cleaned['npo_blnc'] = contrib_cleaned['npo_blnc'].mask(
    contrib_cleaned['npo_accnt_status'].eq(0)
)
# Negative payment sums are treated as missing.
contrib_cleaned['npo_pmnts_sum'] = contrib_cleaned['npo_pmnts_sum'].mask(
    contrib_cleaned['npo_pmnts_sum'].lt(0)
)
# Negative or missing total income becomes 0.
contrib_cleaned['npo_ttl_incm'] = (
    contrib_cleaned['npo_ttl_incm']
    .mask(contrib_cleaned['npo_ttl_incm'].lt(0), 0)
    .fillna(0)
)

In [11]:
# Keep only rows that still have a payments sum, then renumber the index from 0.
contrib_cleaned = (
    contrib_cleaned
    .dropna(subset=['npo_pmnts_sum'])
    .reset_index(drop=True)
)

In [12]:
# Same per-column report again, now on the cleaned frame, to check the remaining missing values.
columns_report(contrib_cleaned)

Rows: 99999

               Column  Unique  Duplicates  Missing  Missing%   HitRate%
         npo_accnt_id   99999           0        0  0.000000 100.000000
              clnt_id   97439        2560        0  0.000000 100.000000
      accnt_pnsn_schm     267       99732        0  0.000000 100.000000
           slctn_nmbr       4       99995        0  0.000000 100.000000
     npo_accnt_status       2       99997        0  0.000000 100.000000
npo_accnt_status_date    6988       93011        0  0.000000 100.000000
             npo_blnc   35897       64101    56345 56.345563  43.654437
        npo_pmnts_sum   70133       29866        0  0.000000 100.000000
       npo_pmnts_nmbr     227       99772        0  0.000000 100.000000
   npo_frst_pmnt_date    5039       94960        0  0.000000 100.000000
    npo_lst_pmnt_date    5245       94754        0  0.000000 100.000000
         npo_ttl_incm   84746       15253        0  0.000000 100.000000


In [13]:
# Summary statistics with extra tail percentiles (1%, 10%, 90%, 99%) besides the quartiles.
contrib_cleaned.describe(percentiles=[.01, .1, .25, .5, .75, .9, .99])

Unnamed: 0,slctn_nmbr,npo_accnt_status,npo_blnc,npo_pmnts_sum,npo_pmnts_nmbr,npo_ttl_incm
count,99999.0,99999.0,43654.0,99999.0,99999.0,99999.0
mean,1.433184,0.436544,150418.6,96486.71,39.99794,25721.82
std,0.835755,0.49596,5577909.0,3561202.0,46.873419,594509.3
min,0.0,0.0,0.0,-0.0,1.0,0.0
1%,0.0,0.0,48.78,20.42,1.0,0.0
10%,0.0,0.0,315.106,126.01,1.0,92.11
25%,1.0,0.0,1490.555,2572.095,4.0,611.815
50%,2.0,0.0,14558.1,19065.04,23.0,3580.88
75%,2.0,1.0,107283.6,76590.23,62.0,15633.2
90%,2.0,1.0,289358.3,195091.1,103.0,44703.13


In [15]:
# contrib_cleaned.to_feather(data_path + '\\interim\\contrib_test.frt')

  if _pandas_api.is_sparse(col):
