In [1]:
import os
import pandas as pd
import numpy as np
import pgeocode
from typing import defaultdict
from pandarallel import pandarallel

In [2]:
# Root data directory. Built with os.path.join so the separator matches the
# host OS; the trailing '' component keeps a trailing separator, so code that
# appends a file name by plain string concatenation keeps working.
data_path = os.path.join('..', 'data', '')

In [3]:
# Load the three raw CSV tables (clients, contributors, transactions).
# Paths are assembled with os.path.join instead of embedding '\\' separators
# in the file names, so the cell does not depend on the host OS.
train_data_dir = os.path.join(data_path, 'train_data_npo')
clients_data = pd.read_csv(os.path.join(train_data_dir, 'npo_clnts.csv'))
contributors_data = pd.read_csv(os.path.join(train_data_dir, 'npo_cntrbtrs.csv'))
transactions_data = pd.read_csv(os.path.join(train_data_dir, 'npo_trnsctns.csv'))

## Data Preprocessing

In [42]:
def columns_report(data: pd.DataFrame) -> None:
    """Print the row count followed by a per-column data-quality table.

    For every column of ``data`` the table lists:

    * ``Unique``     - number of distinct non-null values (``Series.nunique``);
    * ``Duplicates`` - number of values flagged by ``Series.duplicated``;
    * ``Missing``    - number of null values;
    * ``Missing%``   - share of null values, in percent;
    * ``HitRate%``   - share of non-null values, in percent.

    Args:
        data: Frame to summarise.

    Returns:
        None. The report is written to stdout.
    """
    print('Rows:', data.shape[0], end='\n'*2)

    report_columns = ['Column', 'Unique', 'Duplicates', 'Missing', 'Missing%', 'HitRate%']

    # One record per column; a plain list of dicts needs no defaultdict.
    records = []
    for col in data.columns:
        series = data[col]
        # Compute the null mask once and reuse it for count and shares.
        missing_mask = series.isna()
        missing_share = missing_mask.mean()
        records.append({
            'Column': col,
            'Unique': series.nunique(),
            'Duplicates': series.duplicated().sum(),
            'Missing': missing_mask.sum(),
            'Missing%': missing_share * 100,
            'HitRate%': (1 - missing_share) * 100,
        })

    # Passing the column list keeps the header order fixed, including for a
    # frame that has no columns at all.
    stats = pd.DataFrame(records, columns=report_columns)

    print(stats.to_string(index=False))

### Clients

In [70]:
# Work on an independent (deep) copy so the raw clients table stays untouched.
clients_cleaned = clients_data.copy(deep=True)

In [44]:
# Show column dtypes and non-null counts of the working copy.
clients_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230446 entries, 0 to 230445
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   clnt_id     230446 non-null  object 
 1   gndr        230442 non-null  float64
 2   slctn_nmbr  230446 non-null  int64  
 3   age         230423 non-null  float64
 4   brth_yr     230423 non-null  float64
 5   pstl_code   220185 non-null  object 
 6   city        230446 non-null  int64  
dtypes: float64(3), int64(2), object(2)
memory usage: 12.3+ MB


In [71]:
# Per-column uniqueness / missing-value summary of the working copy.
columns_report(clients_cleaned)

Rows: 230446

    Column  Unique  Duplicates  Missing  Missing%   HitRate%
   clnt_id  230407          39        0  0.000000 100.000000
      gndr       2      230443        4  0.001736  99.998264
slctn_nmbr       4      230442        0  0.000000 100.000000
       age     120      230325       23  0.009981  99.990019
   brth_yr     120      230325       23  0.009981  99.990019
 pstl_code   20869      209576    10261  4.452670  95.547330
      city       2      230444        0  0.000000 100.000000


In [45]:
# Normalise postal codes: keep only exact six-digit codes and impute the rest
# with the most frequent valid code.
postal_code_pattern = r'^\d{6}$'

# Strip only a trailing '.0' (float-formatting artefact). The pattern is an
# explicit, anchored regex: the default of `regex` differs between pandas
# versions, and an unescaped '.0' treated as a regex would delete any
# character followed by a zero.
pstl_code_clean = clients_cleaned.pstl_code.str.replace(r'\.0$', '', regex=True)

# Anything that is not exactly six digits (missing values included) becomes NaN.
# Codes stay strings throughout, so leading zeros are never lost to a numeric
# round trip.
is_valid_code = pstl_code_clean.str.match(postal_code_pattern, na=False)
pstl_code_clean = pstl_code_clean.where(is_valid_code)

# Series.mode ignores NaN by default, so this is the most frequent valid code.
pcode_mode = pstl_code_clean.mode()[0]
clients_cleaned['pstl_code'] = pstl_code_clean.fillna(pcode_mode)

In [46]:
# pgeocode lookup object created for country code 'ru'; used as the default
# decoder of the postal-code helper defined in this cell.
postal_decoder = pgeocode.Nominatim('ru')

def get_geosub_from_postalcode(postal_code: str, decoder=postal_decoder) -> str:
    """Look up a postal code and return the ``state_name`` of the result.

    Args:
        postal_code: Postal code to look up.
        decoder: Object exposing ``query_postal_code``; defaults to the
            module-level ``postal_decoder``.

    Returns:
        The ``state_name`` attribute of ``decoder.query_postal_code(postal_code)``.
        NOTE(review): presumably a missing value when the code is unknown to
        the decoder - confirm against pgeocode.
    """
    return decoder.query_postal_code(postal_code).state_name

In [47]:
# Show the CPU count reported by the OS (reference for choosing a worker count).
print(os.cpu_count())

16


In [48]:
# Resolve the geo (state name) for each client's postal code.
# The lookup result depends on the code alone, so only the distinct codes are
# queried (in parallel) and the answers are mapped back onto the rows instead
# of issuing one query per row.
# The worker count is capped by the CPUs actually available on the host.
pandarallel.initialize(progress_bar=True, nb_workers=min(12, os.cpu_count() or 1))
unique_pstl_codes = clients_cleaned.pstl_code.drop_duplicates()
geo_by_pstl_code = unique_pstl_codes.parallel_apply(get_geosub_from_postalcode)
# Re-key the lookup result by the code itself so it can serve as a mapper.
geo_by_pstl_code.index = unique_pstl_codes.to_numpy()
clients_cleaned['geo'] = clients_cleaned.pstl_code.map(geo_by_pstl_code)

In [56]:
# Fill missing geo values from postal codes that share the same leading three
# characters (the part of the code treated here as the city code).
# Codes whose city code has no known geo at all stay missing.
no_geo_pstl_codes = clients_cleaned.loc[clients_cleaned.geo.isna(), 'pstl_code'].unique().tolist()

has_geo = clients_cleaned.geo.notna()
city_code = clients_cleaned.pstl_code.str[:3]

# First known geo (in row order) per city code, computed in a single pass
# rather than rescanning the whole frame for every unresolved postal code.
geo_by_city_code = (
    clients_cleaned.loc[has_geo, 'geo']
    .groupby(city_code[has_geo], sort=False)
    .first()
)

# Keep existing geo values; fill the gaps from the city-code lookup.
clients_cleaned['geo'] = clients_cleaned.geo.where(has_geo, city_code.map(geo_by_city_code))

In [65]:
# Remove rows lacking gender, age, birth year or geo, keep the first row per
# client id, and renumber the index from zero.
clients_cleaned = (
    clients_cleaned
    .dropna(subset=['gndr', 'age', 'brth_yr', 'geo'])
    .drop_duplicates(subset=['clnt_id'], keep='first')
    .reset_index(drop=True)
)

In [85]:
# Cast the numeric client attributes to int, one column at a time.
for int_col in ('gndr', 'age', 'brth_yr'):
    clients_cleaned[int_col] = clients_cleaned[int_col].astype(int)

In [186]:
# Re-run the per-column summary on the cleaned frame.
columns_report(clients_cleaned)

Rows: 229519

    Column  Unique  Duplicates  Missing  Missing%  HitRate%
   clnt_id  229519           0        0       0.0     100.0
      gndr       2      229517        0       0.0     100.0
slctn_nmbr       4      229515        0       0.0     100.0
       age     120      229399        0       0.0     100.0
   brth_yr     120      229399        0       0.0     100.0
 pstl_code   14211      215308        0       0.0     100.0
      city       2      229517        0       0.0     100.0
       geo      78      229441        0       0.0     100.0


In [187]:
# Summary statistics of the numeric columns with extra tail percentiles,
# used to spot implausible values.
clients_cleaned.describe(percentiles=[.01, .1, .25, .5, .75, .9, .99])

Unnamed: 0,gndr,slctn_nmbr,age,brth_yr,city
count,229519.0,229519.0,229519.0,229519.0,229519.0
mean,0.439454,1.438896,55.755876,1966.907912,0.242202
std,0.496322,0.84823,21.520319,21.531225,0.428417
min,0.0,0.0,-976.0,1.0,0.0
1%,0.0,0.0,30.0,1931.0,0.0
10%,0.0,0.0,38.0,1951.0,0.0
25%,0.0,1.0,46.0,1958.0,0.0
50%,0.0,2.0,56.0,1966.0,0.0
75%,1.0,2.0,65.0,1977.0,0.0
90%,1.0,2.0,71.0,1984.0,1.0


In [188]:
# Keep clients whose age lies in [14, 90] (both bounds inclusive) and
# renumber the index from zero.
age_in_range = clients_cleaned.age.between(14, 90)
clients_cleaned = clients_cleaned.loc[age_in_range].reset_index(drop=True)

In [189]:
# Summary statistics again, after the age filter, with a finer percentile grid.
clients_cleaned.describe(percentiles=[.01, .05, .1, .25, .5, .75, .9, .95, .99])

Unnamed: 0,gndr,slctn_nmbr,age,brth_yr,city
count,227007.0,227007.0,227007.0,227007.0,227007.0
mean,0.437418,1.443088,55.386274,1967.281392,0.237993
std,0.496069,0.851188,12.430646,12.428508,0.425856
min,0.0,0.0,14.0,1932.0,0.0
1%,0.0,0.0,30.0,1938.0,0.0
5%,0.0,0.0,35.0,1948.0,0.0
10%,0.0,0.0,38.0,1952.0,0.0
25%,0.0,1.0,46.0,1958.0,0.0
50%,0.0,2.0,56.0,1966.0,0.0
75%,1.0,2.0,64.0,1977.0,0.0


In [190]:
# clients_cleaned.to_feather('../data/interim/clnts.frt')

  if _pandas_api.is_sparse(col):


In [197]:
# Reload the cleaned clients table from the interim Feather file.
# When the file is absent (e.g. a fresh checkout), write it from the in-memory
# frame instead of failing, so the notebook survives Restart & Run All.
interim_clients_path = '../data/interim/clnts.frt'
if os.path.exists(interim_clients_path):
    clients_cleaned = pd.read_feather(interim_clients_path)
else:
    os.makedirs(os.path.dirname(interim_clients_path), exist_ok=True)
    clients_cleaned.to_feather(interim_clients_path)

In [198]:
# Share of the raw clients rows that survived cleaning.
retained_share = clients_cleaned.shape[0] / clients_data.shape[0]
print('Сохраненная доля исходной выборки: {:.2%}'.format(retained_share))

Сохраненная доля исходной выборки: 98.51%
