In [1]:
import os
import pandas as pd
import numpy as np
import pgeocode
from typing import defaultdict
from pandarallel import pandarallel

In [2]:
data_path = '..\\data\\'

In [3]:
clients_data = pd.read_csv(data_path + 'train_data_npo\\npo_clnts.csv')
contributors_data = pd.read_csv(data_path + 'train_data_npo\\npo_cntrbtrs.csv')
transactions_data = pd.read_csv(data_path + 'train_data_npo\\npo_trnsctns.csv')

## Data Preprocessing

In [4]:
def columns_report(data: pd.DataFrame) -> None:

    print('Rows:', data.shape[0], end='\n'*2)

    stats = defaultdict(list)
    for col in data.columns:
        stats['Column'].append(col)
        stats['Unique'].append(data[col].nunique())
        stats['Duplicates'].append(data[col].duplicated().sum())
        stats['Missing'].append(data[col].isna().sum())
        stats['Missing%'].append(data[col].isna().mean() * 100)
        stats['HitRate%'].append((1 - data[col].isna().mean()) * 100)
    stats = pd.DataFrame(stats)

    print(stats.to_string(index=False))

### Clients

In [5]:
clients_cleaned = clients_data.copy()

In [6]:
clients_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230446 entries, 0 to 230445
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   clnt_id     230446 non-null  object 
 1   gndr        230442 non-null  float64
 2   slctn_nmbr  230446 non-null  int64  
 3   age         230423 non-null  float64
 4   brth_yr     230423 non-null  float64
 5   pstl_code   220185 non-null  object 
 6   city        230446 non-null  int64  
dtypes: float64(3), int64(2), object(2)
memory usage: 12.3+ MB


In [7]:
postal_code_pattern = r'^\d{6}$'
clients_cleaned['pstl_code'] = clients_cleaned.pstl_code.str.replace('.0', '')
clients_cleaned.loc[~(clients_cleaned.pstl_code.str.match(postal_code_pattern, na=False)), 'pstl_code'] = np.nan
clients_cleaned['pstl_code'] = clients_cleaned.pstl_code.astype(float).fillna(-1).astype(int).astype(str).replace('-1', '_MISSING_')
pcode_mode = clients_cleaned.loc[clients_cleaned.pstl_code != '_MISSING_', 'pstl_code'].mode()[0]
clients_cleaned['pstl_code']= clients_cleaned.pstl_code.replace('_MISSING_', pcode_mode)

In [8]:
postal_decoder = pgeocode.Nominatim('ru')

def get_geosub_from_postalcode(postal_code: str, decoder=postal_decoder) -> str:

    geo = decoder.query_postal_code(postal_code).state_name

    return geo

In [9]:
print(os.cpu_count())

16


In [10]:
pandarallel.initialize(progress_bar=True, nb_workers=12)
clients_cleaned['geo'] = clients_cleaned.pstl_code.parallel_apply(get_geosub_from_postalcode)

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=19204), Label(value='0 / 19204')))…

In [13]:
pd.to_feather(clients_cleaned, '../data/interim/clnts.frt', index=False)

AttributeError: module 'pandas' has no attribute 'to_feather'

In [14]:
!pip show pandas

Name: pandas
Version: 2.0.0
Summary: Powerful data structures for data analysis, time series, and statistics
Home-page: 
Author: 
Author-email: The Pandas Development Team <pandas-dev@python.org>
License: BSD 3-Clause License

Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team
All rights reserved.

Copyright (c) 2011-2023, Open source contributors.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote

In [None]:
clients_cleaned = pd.read_feather('../data/interim/clnts.frt')

In [None]:
clients_cleaned.head()

Unnamed: 0,clnt_id,gndr,slctn_nmbr,age,brth_yr,pstl_code,city
0,0xD1930AC934CD0D4AB6141DF45637EFE4,1.0,2,74.0,1949.0,188544,0
1,0x25DCE99C94913C42A49F739DDA3AE81A,0.0,2,62.0,1961.0,398046,0
2,0xCF29021EFE24454693866565B7CAB0D8,1.0,0,69.0,1954.0,162609,0
3,0xA6D80BE13B6FEB4591C5E93F715C0B98,0.0,0,38.0,1984.0,162623,0
4,0x88C8CB57D2D6B14393894C0CBB8A9A4A,0.0,2,38.0,1984.0,173025,0


In [116]:
columns_report(clients_cleaned)

Rows: 230446

    Column  Unique  Duplicates  Missing  Missing%   HitRate%
   clnt_id  230407          39        0  0.000000 100.000000
      gndr       2      230443        4  0.001736  99.998264
slctn_nmbr       4      230442        0  0.000000 100.000000
       age     120      230325       23  0.009981  99.990019
   brth_yr     120      230325       23  0.009981  99.990019
 pstl_code   14494      215952        0  0.000000 100.000000
      city       2      230444        0  0.000000 100.000000


In [54]:
clients_data.dropna(subset=['gndr', 'age', 'brth_yr'], inplace=True)

In [64]:
int(float(clients_data.pstl_code.mode()[0]))

162600

In [52]:
clients_data[clients_data.gndr.isna()].clnt_id.unique().shape

(1,)

In [51]:
clients_data[clients_data.age.isna()].clnt_id.unique().shape

(20,)

In [53]:
clients_data[clients_data.brth_yr.isna()].clnt_id.unique().shape

(20,)

In [None]:
clients_data['gndr'] = clients_data.gndr.astype(int)
clients_data['age'] = clients_data.age.astype(int)
clients_data['brth_yr'] = clients_data.brth_yr.astype(int)
clients_data['pstl_code'] = clients_data.pstl_code.astype(int)

In [26]:
clients_data.drop_duplicates(keep='first', inplace=True)

In [27]:
columns_report(clients_data)

Rows: 230417

    Column  Unique  Duplicates  Missing%   HitRate%
   clnt_id  230407          10  0.000000 100.000000
      gndr       2      230414  0.001736  99.998264
slctn_nmbr       4      230413  0.000000 100.000000
       age     120      230296  0.009982  99.990018
   brth_yr     120      230296  0.009982  99.990018
 pstl_code   20869      209547  4.453230  95.546770
      city       2      230415  0.000000 100.000000


In [29]:
clients_data[clients_data.clnt_id.duplicated(keep=False)]

Unnamed: 0,clnt_id,gndr,slctn_nmbr,age,brth_yr,pstl_code,city
12290,0x8AFF8023389707479E4C8411AD581878,1.0,1,42.0,1981.0,143009.0,1
12291,0x8AFF8023389707479E4C8411AD581878,1.0,1,42.0,1981.0,143009.0,1
13424,0x6C8B491B724CC64CAD0054961C3CB732,0.0,1,32.0,1991.0,157202.0,0
13425,0x6C8B491B724CC64CAD0054961C3CB732,0.0,1,32.0,1991.0,157202.0,0
21856,0xA977088158E02044A0BD07B6A2DAD606,0.0,1,36.0,1986.0,0.0,0
21857,0xA977088158E02044A0BD07B6A2DAD606,0.0,1,36.0,1986.0,0.0,0
36579,0x11839E6B65F4334588180E7C2412ED28,1.0,1,56.0,1966.0,391430.0,0
36580,0x11839E6B65F4334588180E7C2412ED28,1.0,1,56.0,1966.0,391430.0,0
66835,0xB135FCE001FEE94AB39B077230F8616F,0.0,1,47.0,1976.0,185509.0,1
66836,0xB135FCE001FEE94AB39B077230F8616F,0.0,1,47.0,1976.0,185509.0,1


In [None]:
clients_data.reset_index(drop=True)