# Exploratory Data Analysis

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import pgeocode
import sketch

## Read the data

In [2]:
# All three raw tables live in the same directory.
DATA_DIR = 'train_data_npo'

clients_path = f'{DATA_DIR}/npo_clnts.csv'
contributors_path = f'{DATA_DIR}/npo_cntrbtrs.csv'
transactions_path = f'{DATA_DIR}/npo_trnsctns.csv'

In [3]:
# Data dictionaries: for each source table, map a column name to its
# human-readable (Russian) description.

# Columns of the clients table.
clients_features = {
    "clnt_id" : "ID клиента",
    "slctn_nmbr" : "Номер выборки",
    "gndr" : "Пол клиента: м: 0, ж: 1",
    "age" : "Возраст клиента, лет",
    "brth_yr" : "Год рождения клиента",
    "pstl_code" : "Почтовый индекс",
    "city" : "Живет клиент в городе (1) или нет (0)"
}

# Columns of the contributors (accounts) table.
contributors_features = {
    "npo_accnt_id" : "ID счета клиента",
    # The column is spelled `clnt_id` in the data, same as in the clients table.
    "clnt_id" : "ID клиента",
    "slctn_nmbr" : "Номер выборки",
    "accnt_pnsn_schm" : "Условный код пенсионной схемы счета клиента",
    "npo_accnt_status" : "Статус счета клиента: Закрыт: 0, Открыт: 1",
    "npo_accnt_status_date" : "Дата статуса счета клиента",
    "npo_blnc" : "Баланс счета клиента, руб.",
    "npo_pmnts_sum" : "Сумма взносов клиента, руб.",
    "npo_pmnts_nmbr" : "Число взносов клиента",
    "npo_frst_pmnt_date" : "Дата первого взноса клиента",
    "npo_lst_pmnt_date" : "Дата последнего взноса клиента",
    "npo_ttl_incm" : "Сумма дохода, начисленного на счет клиента, руб."
}

# Columns of the transactions table.
transactions_features = {
    # Account identifier (not the client identifier); same description as in
    # contributors_features.
    "npo_accnt_id" : "ID счета клиента",
    "slctn_nmbr" : "Номер выборки",
    "npo_sum" : "Размер операции по счету клиента, руб.",
    "npo_operation_date" : "Дата взноса клиента",
    "npo_operation_group" : "Тип операции по счету клиента: Поступление взносов: 0, Начисление дохода: 1",
}

In [49]:
# Load the three raw tables, one DataFrame per CSV file.
clients_df, contributors_df, transactions_df = (
    pd.read_csv(csv_path)
    for csv_path in (clients_path, contributors_path, transactions_path)
)

In [50]:
# Preview the first three rows of the contributors table.
contributors_df.head(3)

Unnamed: 0,npo_accnt_id,clnt_id,accnt_pnsn_schm,slctn_nmbr,npo_accnt_status,npo_accnt_status_date,npo_blnc,npo_pmnts_sum,npo_pmnts_nmbr,npo_frst_pmnt_date,npo_lst_pmnt_date,npo_ttl_incm
0,0x90B7458B8CBFF24980DEC312BA4A1AF5,0x85390230E8955E4FA736E62B0F0E3844,1.0,0,1,2001-05-14,10158.96,2276.42,1.0,2005-08-31,2005-08-31,5638.83
1,0xC64D3161D31A8441A65224792D370CB3,0xC2B51FD4FE57F7479210FD7258DF5B0B,3.0,2,0,2018-10-30,,230084.4,55.0,2013-03-07,2017-09-10,39875.3
2,0xC92F1AA5587E2348BEF17432FBD6C2E6,0x8EC850934FF06A4AA0A856CF43B8D666,4.0,1,0,2014-01-20,,7921.95,8.0,2012-11-26,2013-06-30,207.5


## Basic EDA

### `clients_df`

### Postal codes and state names

In [6]:
#clients_df.sketch.howto("Perform basic EDA on this dataframe")

# Column dtypes and non-null counts. info() prints its report directly, so it
# does not need to be the last statement of the cell.
clients_df.info()

# Summary statistics. Kept as the LAST expression of the cell: a notebook only
# renders the value of the final expression, so calling describe() earlier in
# the cell would silently discard its result.
clients_df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230446 entries, 0 to 230445
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   clnt_id     230446 non-null  object 
 1   gndr        230442 non-null  float64
 2   slctn_nmbr  230446 non-null  int64  
 3   age         230423 non-null  float64
 4   brth_yr     230423 non-null  float64
 5   pstl_code   220185 non-null  object 
 6   city        230446 non-null  int64  
dtypes: float64(3), int64(2), object(2)
memory usage: 12.3+ MB


Let's get the regions of the clients:

In [8]:
# Count the missing entries in the postal code column and report them.
missing_pstl_codes = clients_df['pstl_code'].isna().sum()
print(f"There are {missing_pstl_codes} Nones in postal code column")

There are 10261 Nones in postal code column


In [27]:
# List every distinct postal code that cannot be converted to an integer,
# together with its position in the array of unique values.
unique_pstl_codes = clients_df['pstl_code'].unique()
for i, code in enumerate(unique_pstl_codes):
    try:
        intcode = int(float(code))
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are reported:
        #   ValueError    - non-numeric text, or NaN passed to int()
        #   TypeError     - a value float() cannot accept (e.g. None)
        #   OverflowError - an infinite value passed to int()
        # A bare `except:` would also hide KeyboardInterrupt and real bugs.
        print(i, code)

5 nan
879 ******
2856 119 454
3067 366 325
3354 366 300
3466 107 589
3787 РОССИЯ
5139 366 013
5953 117 216
6568 366 401
6798 117 463
7170 364 043
8356 366 500
9504 366 815
9667 366 522
9912 603 105
10016 366 201
10166 366 200
11063 188 309
11858 366 810
11976 129 345
12036 364 046
12326 364 022
12901 171 984
13211 364 024
14151 366 041
15014 366 236
15150 398 070
15175 366 216
15437 364 038
15572 366 313
16114 630 090
16171 364 058
16264 366 521
16450 366 613
16561 364 000
16577 652 050
16981 152 920
17106 364 006
17230 143 915
17334 30.08.
17484 366 204
18049 141 407
18164 366 222
18310 364 020
18471 630 533
18526 125 212
18644 366 100
18731 390 010
19155 364 047
19274 364 035
20138 364 059
20172 366 701
20188 366 404
20544 БЕЛАРУСЬ
20668 366 818
20820 366 314


In [55]:
# Placeholder and junk entries found in pstl_code that are not postal codes
values_to_remove = ['nan', 'None', 'null', '******', 'БЕЛАРУСЬ', 'РОССИЯ', '30.08.']

# Distinct postal codes, and the same list without the junk entries
unique_pstl_codes = clients_df['pstl_code'].unique()
filtered_pstl_codes = list(
    filter(lambda pstl_code: pstl_code not in values_to_remove, unique_pstl_codes)
)

# Turn the junk entries into missing values in the dataframe itself
clients_df['pstl_code'] = clients_df['pstl_code'].replace(
    to_replace=values_to_remove,
    value=np.nan,
)

In [58]:
nomi = pgeocode.Nominatim('ru')

# Normalise the postal codes: skip missing values, remove inner whitespace
# (e.g. "366 325" -> "366325") and convert the numeric text to an integer.
normalized_pstl_codes = (
    clients_df['pstl_code']
    .dropna()
    .str.split().str.join('')
    .apply(lambda x: round(float(x)))
)

# Look up each DISTINCT code once instead of once per client row: there are far
# fewer distinct codes than rows, and every lookup is a separate pgeocode query.
# .tolist() yields plain Python ints, the same type round(float(x)) produces.
state_name_by_code = {
    code: nomi.query_postal_code(code).state_name
    for code in normalized_pstl_codes.unique().tolist()
}

# Assignment aligns on the index, so rows without a postal code get NaN.
clients_df['state_name'] = normalized_pstl_codes.map(state_name_by_code)
clients_df.head(3)

Unnamed: 0,clnt_id,gndr,slctn_nmbr,age,brth_yr,pstl_code,city,state_name
0,0xD1930AC934CD0D4AB6141DF45637EFE4,1.0,2,74.0,1949.0,188544.0,0,Ленинградская Область
1,0x25DCE99C94913C42A49F739DDA3AE81A,0.0,2,62.0,1961.0,398046.0,0,Липецкая Область
2,0xCF29021EFE24454693866565B7CAB0D8,1.0,0,69.0,1954.0,162609.0,0,Вологодская Область


### Age

In [61]:
# Treat ages below 0 or above 110 as missing; existing NaNs stay NaN because
# both comparisons are False for them.
age = clients_df['age']
implausible_age = (age < 0) | (age > 110)
clients_df['age'] = age.mask(implausible_age)

### Gender

In [66]:
# Distinct values present in the gender column.
clients_df['gndr'].unique()

array([ 1.,  0., nan])

### City

In [75]:
# Number of clients per value of the city flag
city_info = clients_df.groupby('city').size()

# Look the two groups up by label: 1 and 0 are values of the city column
in_city = city_info.loc[1]
outside_city = city_info.loc[0]
print(f"There are {in_city} clients who live in a city and {outside_city} clients who live outside the cities")

There are 56157 clients who live in a city and 174289 clients who live outside the cities


### Drop all rows containing NaNs and save the result for future calculations

In [80]:
# Keep only the rows in which every column has a value, modifying the
# dataframe in place, then print the resulting column summary.
clients_df.dropna(how='any', axis='index', inplace=True)
clients_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 209976 entries, 0 to 230445
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   clnt_id     209976 non-null  object 
 1   gndr        209976 non-null  float64
 2   slctn_nmbr  209976 non-null  int64  
 3   age         209976 non-null  float64
 4   brth_yr     209976 non-null  float64
 5   pstl_code   209976 non-null  object 
 6   city        209976 non-null  int64  
 7   state_name  209976 non-null  object 
dtypes: float64(3), int64(2), object(3)
memory usage: 14.4+ MB


In [82]:
# Save the cleaned clients table. The output directory is created first so the
# cell also works on a fresh checkout where `clean_data/` does not exist yet;
# to_csv itself does not create missing directories.
clean_clients_path = Path('clean_data/clients_df.csv')
clean_clients_path.parent.mkdir(parents=True, exist_ok=True)
clients_df.to_csv(clean_clients_path, index=False)

### `contributors_df`

In [11]:
# Print column dtypes and non-null counts for the contributors table.
contributors_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 248947 entries, 0 to 248946
Data columns (total 12 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   npo_accnt_id           248947 non-null  object 
 1   clnt_id                248947 non-null  object 
 2   accnt_pnsn_schm        248866 non-null  float64
 3   slctn_nmbr             248947 non-null  int64  
 4   npo_accnt_status       248947 non-null  int64  
 5   npo_accnt_status_date  248947 non-null  object 
 6   npo_blnc               119305 non-null  float64
 7   npo_pmnts_sum          233181 non-null  float64
 8   npo_pmnts_nmbr         233181 non-null  float64
 9   npo_frst_pmnt_date     233181 non-null  object 
 10  npo_lst_pmnt_date      233181 non-null  object 
 11  npo_ttl_incm           235141 non-null  float64
dtypes: float64(5), int64(2), object(5)
memory usage: 22.8+ MB


In [51]:
# IDs of the accounts that have a recorded number of payments
has_payment_count = contributors_df["npo_pmnts_nmbr"].notna()
contributors_df.loc[has_payment_count, "npo_accnt_id"]

0         0x90B7458B8CBFF24980DEC312BA4A1AF5
1         0xC64D3161D31A8441A65224792D370CB3
2         0xC92F1AA5587E2348BEF17432FBD6C2E6
3         0x8DDD2D186B990F43AD5FC9F5C7124E16
5         0x52646310E2F12B4C811948D16CB97B7C
                         ...                
248942    0xBCC44D37FC13140845A0699ED3176332
248943    0x8120005056AB57EC11EE46637F6FF1F2
248944    0x8F61B56D1DD804804FDE77B593B577C5
248945    0xA4EF00505692D0BC11EE2608583314E2
248946    0xA4F400505692D0BC11EE400B261BE776
Name: npo_accnt_id, Length: 233181, dtype: object

In [54]:
# All transactions recorded for one example account
example_accnt_id = "0xBCC44D37FC13140845A0699ED3176332"
transactions_df.loc[transactions_df["npo_accnt_id"].eq(example_accnt_id)]

Unnamed: 0,npo_accnt_id,npo_sum,slctn_nmbr,npo_operation_date,npo_operation_group
12323774,0xBCC44D37FC13140845A0699ED3176332,813.0,3,2022-02-25 15:00:01,0
