In [1]:
import pandas as pd
import numpy as np
from pandarallel import pandarallel

In [2]:
# Relative path prefix of the data directory (one level above the working directory).
# The literal uses Windows-style backslash separators and ends with a separator,
# so file names are appended to it by plain string concatenation.
data_path = '..\\data\\'

In [3]:
# External-source features depend on client data (in particular, on the geo position)
# and on the time point of the target.
clnts_must_cols = ['clnt_id', 'geo']
clnts_data = pd.read_feather(data_path + 'interim\\clnts.frt')

target_must_cols = ['clnt_id', 'year', 'quarter', 'transactions_count', 'paid_avg_correct']
target_data = pd.read_feather(data_path + 'interim\\correct_target.frt')
# The year is taken as the first four characters of the quarter's string form.
target_data['year'] = target_data['quarter'].astype(str).str.slice(0, 4).astype(int)

In [4]:
# Lower-cased words/names used when matching `geo` values
# (krai, oblast, republic, Moscow, Saint Petersburg, Russian Federation).
subjects = ['край', 'область', 'республика', 'москва', 'санкт-петербург', 'российская федерация']
# Only the first three entries are removed from region names.
stop_words = subjects[:3]


def rm_stop_words(s: str, stop_words=stop_words):
    """Remove stop words from a whitespace-separated string.

    Parameters
    ----------
    s : str
        Text to clean. Non-string values (e.g. ``None`` or a float ``NaN``
        standing in for a missing value) are returned unchanged instead of
        raising ``AttributeError`` on ``.split()``.
    stop_words : collection of str
        Tokens to drop; compared for exact equality with each token of ``s``.
        Defaults to the module-level ``stop_words`` as it was at definition time.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (so runs of whitespace
        are collapsed), or ``s`` itself when it is not a string.
    """
    # Missing values survive the preceding `.str` accessors as NaN; pass them through.
    if not isinstance(s, str):
        return s

    return ' '.join(w for w in s.split() if w not in stop_words)

In [5]:
# Initialise pandarallel with 12 workers and no progress bar;
# required before `parallel_apply` is called on a pandas object.
pandarallel.initialize(nb_workers=12, progress_bar=False)

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [6]:
# Normalise region names: trim, lower-case, drop any parenthesised part
# (with the whitespace before it), remove stop words, trim again.
clnts_data['geo'] = (
    clnts_data['geo']
    .str.strip()
    .str.lower()
    .str.replace(r'\s*\(.*\)', '', regex=True)
    .parallel_apply(rm_stop_words)
    .str.strip()
)

In [7]:
# Attach each client's geo to the target rows, then order by client and quarter.
target_clnt_data = (
    pd.merge(
        target_data[target_must_cols],
        clnts_data[clnts_must_cols],
        how='left',
        on='clnt_id',
    )
    .sort_values(by=['clnt_id', 'quarter'])
    .reset_index(drop=True)
)
# The quarter is kept as a string from here on.
target_clnt_data['quarter'] = target_clnt_data['quarter'].astype(str)

In [8]:
# Independent copy of the target/client frame.
# NOTE(review): not referenced again in the visible cells — confirm whether it is still needed.
external_ftrs_data = target_clnt_data.copy()

In [9]:
def generate_ext_feature(target_data: pd.DataFrame, ext_data_local: pd.DataFrame,
                         ext_data_global: pd.DataFrame, value_col: str, id_col: str = 'clnt_id',
                         ext_time_col: str = 'quarter', geo: bool = True) -> pd.DataFrame:
    """Build one external feature per (client, quarter) row of ``target_data``.

    The value is looked up in three steps, each filling only what is still missing:

    1. ``ext_data_local`` matched on ``ext_time_col`` (and on ``'geo'`` when ``geo`` is true);
    2. ``ext_data_global`` matched on ``ext_time_col`` only;
    3. the mean of the values obtained in steps 1-2 over all returned rows.

    Parameters
    ----------
    target_data : pd.DataFrame
        Must contain ``id_col``, ``'quarter'``, ``ext_time_col`` and, when ``geo``
        is true, ``'geo'``. Only the first row per (``id_col``, ``'quarter'``) is used.
    ext_data_local : pd.DataFrame
        Must contain ``ext_time_col``, ``value_col`` and, when ``geo`` is true, ``'geo'``.
        If a key occurs more than once, the first row wins.
    ext_data_global : pd.DataFrame
        Must contain ``ext_time_col`` and ``value_col``; first row per key wins.
    value_col : str
        Name of the value column in both external frames and in the result.
    id_col : str
        Name of the client identifier column in ``target_data``.
    ext_time_col : str
        Name of the time key shared by ``target_data`` and the external frames.
    geo : bool
        Whether the local lookup is additionally keyed by ``'geo'``.

    Returns
    -------
    pd.DataFrame
        Columns ``[id_col, 'quarter', value_col]``, one row per unique
        (``id_col``, ``'quarter'``) in ``target_data`` order, with a fresh RangeIndex.
        Values stay NaN only if no value could be found for any row.

    Raises
    ------
    KeyError
        If a required column is missing from one of the frames.

    Notes
    -----
    None of the input frames is modified. pandas matches null merge keys with
    each other, so a missing ``geo`` in ``target_data`` can pick up a row of
    ``ext_data_local`` whose ``geo`` is also missing.
    """
    target_keys = [id_col, 'quarter']
    local_keys = [ext_time_col, 'geo'] if geo else [ext_time_col]
    # Explicit names instead of pandas' implicit '_x' / '_y' merge suffixes.
    local_col = value_col + '_local'
    global_col = value_col + '_global'

    # De-duplicate the lookup tables BEFORE merging: the result is the same as
    # merging first and keeping the first match, but rows are never multiplied.
    local_values = (
        ext_data_local[local_keys + [value_col]]
        .drop_duplicates(subset=local_keys, keep='first')
        .rename(columns={value_col: local_col})
    )
    global_values = (
        ext_data_global[[ext_time_col, value_col]]
        .drop_duplicates(subset=[ext_time_col], keep='first')
        .rename(columns={value_col: global_col})
    )

    # Only the key columns of the target are needed; dict.fromkeys drops the
    # repeated name when ext_time_col is 'quarter' while keeping the order.
    base_cols = list(dict.fromkeys(target_keys + local_keys))
    merged = (
        target_data[base_cols]
        .drop_duplicates(subset=target_keys, keep='first')
        .merge(local_values, on=local_keys, how='left')
        .merge(global_values, on=ext_time_col, how='left')
    )

    # Local value first, then the global one, then the overall mean (NaN-skipping).
    feature = merged[local_col].fillna(merged[global_col])
    feature = feature.fillna(feature.mean())

    return merged[target_keys].assign(**{value_col: feature})

### Соотношение среднедушевых денежных доходов населения с величиной прожиточного минимума (процент)

In [10]:
# Load the ratio series and expose its 'date' column under the name 'quarter'.
avgpercapinc_subsistmin_ratio = (
    pd.read_feather(data_path + 'interim\\avgpercapinc_subsistmin_ratio.frt')
    .rename(columns={'date': 'quarter'})
)
# Rows whose geo is 'российская федерация' (Russian Federation), re-indexed from 0.
avgpercapinc_subsistmin_ratio_rf = (
    avgpercapinc_subsistmin_ratio
    .loc[avgpercapinc_subsistmin_ratio['geo'] == 'российская федерация']
    .reset_index(drop=True)
)

In [13]:
# Feature keyed by quarter and geo; the Russian Federation rows serve as the global fallback.
ext_ftr_1 = generate_ext_feature(
    target_data=target_clnt_data,
    ext_data_local=avgpercapinc_subsistmin_ratio,
    ext_data_global=avgpercapinc_subsistmin_ratio_rf,
    value_col='avgpercapinc_subsistmin_ratio',
)

### Возрастной коэффициент смертности (промилле (0,1 процента))

In [16]:
# Load the death-rate series and expose its 'date' column under the name 'quarter'.
# NOTE(review): the feature built from this frame is joined on 'year', so the file
# presumably already carries a 'year' column — confirm.
agespec_death_rate = (
    pd.read_feather(data_path + 'interim\\agespec_death_rate.frt')
    .rename(columns={'date': 'quarter'})
)
# Rows whose geo is 'российская федерация' (Russian Federation), re-indexed from 0.
agespec_death_rate_rf = (
    agespec_death_rate
    .loc[agespec_death_rate['geo'] == 'российская федерация']
    .reset_index(drop=True)
)

In [17]:
# Feature keyed by year and geo; the Russian Federation rows serve as the global fallback.
ext_ftr_2 = generate_ext_feature(
    target_data=target_clnt_data,
    ext_data_local=agespec_death_rate,
    ext_data_global=agespec_death_rate_rf,
    value_col='agespec_death_rate',
    ext_time_col='year',
)

### Ожидаемая продолжительность жизни при рождении

In [18]:
# Load the life-expectancy series and expose its 'date' column under the name 'quarter'.
# NOTE(review): the feature built from this frame is joined on 'year', so the file
# presumably already carries a 'year' column — confirm.
lifeexp_at_birth = (
    pd.read_feather(data_path + 'interim\\lifeexp_at_birth.frt')
    .rename(columns={'date': 'quarter'})
)
# Rows whose geo is 'российская федерация' (Russian Federation), re-indexed from 0.
lifeexp_at_birth_rf = (
    lifeexp_at_birth
    .loc[lifeexp_at_birth['geo'] == 'российская федерация']
    .reset_index(drop=True)
)

In [19]:
# Feature keyed by year and geo; the Russian Federation rows serve as the global fallback.
ext_ftr_3 = generate_ext_feature(
    target_data=target_clnt_data,
    ext_data_local=lifeexp_at_birth,
    ext_data_global=lifeexp_at_birth_rf,
    value_col='lifeexp_at_birth',
    ext_time_col='year',
)

### Число умерших за год (человек, значение показателя за год)

In [20]:
# Load the yearly deaths series and expose its 'date' column under the name 'quarter'.
# NOTE(review): the feature built from this frame is joined on 'year', so the file
# presumably already carries a 'year' column — confirm.
deaths_per_year = (
    pd.read_feather(data_path + 'interim\\deaths_per_year.frt')
    .rename(columns={'date': 'quarter'})
)
# Rows whose geo is 'российская федерация' (Russian Federation), re-indexed from 0.
deaths_per_year_rf = (
    deaths_per_year
    .loc[deaths_per_year['geo'] == 'российская федерация']
    .reset_index(drop=True)
)

In [21]:
# Feature keyed by year and geo; the Russian Federation rows serve as the global fallback.
ext_ftr_4 = generate_ext_feature(
    target_data=target_clnt_data,
    ext_data_local=deaths_per_year,
    ext_data_global=deaths_per_year_rf,
    value_col='deaths_per_year',
    ext_time_col='year',
)

### Индекс потребительских цен на товары и услуги к концу предыдущего месяца

In [22]:
# Load the CPI series, cast 'date' to string, tag every row with the
# Russian Federation geo, and expose 'date' under the name 'quarter'.
cpi_data = (
    pd.read_feather(data_path + 'interim\\cpi.frt')
    .assign(
        date=lambda df: df.date.astype(str),
        geo='российская федерация',
    )
    .rename(columns={'date': 'quarter'})
)
# Every row carries that geo, so this selects the whole frame, re-indexed from 0.
cpi_data_rf = (
    cpi_data
    .loc[cpi_data['geo'] == 'российская федерация']
    .reset_index(drop=True)
)

In [23]:
# Feature keyed by quarter and geo; the local and global frames hold the same rows here.
ext_ftr_5 = generate_ext_feature(
    target_data=target_clnt_data,
    ext_data_local=cpi_data,
    ext_data_global=cpi_data_rf,
    value_col='cpi',
)

### Инфляция по месяцам в годовом исчислении

In [24]:
# Load the inflation series, cast 'date' to string, tag every row with the
# Russian Federation geo, and expose 'date' under the name 'quarter'.
inflation_data = (
    pd.read_feather(data_path + 'interim\\inflation.frt')
    .assign(
        date=lambda df: df.date.astype(str),
        geo='российская федерация',
    )
    .rename(columns={'date': 'quarter'})
)
# Every row carries that geo, so this selects the whole frame, re-indexed from 0.
inflation_data_rf = (
    inflation_data
    .loc[inflation_data['geo'] == 'российская федерация']
    .reset_index(drop=True)
)

In [25]:
# Feature keyed by quarter and geo; the local and global frames hold the same rows here.
ext_ftr_6 = generate_ext_feature(
    target_data=target_clnt_data,
    ext_data_local=inflation_data,
    ext_data_global=inflation_data_rf,
    value_col='inflation',
)

In [26]:
# Combine the six per-feature frames into one table on the (client, quarter) key.
key_merge = ['clnt_id', 'quarter']
ext_ftrs = (
    ext_ftr_1
    .merge(ext_ftr_2, how='inner', on=key_merge)
    .merge(ext_ftr_3, how='inner', on=key_merge)
    .merge(ext_ftr_4, how='inner', on=key_merge)
    .merge(ext_ftr_5, how='inner', on=key_merge)
    .merge(ext_ftr_6, how='inner', on=key_merge)
)

In [32]:
# ext_ftrs.to_feather(data_path + 'interim\\external_features.frt')

  if _pandas_api.is_sparse(col):
