В данном Jupyter-ноутбуке формируются дополнительные признаки для связей (u, v) на основе трансцендентных связей, то есть связей, имеющих общий узел с (u, v).

Учитываются как все такие связи вместе, так и входящие и исходящие по отдельности.

In [None]:
!pip install pyarrow
!pip install fastparquet

In [None]:
import pandas as pd
import numpy as np

from scipy import stats
from tqdm.notebook import tqdm

# Raise the pandas display limits so wide/long frames are not truncated.
for _option_name, _option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_option_name, _option_value)

import warnings

# Suppress all warnings for the rest of the session.
warnings.filterwarnings(action='ignore')

# Чтение данных

In [2]:
%%time
test_df = pd.read_csv("./data/test.csv")

CPU times: total: 18.3 s
Wall time: 19.4 s


In [3]:
%%time
train_df = pd.read_csv("./data/train.csv")

CPU times: total: 1min 1s
Wall time: 2min 11s


In [4]:
%%time
attr_df = pd.read_csv("./data/attr.csv")

CPU times: total: 5.42 s
Wall time: 6.29 s


# Функция генерации признаков по одному эго-графу

In [250]:
def _adjacent_x1(edges: pd.DataFrame, anchor: str, direction: str) -> pd.DataFrame:
    """Pair every edge (u, v) with the x1 of the edges touching one of its nodes.

    Parameters
    ----------
    edges : pd.DataFrame
        Frame with at least the columns 'u', 'v' and 'x1'.
    anchor : str
        'u' to look at edges touching the source node of (u, v),
        'v' to look at edges touching its target node.
    direction : str
        'out' selects edges that start at the anchor node,
        anything else selects edges that end at the anchor node.

    Returns
    -------
    pd.DataFrame
        Columns 'u', 'v', 'x1_y': one row per (edge, adjacent edge) pair,
        where 'x1_y' is the x1 of the adjacent edge. Edges without any
        adjacent edge keep a single row with a missing 'x1_y'.
    """
    # The endpoint of (u, v) that is NOT the anchor.
    partner = 'v' if anchor == 'u' else 'u'
    # `near` is the end of the adjacent edge that sits on the anchor node,
    # `far` is its other end.
    near, far = ('u_y', 'v_y') if direction == 'out' else ('v_y', 'u_y')

    # Explicit renaming keeps the column names independent of how pandas
    # applies merge suffixes to overlapping / shared key columns.
    neighbours = edges[['u', 'v', 'x1']].rename(
        columns={'u': 'u_y', 'v': 'v_y', 'x1': 'x1_y'}
    )
    joined = pd.merge(
        edges[['u', 'v']],
        neighbours,
        how='left',
        left_on=anchor,
        right_on=near,
    )
    # Drop adjacent edges whose far end is the partner node: that is the edge
    # (u, v) itself or its reverse (v, u). Unmatched rows (missing far end)
    # compare as "different" and are therefore kept.
    keep = joined[partner] != joined[far]
    return joined.loc[keep, ['u', 'v', 'x1_y']]


def _mean_and_median(pairs: pd.DataFrame, prefix: str) -> pd.DataFrame:
    """Aggregate 'x1_y' per (u, v) into '<prefix>_mean' and '<prefix>_median'."""
    summary = pairs.groupby(by=['u', 'v'], dropna=False)['x1_y'].agg(['mean', 'median'])
    summary.columns = [f'{prefix}_mean', f'{prefix}_median']
    return summary.reset_index()


def generate_ego_features(ego_df: pd.DataFrame) -> pd.DataFrame:
    """Generate extra features for one ego graph from the edges adjacent to each edge.

    For every edge (u, v) the x1 values of the other edges that share a node
    with it are aggregated (mean and median), separately for edges leaving u,
    entering u, leaving v, entering v, and for all edges touching u / v.
    The edge itself and its reverse (v, u) are excluded from the aggregates.

    Parameters
    ----------
    ego_df : pd.DataFrame
        Frame of a single ego graph. Must contain the columns:
            ego_id: int
            u: int
            v: int
            x1: float
        It may contain additional columns (t, x2, x3); they are kept
        unchanged in the returned frame.

    Returns
    -------
    pd.DataFrame
        The input rows, in the same order and with a fresh RangeIndex,
        extended with the columns:
            'left_out_mean', 'left_in_mean', 'right_out_mean',
            'right_in_mean', 'left_mean', 'right_mean',
            'left_out_median', 'left_in_median', 'right_out_median',
            'right_in_median', 'left_median', 'right_median', 'reverse'
        'reverse' is the x1 of the reverse edge (v, u). Any feature without
        supporting edges is NaN.
    """
    base_columns = list(ego_df.columns)
    edges = ego_df[['u', 'v', 'x1']]

    # u', v' below denote neighbour nodes of u and v.
    pair_sets = {
        'left_out': _adjacent_x1(edges, 'u', 'out'),   # edges (u, v')
        'left_in': _adjacent_x1(edges, 'u', 'in'),     # edges (u', u)
        'right_out': _adjacent_x1(edges, 'v', 'out'),  # edges (v, v')
        'right_in': _adjacent_x1(edges, 'v', 'in'),    # edges (u', v)
    }
    # All edges touching u / touching v, regardless of direction.
    pair_sets['left'] = pd.concat([pair_sets['left_out'], pair_sets['left_in']])
    pair_sets['right'] = pd.concat([pair_sets['right_out'], pair_sets['right_in']])

    # Each summary holds one row per (u, v), so the left merges never add rows.
    result = ego_df
    for prefix, pairs in pair_sets.items():
        result = pd.merge(
            result,
            _mean_and_median(pairs, prefix),
            how='left',
            on=['u', 'v'],
        )

    # x1 of the reverse edge (v, u): swap the endpoint columns and join back.
    # The lookup is de-duplicated (first occurrence wins) so that repeated
    # (u, v) rows in the input cannot multiply the output rows.
    reverse_lookup = edges.drop_duplicates(subset=['u', 'v']).rename(
        columns={'u': 'v', 'v': 'u', 'x1': 'reverse'}
    )
    result = pd.merge(result, reverse_lookup, how='left', on=['u', 'v'])

    # Documented column order: all means, then all medians, then 'reverse'.
    feature_columns = [
        f'{prefix}_{stat}'
        for stat in ('mean', 'median')
        for prefix in pair_sets
    ]
    feature_columns.append('reverse')

    # Harmonic-mean features and derived ratio / abs-difference features were
    # deprecated earlier and are intentionally not produced.
    return result[base_columns + feature_columns]

# Запуск функции в цикле и сохранение результатов в файлы

In [None]:
## Every distinct ego_id present in the test frame
ego_ids = pd.unique(test_df['ego_id'])

In [554]:
%%time
new_ego_list = list() ## Список для следующего сохранения

for ego_n, ego_id in tqdm(enumerate(ego_ids), total = len(ego_ids)):
    ## Получение одного эго-графа и рассчёт доп. параметров
    ego_df = test_df[test_df['ego_id'] == ego_id]
    ego_df = generate_ego_features(ego_df)
    new_ego_list.append(ego_df)
    
    ## Сохранение промежуточных результатов
    if (ego_n % 2500 == 0 or ego_n == len(ego_ids)-1) and ego_n != 0:
        new_ego_df = pd.concat(new_ego_list)
        new_ego_df.to_parquet(f'./data/graph_features/ego_{ego_n}.gz', index=None)
        new_ego_list = list()

  0%|          | 0/20596 [00:00<?, ?it/s]

CPU times: total: 55min 47s
Wall time: 1h 3min
