In [3]:
import pandas as pd
import numpy as np
from pathlib import Path

In [170]:
DATA_PATH = Path('trivec_internship/data')
train_path = DATA_PATH / 'polyphar_train.csv'
val_path = DATA_PATH / 'polyphar_validate.csv'
test_path = DATA_PATH / 'polyphar_test.csv'

In [171]:
df_train = pd.read_csv(train_path)
df_val = pd.read_csv(val_path)
df_test = pd.read_csv(test_path)

df_train.head()

Unnamed: 0,from,rel,to
0,243,442,441
1,238,442,526
2,451,442,299
3,230,442,526
4,297,442,622


In [172]:
def print_statistics(df):
    from_ = set(df['from'])
    to = set(df['to'])
    print('from:', len(from_))
    print('to:', len(to))
    print('from & to:', len(from_.intersection(to)))
    print('from - to:', len(from_.difference(to)))
    print('to - from:', len(to.difference(from_)))

In [173]:
print('Train:')
print_statistics(df_train)
print('===========')

print('Validate:')
print_statistics(df_val)
print('===========')

print('Test:')
print_statistics(df_test)
print('===========')

Train:
from: 616
to: 634
from & to: 605
from - to: 11
to - from: 29
Validate:
from: 611
to: 632
from & to: 598
from - to: 13
to - from: 34
Test:
from: 611
to: 632
from & to: 599
from - to: 12
to - from: 33


В каждом из наборов есть лекраства, которые присутсвуют в отношениях только с одной стороны. Проанализируем, нет ли ситуации, что в val/test есть лекарство в from/to, а в train его нет с этой стороны

In [174]:
def check_equaity(df_check, check_title, dfs_titles):
    from_ = set(df_check['from'])
    to = set(df_check['to'])
    diff_from_to = from_.difference(to)
    diff_to_from = to.difference(from_)
    print(check_title)
    print('from - to:', len(diff_from_to))
    print('to - from:', len(diff_to_from))
    print('===========')
    
    for df, title in dfs_titles:
        print(title)
        print('diff_from_to & to:', len(set(df['to']).intersection(diff_from_to)))
        print('diff_to_from & from:', len(set(df['from']).intersection(diff_to_from)))
        print('===========')

In [175]:
check_equaity(df_train, 'train', [(df_val, 'val'), (df_test, 'test')])

train
from - to: 11
to - from: 29
val
diff_from_to & to: 0
diff_to_from & from: 0
test
diff_from_to & to: 0
diff_to_from & from: 0


Это означает, что все лекарства, которые встречаются во взаимодействиях с одной стороны в train, встречаются с "этой же стороны" и в validate, и в test.

Добавим новые лекарства в val и test так, чтобы их не было в train. При этом разобьем "новизну" на 3 категории: 
1. Лекарства нет в train
2. Лекарство есть в train, но встречается только в from, в val и test - в from и to
3. Лекарство есть в train, но встречается только в to, в val и test - в from и to

In [176]:
def check_ratio(df_train, df_val, df_test):
    all_train = set(df_train['from']).union(df_train['to'])
    all_val = set(df_val['from']).union(df_val['to'])
    all_test = set(df_test['from']).union(df_test['to'])
    print('train:', len(all_train))
    print('val:', len(all_val))
    print('test:', len(all_test))
    print('===========')
    print('val - train:', all_val.difference(all_train))
    print('test - train:', all_test.difference(all_train))
    print('train - val:', all_train.difference(all_val))
    print('train - test:', all_train.difference(all_test))

In [177]:
check_ratio(df_train, df_val, df_test)

train: 645
val: 645
test: 644
val - train: set()
test - train: set()
train - val: set()
train - test: {99}


In [178]:
from_cnt = dict(df_train['from'].value_counts())
from_cnt

{202: 52798,
 517: 51536,
 208: 49059,
 398: 43424,
 478: 42935,
 127: 42776,
 512: 38876,
 217: 37717,
 534: 36108,
 82: 34942,
 461: 33499,
 479: 32810,
 593: 31955,
 622: 31838,
 403: 31685,
 639: 31166,
 296: 30349,
 415: 30269,
 238: 29986,
 32: 29903,
 196: 29868,
 585: 29479,
 406: 29450,
 199: 29093,
 614: 29002,
 285: 28467,
 373: 28383,
 157: 26963,
 124: 26928,
 301: 26832,
 624: 26508,
 621: 26487,
 416: 26090,
 482: 25287,
 432: 24611,
 366: 24408,
 617: 24276,
 358: 24253,
 138: 24160,
 93: 24089,
 19: 23513,
 254: 23471,
 271: 22505,
 266: 22475,
 570: 22440,
 176: 22352,
 135: 21904,
 74: 21893,
 243: 21848,
 224: 21819,
 601: 21695,
 468: 21430,
 242: 20317,
 369: 20263,
 230: 19864,
 553: 19724,
 439: 19564,
 315: 19399,
 558: 19282,
 442: 19157,
 263: 19108,
 467: 19069,
 452: 18664,
 555: 18431,
 607: 18083,
 595: 17624,
 536: 17259,
 451: 16872,
 417: 16785,
 86: 16540,
 484: 16515,
 227: 16489,
 49: 16168,
 642: 15774,
 54: 15328,
 390: 15278,
 1: 15050,
 381: 150

In [179]:
to_cnt = dict(df_train['to'].value_counts())
to_cnt

{310: 44591,
 350: 39711,
 24: 39019,
 266: 38447,
 421: 35550,
 240: 34948,
 484: 34468,
 529: 34425,
 505: 33312,
 285: 32897,
 497: 31829,
 132: 29271,
 555: 29132,
 230: 28753,
 601: 28526,
 337: 27563,
 163: 26549,
 560: 26421,
 553: 25809,
 49: 25688,
 212: 25133,
 299: 24988,
 2: 24568,
 639: 24448,
 428: 23933,
 400: 23539,
 29: 22528,
 622: 22225,
 93: 22214,
 551: 22097,
 245: 21582,
 242: 21424,
 115: 21076,
 26: 21064,
 543: 20879,
 15: 20520,
 441: 20465,
 488: 20195,
 159: 20020,
 577: 19875,
 325: 19828,
 544: 19815,
 390: 19805,
 73: 19786,
 439: 19713,
 528: 19585,
 332: 19562,
 358: 19311,
 190: 19124,
 271: 19102,
 539: 19061,
 590: 19037,
 589: 19004,
 499: 18718,
 526: 18353,
 369: 18317,
 638: 18187,
 482: 18169,
 106: 17987,
 14: 17307,
 7: 17166,
 523: 17106,
 468: 16963,
 373: 16891,
 63: 16701,
 204: 16037,
 221: 15746,
 417: 15660,
 251: 15504,
 194: 15319,
 155: 15117,
 642: 15044,
 263: 14916,
 462: 14913,
 105: 14753,
 580: 14559,
 397: 14363,
 467: 14275,

Выберем лекарства следующим образом:
1. Уберем все вхождения лекарств, значение которых в 5000 <= from + to < 7000 и 0 <= from + to < 200
2. Уберем все вхождения лекарств из from (в to оставим), количество вхождений которых в 1000 <= from < 2000
2. Уберем все вхождения лекарств из to (в from оставим), количество вхождений которых в 1000 <= to < 2000

In [188]:
def select_by_border(remedies, set_range, check_from=True, check_to=True):
    deletion = []
    for remedy in remedies:
        summ = 0
        if check_from and remedy in from_cnt:
            summ += from_cnt[remedy]
        if check_to and remedy in to_cnt:
            summ += to_cnt[remedy]
        if summ in set_range:
            deletion.append(remedy)
    return set(deletion)

In [189]:
all_train_remedies = set(df_train['from']).union(df_train['to'])
full_deletion = select_by_border(all_train_remedies, set(range(5000, 7000)).union(set(range(0,200))))
print(full_deletion)

{513, 521, 16, 531, 20, 21, 535, 538, 542, 548, 37, 38, 39, 550, 42, 557, 561, 53, 567, 572, 61, 574, 64, 80, 598, 599, 89, 610, 99, 102, 107, 619, 110, 625, 118, 633, 634, 123, 126, 128, 129, 130, 131, 142, 143, 148, 153, 156, 169, 184, 185, 198, 206, 207, 214, 219, 228, 229, 260, 274, 278, 279, 300, 307, 308, 320, 323, 328, 340, 346, 359, 371, 380, 401, 422, 423, 427, 430, 431, 435, 448, 460, 471, 477, 485, 496, 503}


In [190]:
df_add = pd.DataFrame(columns = ['from', 'rel', 'to'])
df_add = pd.concat([df_add, df_train.loc[df_train['from'].isin(full_deletion) | df_train['to'].isin(full_deletion)]])
print('df_add.shape:', df_add.shape)
df_train = df_train.loc[~df_train['from'].isin(full_deletion) & ~df_train['to'].isin(full_deletion)]

df_add.shape: (310277, 3)


In [193]:
from_to_train_remedies = set(df_train['from']).intersection(df_train['to'])
from_deletion = select_by_border(from_to_train_remedies, set(range(1000,2000)), check_to=False)
to_deletion = select_by_border(from_to_train_remedies, set(range(1000,2000)), check_from=False)
print('from_deletion:', len(from_deletion))
print('to_deletion:', len(to_deletion))
print('intersection:', len(from_deletion.intersection(to_deletion)))

from_deletion: 49
to_deletion: 62
intersection: 14


In [194]:
from_deletion = set(list(from_deletion.difference(to_deletion))[:30])
to_deletion = set(list(to_deletion)[:30])

In [195]:
df_add = pd.concat([df_add, df_train.loc[df_train['from'].isin(from_deletion) | df_train['to'].isin(from_deletion)]])
print('df_add.shape:', df_add.shape)
df_train = df_train.loc[~df_train['from'].isin(from_deletion) & ~df_train['to'].isin(from_deletion)]

df_add.shape: (566800, 3)


In [196]:
df_add = pd.concat([df_add, df_train.loc[df_train['from'].isin(to_deletion) | df_train['to'].isin(to_deletion)]])
print('df_add.shape:', df_add.shape)
df_train = df_train.loc[~df_train['from'].isin(to_deletion) & ~df_train['to'].isin(to_deletion)]

df_add.shape: (670472, 3)


In [197]:
from sklearn.model_selection import train_test_split
add_val, add_test = train_test_split(df_add, test_size=0.5)

In [198]:
print('val - test|from:', len(set(add_val['from']).difference(set(add_test['from']))))
print('test - val|from:', len(set(add_test['from']).difference(set(add_val['from']))))
print('val - test|to:', len(set(add_val['to']).difference(set(add_test['to']))))
print('test - val|to:', len(set(add_test['to']).difference(set(add_val['to']))))

val - test|from: 1
test - val|from: 2
val - test|to: 0
test - val|to: 2


Как видно, лекарства распределились примерно равномерно 

In [199]:
df_val = pd.concat([df_val, add_val])
df_test = pd.concat([df_test, add_test])

## Сохранение

In [200]:
df_train.to_csv(f'{str(train_path)[:-4]}_new.csv', index=False)
df_val.to_csv(f'{str(val_path)[:-4]}_new.csv', index=False)
df_test.to_csv(f'{str(test_path)[:-4]}_new.csv', index=False)