In [1]:
import os
import string

import pandas as pd

In [2]:
# Load the ';'-separated train and test tables. The descriptive columns are
# forced to str so values such as '0'/'1' or "04.22" are kept as text; the
# target PATIENT_ID_COUNT is typed as int and is only declared for train.
train = pd.read_csv(
    'train_dataset_train.csv',
    sep=';',
    index_col=None,
    dtype=dict(
        PATIENT_SEX=str,
        MKB_CODE=str,
        ADRES=str,
        VISIT_MONTH_YEAR=str,
        AGE_CATEGORY=str,
        PATIENT_ID_COUNT=int,
    ),
)
test = pd.read_csv(
    'test_dataset_test.csv',
    sep=';',
    index_col=None,
    dtype=dict(
        PATIENT_SEX=str,
        MKB_CODE=str,
        ADRES=str,
        VISIT_MONTH_YEAR=str,
        AGE_CATEGORY=str,
    ),
)

In [3]:
# AGE_CATEGORY label -> integer code 0..5, in the order listed here.
age2id = dict(zip(
    ["children", "young", "middleage", "elderly", "old", "centenarians"],
    range(6),
))

# PATIENT_SEX is read as the strings '0' / '1'; map them to ints.
sex2id = {'0': 0, '1': 1}

# Address -> position in the sorted list of distinct addresses seen in train.
unique_city = sorted(train["ADRES"].unique())
city2ID = dict(zip(unique_city, range(len(unique_city))))

# Uppercase ASCII letter -> 0..25 (used for the first character of MKB_CODE).
mkb2id = {letter: position for position, letter in enumerate(string.ascii_uppercase)}

In [4]:
def add_basic_features(frame):
    """Add the derived feature columns to ``frame`` in place and return it.

    Columns added, in this order:
        MONTH, YEAR: integer halves of VISIT_MONTH_YEAR split on '.'.
        AGE:   AGE_CATEGORY looked up in ``age2id``.
        SEX:   PATIENT_SEX looked up in ``sex2id``.
        MKB1:  first character of MKB_CODE looked up in ``mkb2id``.
        MKB2:  the part of MKB_CODE before the first '.', minus its first character (kept as text).
        MKB3:  the part of MKB_CODE after the first '.', or the int -1 when the code has no '.'.
        CITY:  ADRES looked up in ``city2ID``.

    The dictionary lookups are plain indexing on purpose: a value that is
    missing from a lookup table raises KeyError instead of silently becoming NaN.
    """
    # Split each value once and reuse the pieces for every derived column.
    month_year = frame["VISIT_MONTH_YEAR"].apply(lambda value: value.split('.'))
    frame["MONTH"] = month_year.apply(lambda parts: int(parts[0]))
    frame["YEAR"] = month_year.apply(lambda parts: int(parts[1]))
    frame["AGE"] = frame["AGE_CATEGORY"].apply(lambda label: age2id[label])
    frame["SEX"] = frame["PATIENT_SEX"].apply(lambda flag: sex2id[flag])

    mkb_parts = frame["MKB_CODE"].apply(lambda code: code.split('.'))
    frame["MKB1"] = frame["MKB_CODE"].apply(lambda code: mkb2id[code[0]])
    frame["MKB2"] = mkb_parts.apply(lambda parts: parts[0][1:])
    frame["MKB3"] = mkb_parts.apply(lambda parts: parts[1] if len(parts) > 1 else -1)
    frame["CITY"] = frame["ADRES"].apply(lambda address: city2ID[address])
    return frame


# Apply the same transformation to both tables so their feature columns stay in sync.
add_basic_features(train)
add_basic_features(test)

In [5]:
# Placeholder target column for the test frame. The lag merges further down
# rely on the left frame already having a PATIENT_ID_COUNT column, so that the
# history column coming from train is renamed via the merge suffix
# (PATIENT_ID_COUNT_prev1, _prev2, ...). A scalar broadcasts to every row and
# yields an integer column even when the frame is empty.
test['PATIENT_ID_COUNT'] = 0

In [6]:
def prev_date(date):
    """Return the calendar month that precedes ``date``.

    Args:
        date: Month and year as two dot-separated integers, e.g. "04.22".

    Returns:
        The previous month as a string. The month is zero-padded to two
        digits; the year keeps the width of the input year field, so
        "05.09" gives "04.09" and "01.10" gives "12.09". January rolls over
        to December of the preceding year.

    Raises:
        ValueError: if ``date`` is not exactly two dot-separated integers.
    """
    month_text, year_text = date.split('.')
    month, year = int(month_text), int(year_text)
    # Formatting the bare int would turn "09" into "9", and the result would
    # then never compare equal to zero-padded month strings such as "04.09".
    year_width = len(year_text.strip())
    if month == 1:
        month, year = 12, year - 1
    else:
        month -= 1
    return f"{month:02d}.{year:0{year_width}d}"

In [7]:
# Create the output directory used for the train/train{z}.csv snapshots.
# exist_ok=True keeps the cell re-runnable: a plain `mkdir` fails once the
# directory already exists.
os.makedirs("train", exist_ok=True)

mkdir: cannot create directory ‘train’: File exists


In [19]:
# Build one training snapshot per target month and write it to train/train{z}.csv.
#
# For z = 0..44 the target month steps back one month at a time from "04.22".
# Each snapshot takes the train rows of the target month and attaches, for each
# of the 25 preceding months, the PATIENT_ID_COUNT of the row with the same
# (PATIENT_SEX, MKB_CODE, ADRES, AGE_CATEGORY) as PATIENT_ID_COUNT_prev{i}.
# Combinations with no row in a given month get 0 through fillna(0).
snapshot_keys = ['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY']

# Split train by month once instead of re-scanning the whole frame with a
# boolean mask for every (target month, lag) pair. Row order inside each
# month is the same as in train.
rows_by_month = {month: rows for month, rows in train.groupby("VISIT_MONTH_YEAR")}
# Stand-in for months that have no rows in train: same columns and dtypes, zero rows.
no_rows = train.iloc[0:0]

start_date = "04.22"
for z in range(45):
    start_date = prev_date(start_date)
    date = start_date
    new_test = rows_by_month.get(date, no_rows)

    # Lag columns: one left-merge per preceding month.
    for i in range(1, 26):
        date = prev_date(date)
        bufer = rows_by_month.get(date, no_rows)[snapshot_keys + ["PATIENT_ID_COUNT"]]
        new_test = new_test.merge(bufer, how="left", on=snapshot_keys, suffixes=("", f"_prev{i}")).fillna(0)

    # Month-over-month differences between consecutive lags.
    for i in range(1, 25):
        new_test[f"diff{i}-{i+1}"] = new_test[f"PATIENT_ID_COUNT_prev{i}"] - new_test[f"PATIENT_ID_COUNT_prev{i+1}"]

    # Differences across longer gaps (same column order as before).
    for newer, older in [(1, 12), (1, 13), (2, 12), (2, 13), (1, 6), (1, 3)]:
        new_test[f"diff{newer}-{older}"] = new_test[f"PATIENT_ID_COUNT_prev{newer}"] - new_test[f"PATIENT_ID_COUNT_prev{older}"]

    new_test.to_csv(f"train/train{z}.csv", sep=";", index=False)

In [8]:
# Attach the lag and difference features to the test frame, using the 25
# months that precede "04.22" in train as history.
merge_keys = ['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY']
# (newer lag, older lag) pairs, in the order the diff columns are created.
diff_pairs = [(i, i + 1) for i in range(1, 25)] + [(1, 12), (1, 13), (2, 12), (2, 13), (1, 6), (1, 3)]
generated_columns = (
    [f"PATIENT_ID_COUNT_prev{i}" for i in range(1, 26)]
    + [f"diff{newer}-{older}" for newer, older in diff_pairs]
)

# This cell rebinds `test` to the merged frame. Running it a second time would
# make the merge suffix collide with the PATIENT_ID_COUNT_prev{i} columns that
# are already present, so remove anything a previous run added first.
# On a first run there is nothing to drop.
test = test.drop(columns=generated_columns, errors="ignore")

date = "04.22"
for i in range(1, 26):
    date = prev_date(date)
    bufer = train[train["VISIT_MONTH_YEAR"] == date][merge_keys + ["PATIENT_ID_COUNT"]]
    # Left merge keeps every test row; combinations without history get 0 via fillna(0).
    test = test.merge(bufer, how="left", on=merge_keys, suffixes=("", f"_prev{i}")).fillna(0)

for newer, older in diff_pairs:
    test[f"diff{newer}-{older}"] = test[f"PATIENT_ID_COUNT_prev{newer}"] - test[f"PATIENT_ID_COUNT_prev{older}"]

In [9]:
# Write the test frame to disk as a ';'-separated CSV, leaving out the index column.
test.to_csv(
    "prepared_test.csv",
    index=False,
    sep=";",
)