In [1]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sn

In [2]:
time_delta = 3

In [3]:
df = pd.read_csv('parsed_morphology.csv')
df = df.drop_duplicates()
df = df.loc[~(df['test_result']=='low positive')]
extra_cols = ['patient_id', 'test_date', 'morphology_date', 'test_identifier', 'distance', ]
feat_cols = ['Krwinki białe', 'Hemoglobina', 'MCV', 'MCHC', 'Płytki krwi', '% bazocytów', '% limfocytów', '% monocytów', '% eozynocytów', 'age_at_test', 'gender', 'test_result']
df = df[[*extra_cols, *feat_cols]]
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16515 entries, 0 to 16541
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   patient_id       16515 non-null  int64  
 1   test_date        16515 non-null  object 
 2   morphology_date  16515 non-null  object 
 3   test_identifier  16515 non-null  object 
 4   distance         16515 non-null  int64  
 5   Krwinki białe    16515 non-null  float64
 6   Hemoglobina      16514 non-null  float64
 7   MCV              16514 non-null  float64
 8   MCHC             16514 non-null  float64
 9   Płytki krwi      16515 non-null  int64  
 10  % bazocytów      16403 non-null  float64
 11  % limfocytów     16398 non-null  float64
 12  % monocytów      16398 non-null  float64
 13  % eozynocytów    16402 non-null  float64
 14  age_at_test      16515 non-null  float64
 15  gender           16515 non-null  object 
 16  test_result      16515 non-null  object 
dtypes: float64(9

In [4]:
df['test_date'] = pd.to_datetime(df['test_date']).dt.date
df['morphology_date'] = pd.to_datetime(df['morphology_date']).dt.date

df = df.loc[(df['test_date'] - df['morphology_date'] <= f"{time_delta} days") & (df['test_date'] - df['morphology_date'] >= f"- {time_delta} days")]
print(df['test_result'].value_counts())

negative        3568
positive         176
low-positive      20
Name: test_result, dtype: int64


In [5]:
# df_severity = df.loc[df['test_result']=='positive']
print(df['test_result'].value_counts())
print(df.loc[df['patient_id'].duplicated()])

negative        3568
positive         176
low-positive      20
Name: test_result, dtype: int64
       patient_id   test_date morphology_date test_identifier  distance  \
10            842  2020-11-07      2020-11-04      CN65774337         3   
13           1035  2020-11-07      2020-11-04      CN65962312         3   
46           7001  2020-10-13      2020-10-12      CN65371419         1   
51           8439  2020-10-01      2020-09-30      CN64941697         1   
189         61056  2020-10-16      2020-10-14      CN65390966         2   
...           ...         ...             ...             ...       ...   
16509     2154735  2020-12-04      2020-12-03      CN66728489         1   
16514     2154853  2020-12-02      2020-11-30      CN66650129         2   
16518     2155223  2020-11-29      2020-11-27      CN66600233         2   
16520     2155227  2020-12-04      2020-12-01      CN66658578         3   
16534     2155722  2020-12-07      2020-12-04      CN66758654         3   

    

In [6]:
# pd.concat(g for _, g in df.groupby("test_identifier") if len(g) > 1)

In [7]:
# These 6 rows are equal in all columns but test result. Test result is either negative or low positive.
# Due to the ambiguity it is better to drop this row.
df = df[~(df['test_identifier'].map(df['test_identifier'].value_counts())>1)]


In [8]:
# Map names to match Zenodo dataset
unify_cols_dict = {
    'Krwinki białe': 'WBC',
    'Hemoglobina': 'HGB',
    'MCV': 'MCV',
    'MCHC': 'MCHC',
    'Płytki krwi': 'PLT',
    '% bazocytów': 'BAT', # dlaczego nie Bat??
    '% limfocytów': 'LYT',
    '% monocytów': 'MOT',
    '% eozynocytów': 'EOT',
    'age_at_test': 'age',
    'gender': 'Sex',
    'test_result': 'target'
}
df = df.rename(columns=unify_cols_dict)

In [9]:
df_cbc = df[unify_cols_dict.values()]
print(df_cbc.columns)
print(df_cbc['target'].value_counts())

Index(['WBC', 'HGB', 'MCV', 'MCHC', 'PLT', 'BAT', 'LYT', 'MOT', 'EOT', 'age',
       'Sex', 'target'],
      dtype='object')
negative        3518
positive         175
low-positive      15
Name: target, dtype: int64


In [10]:
df_cbc.to_csv('datasets/uck_detection_3days.csv')

In [11]:
care_cat_dict = {
    'I': 1,
    'II': 2,
    'III': 3,
    'IV': 4
}
df_care = pd.read_csv('uck-data/KategoriaOpieki.csv')
df_care['KATEGORIA'] = df_care['KATEGORIA'].map(care_cat_dict)
df_care['OBOWIAZUJE_OD'] = pd.to_datetime(df_care['OBOWIAZUJE_OD']).dt.date
df_care['OBOWIAZUJE_DO'] = pd.to_datetime(df_care['OBOWIAZUJE_DO']).dt.date


df_care['OBOWIAZUJE_OD'] = df_care['OBOWIAZUJE_OD'].fillna(df_care['OBOWIAZUJE_DO'])
df_care['OBOWIAZUJE_DO'] = df_care['OBOWIAZUJE_DO'].fillna(df_care['OBOWIAZUJE_OD'])

In [12]:
# Wyświetlamy pacjentów hospitalizowanych kilka razy
group = df_care.groupby('PACJENT_ID')['HOSPITALIZACJA_ID'].unique()
group[group.apply(lambda x: len(x)>1)]

PACJENT_ID
82858                        [5279861, 5235349]
103678                       [5262870, 5267315]
228581              [5214060, 5294011, 5268595]
1224090                      [5276701, 5299561]
1454109             [5274015, 5292432, 5297532]
1734198                      [5257084, 5286933]
1869903                      [5266964, 5292300]
1871291                      [5283423, 5271934]
1907740                      [5257138, 5283466]
2020947                      [5276655, 5302696]
2022481                      [5244346, 5270147]
2084693                      [5252008, 5283205]
2085769             [5301723, 5267221, 5284465]
2101717                      [5249334, 5261130]
2118468             [5268522, 5276727, 5283552]
2138830                      [5295646, 5299427]
2140468    [5219377, 5251542, 5281123, 5286732]
2140596                      [5266809, 5271999]
2140598             [5300063, 5272298, 5276653]
2141925                      [5202899, 5247848]
2143753                      

In [13]:
# Sprawdzamy czy kat opieki zmieniła się w trakcie hospitalizacji
group = df_care.groupby('HOSPITALIZACJA_ID')['KATEGORIA'].unique()
care_cat = group.apply(lambda x: max(x))
arr = care_cat.values
df_care_post = pd.DataFrame({'HOSPITALIZACJA_ID': care_cat.index, 'Category': care_cat.values})

In [14]:
# sorted(df_care['OBOWIAZUJE_OD'] - df_care['OBOWIAZUJE_DO'], reverse=True)
group = df_care.groupby('HOSPITALIZACJA_ID')['OBOWIAZUJE_OD'].unique()
start_date = group.apply(lambda x: min(x))
group = df_care.groupby('HOSPITALIZACJA_ID')['OBOWIAZUJE_DO'].unique()
end_date = group.apply(lambda x: max(x))

df_buffer = pd.DataFrame({'HOSPITALIZACJA_ID': group.index, 'start_date': start_date, 'end_date': end_date})
df_care_post = df_care_post.join(df_buffer.set_index('HOSPITALIZACJA_ID'), on='HOSPITALIZACJA_ID')

In [15]:
df_buffer = df_care[['HOSPITALIZACJA_ID', 'PACJENT_ID']]
df_care_post = df_care_post.join(df_buffer.set_index('HOSPITALIZACJA_ID'), on='HOSPITALIZACJA_ID')
df_care_post = df_care_post.drop_duplicates()

In [16]:
# Szykamy poprawnych pacjent id...
df_morf = pd.read_csv('uck-data/tests/Morfologia.csv')
df_morf = df_morf[['pacjent_id', 'numer_badania']]

df_buffer = df.loc[df['target']=='positive']

df_buffer = df_buffer.rename(columns = {'test_identifier': 'numer_badania'})
df_buffer = df_buffer.join(df_morf.set_index('numer_badania'), on='numer_badania')
df_buffer = df_buffer.drop_duplicates()
print(len(df_buffer))

175


In [17]:
from IPython.display import clear_output


df_care_post['start_date'] = df_care_post['start_date'] - pd.Timedelta(days=time_delta)
df_care_post['end_date'] = df_care_post['end_date'] + pd.Timedelta(days=time_delta)
# NIE DZIAŁA
# df_care_post['date_range'] = pd.date_range(start=df_care_post['start_date'], end=df_care_post[end_date])


# clear_output(wait=True)


In [18]:
# df_care_post.loc[df_buffer['pacjent_id']==df_care_post['PACJENT_ID']]
ids = list(df_buffer['pacjent_id'].unique())

df_care_with_morf = df_care_post.loc[df_care_post['PACJENT_ID'].isin(ids)]
len(df_care_with_morf) #[df_care_with_morf['PACJENT_ID']==2140659].iloc[0]['PACJENT_ID'] == 2140659
# 2140659 in ids


56

In [19]:
df_buffer.loc[df_buffer['pacjent_id']==2140659]

Unnamed: 0,patient_id,test_date,morphology_date,numer_badania,distance,WBC,HGB,MCV,MCHC,PLT,BAT,LYT,MOT,EOT,age,Sex,target,pacjent_id


In [20]:

df_prognosis = pd.DataFrame()
for i, row in df_care_with_morf.iterrows():
    # rows = df_care_post.loc[df_care_post['PACJENT_ID'] == row['pacjent_id']]

    rows = rows.loc[(rows['morphology_date'] >= row['start_date']) & (rows['morphology_date'] <= row['end_date'])]
    if not rows.empty:
        print('non_empty')
        #     df_prognosis = df_prognosis.append(row)


print(df_prognosis.info())

NameError: name 'rows' is not defined