In [1]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import re

# DATA SET CREATION

### 1. dataset extraction

In [4]:
with open ('/home/ofeksh2@mta.ac.il/config.json', 'r', encoding='utf-8') as f:
    config = json.load(f)

biobank_path = config['biobank_path']
features_code_list = config['features_code_list']
features_name_list = config['features_name_list']

In [32]:
diagnosises = ['eid', '31-0.0'] + [f'41270-0.{i}' for i in range(80)]

In [33]:
df = pd.read_csv(biobank_path, usecols=diagnosises)

  df = pd.read_csv(biobank_path, usecols=diagnosises)


In [18]:
difference = set(features_code_list) - set(df.columns)
features_dict = dict(zip(features_code_list, features_name_list))
[features_dict[feature] for feature in difference if 'Cholesterol' not in feature]

['Free Cholesterol in Medium LDL',
 'Cholesterol to Total Lipids in Small VLDL percentage',
 'Number of times woke up in the night during the past month',
 'Cholesterol to Total Lipids in Medium VLDL percentage',
 'Free Cholesterol in Very Small VLDL',
 'Cholesterol to Total Lipids in Very Small VLDL percentage',
 'Tiredness, weariness or fatigue goes away when resting',
 'Cholesterol to Total Lipids in Large HDL percentage',
 'Cholesteryl Esters in Medium HDL',
 'Free Cholesterol in Large HDL',
 'Free Cholesterol to Total Lipids in Medium LDL percentage',
 'Free Cholesterol in Very Large VLDL',
 'Cholesteryl Esters in VLDL',
 'Overall quality of sleep in past month',
 'Cholesteryl Esters in Very Small VLDL',
 'Free Cholesterol in LDL',
 'Free Cholesterol to Total Lipids in Medium VLDL percentage',
 'Free Cholesterol in VLDL',
 'Standard PRS for type 2 diabetes (T2D)',
 'Free Cholesterol to Total Lipids in Very Large HDL percentage',
 'Cholesterol in Medium HDL',
 'Cholesteryl Esters i

In [34]:
df_filled = df.fillna('-1')

In [35]:
df_filled

Unnamed: 0,eid,31-0.0,41270-0.0,41270-0.1,41270-0.2,41270-0.3,41270-0.4,41270-0.5,41270-0.6,41270-0.7,...,41270-0.70,41270-0.71,41270-0.72,41270-0.73,41270-0.74,41270-0.75,41270-0.76,41270-0.77,41270-0.78,41270-0.79
0,1000013,1,E669,E785,F171,I088,I211,I251,I252,I253,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,1000024,1,E669,I081,I10,I272,I420,I447,I509,I519,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,1000036,1,Z302,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,1000048,0,A099,A410,C504,C509,D059,D231,D649,D70,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,1000055,0,D300,D509,E039,I10,I252,I429,I447,I48,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
502365,6024050,0,A099,C172,D123,D133,E875,H701,H71,I10,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
502366,6024062,1,E780,H258,H269,H438,H521,I10,I209,I251,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
502367,6024077,1,D122,D125,D128,F329,I849,K221,K573,K574,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
502368,6024085,1,H020,K297,K635,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


In [36]:
all_diseased_column = df_filled.drop(['eid', '31-0.0'], axis=1).agg(', '.join, axis=1)

In [ ]:
all_diseased_column

###     1.1 Get diabetes dataset

In [38]:
def get_people_with_disease(df, all_diseased_column, disease_code_pattern):
    people_with_disease_df = df[all_diseased_column.str.contains(disease_code_pattern)]

    return people_with_disease_df

In [40]:
diabetes_pattern = r'E11'
people_with_diabetes_df = get_people_with_disease(df_filled, all_diseased_column, diabetes_pattern)
people_with_diabetes_df.shape

52177

###    1.2 Get pancreatic cancer dataset

In [4]:
def get_people_with_2_diseases(df, first_disease_code_pattern, second_disease_code_pattern):
    people_with_disease_df = df[df['41202-0.0'].str.contains(first_disease_code_pattern) & df['41204-0.0'].str.contains(second_disease_code_pattern)]
    people_with_disease_df = pd.concat([people_with_disease_df, df[df['41204-0.0'].str.contains(first_disease_code_pattern) & df['41202-0.0'].str.contains(second_disease_code_pattern)]])

    return people_with_disease_df

In [61]:
sum = 0
for num in range(10):
    pancreatic_cancer_pattern = r'C25' + re.escape(str(num))
    people_with_pancreatic_cancer_df = get_people_with_disease(df_filled, all_diseased_column,pancreatic_cancer_pattern)
    sum += people_with_pancreatic_cancer_df.shape[0]
sum

2851

###    1.3 Get patients with both diabetes and pancreatic cancer dataset

In [None]:
poeple_with_both_diseases = pd.merge([people_with_diabetes_df, people_with_pancreatic_cancer_df], how='inner')

###    1.4 Get test group dataset

In [ ]:
test_group_df = df.sample(n=100000)
train_group_df = df.drop(test_group_df.index)

In [59]:
diabetes_pattern = r'E11'
people_with_diabetes_df = get_people_with_disease(test_group_df, all_diseased_column, diabetes_pattern)
print(people_with_diabetes_df.shape)

  people_with_disease_df = df[all_diseased_column.str.contains(disease_code_pattern)]


(8651, 82)


  people_with_disease_df = df[all_diseased_column.str.contains(disease_code_pattern)]


(382, 82)

In [ ]:
pancreatic_cancer_pattern = r'C25'
people_with_pancreatic_cancer_df = get_people_with_disease(test_group_df, all_diseased_column,pancreatic_cancer_pattern)
people_with_pancreatic_cancer_df.shape

###    1.5 Merge and label datasets

In [55]:
people_with_diabetes_df['label'] = 1
people_with_pancreatic_cancer_df['label'] = 2
people_with_both_diseases['label'] = 3

KeyError: '20116-0.0'

# 2. FEATURE REPRESENTATION

### 2.1 Feature Preprocessing

todo!!

### 2.2 Feature analysis

In [None]:
diabetes_smoking = people_with_diabetes_df['20116-0.0'].value_counts()
pancreatic_cancer_smoking = people_with_pancreatic_cancer_df['20116-0.0'].value_counts()

x_axis = np.arange(len(diabetes_smoking['20116-0.0']))

plt.bar(x_axis - 0.2, diabetes_smoking['counts'], 0.4, label='diabetes')
plt.bar(x_axis + 0.2, pancreatic_cancer_smoking['counts'], 0.4, label='pancreatic cancer')

plt.show()

In [ ]:
people_with_diabetes_df['20116-0.0'].value_counts().plot(kind='bar')
plt.show()
people_with_pancreatic_cancer_df['20116-0.0'].value_counts().plot(kind='bar')
plt.show()

In [ ]:
people_with_diabetes_and_pancreatic_cancer_df = get_people_with_2_diseases(df_filled, diabetes_pattern, pancreatic_cancer_pattern)
people_with_diabetes_and_pancreatic_cancer_df

In [ ]:
temp = pd.read_csv(biobank_path, usecols=features_code_list)