In [1]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import re
from collections import defaultdict
from sklearn.impute import SimpleImputer
import seaborn as sns

# DATA SET CREATION

### 1. dataset extraction

In [7]:
with open ('/home/ofeksh2@mta.ac.il/config.json', 'r', encoding='utf-8') as f:
    config = json.load(f)

biobank_path = config['biobank_path']
#features_code_lists = config['features_code_lists']
features_name_list = config['features_name_list']
train_path = '/tmp/pycharm_project_366/train_data.csv'

In [22]:
train_df = pd.read_csv(train_path, low_memory=False)

In [25]:
features_code_dict = {}
features_with_array = defaultdict(lambda: 1)
with open('/tmp/pycharm_project_366/features_with_arrays.txt') as f:
    for line in f:
        feature_code, array_size = line.split()
        features_with_array[feature_code] = int(array_size)

with open('/tmp/pycharm_project_366/features.txt') as features_file:
    for line in features_file:
        feature_code, feature_name = line.split('\t')
        feature_name = feature_name.replace('\n', '')
        size = features_with_array[feature_code]
        for i in range(size):
            new_feature_code = f'{feature_code}-0.{i}'
            new_feature_name = f'{feature_name} - {i}'
            features_code_dict[new_feature_code] = new_feature_name

In [11]:
features_code_dict

{'eid-0.0': 'eid - 0',
 '41270-0.0': 'Diagnoses - ICD10 - 0',
 '41270-0.1': 'Diagnoses - ICD10 - 1',
 '41270-0.2': 'Diagnoses - ICD10 - 2',
 '41270-0.3': 'Diagnoses - ICD10 - 3',
 '41270-0.4': 'Diagnoses - ICD10 - 4',
 '41270-0.5': 'Diagnoses - ICD10 - 5',
 '41270-0.6': 'Diagnoses - ICD10 - 6',
 '41270-0.7': 'Diagnoses - ICD10 - 7',
 '41270-0.8': 'Diagnoses - ICD10 - 8',
 '41270-0.9': 'Diagnoses - ICD10 - 9',
 '41270-0.10': 'Diagnoses - ICD10 - 10',
 '41270-0.11': 'Diagnoses - ICD10 - 11',
 '41270-0.12': 'Diagnoses - ICD10 - 12',
 '41270-0.13': 'Diagnoses - ICD10 - 13',
 '41270-0.14': 'Diagnoses - ICD10 - 14',
 '41270-0.15': 'Diagnoses - ICD10 - 15',
 '41270-0.16': 'Diagnoses - ICD10 - 16',
 '41270-0.17': 'Diagnoses - ICD10 - 17',
 '41270-0.18': 'Diagnoses - ICD10 - 18',
 '41270-0.19': 'Diagnoses - ICD10 - 19',
 '41270-0.20': 'Diagnoses - ICD10 - 20',
 '41270-0.21': 'Diagnoses - ICD10 - 21',
 '41270-0.22': 'Diagnoses - ICD10 - 22',
 '41270-0.23': 'Diagnoses - ICD10 - 23',
 '41270-0.24'

In [26]:
train_df = train_df.rename(columns=features_code_dict)

In [27]:
train_df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72962 entries, 0 to 72961
Data columns (total 283 columns):
 #    Column                                                                                             Non-Null Count  Dtype 
---   ------                                                                                             --------------  ----- 
 0    Yead of birth - 0                                                                                  72961 non-null  object
 1    Duration of walks - 0                                                                              67797 non-null  object
 2    Number of days/week of moderate physical activity 10+ minutes - 0                                  72779 non-null  object
 3    Duration of moderate activity - 0                                                                  56369 non-null  object
 4    Number of days/week of vigorous physical activity 10+ minutes - 0                                  72779 non-null  o

###     1.1 Get diabetes dataset

In [38]:
def get_people_with_disease(df, all_diseased_column, disease_code_pattern):
    people_with_disease_df = df[all_diseased_column.str.contains(disease_code_pattern)]

    return people_with_disease_df

In [40]:
diabetes_pattern = r'E11'
people_with_diabetes_df = get_people_with_disease(df_filled, all_diseased_column, diabetes_pattern)
people_with_diabetes_df.shape

52177

###    1.2 Get pancreatic cancer dataset

In [4]:
def get_people_with_2_diseases(df, first_disease_code_pattern, second_disease_code_pattern):
    people_with_disease_df = df[df['41202-0.0'].str.contains(first_disease_code_pattern) & df['41204-0.0'].str.contains(second_disease_code_pattern)]
    people_with_disease_df = pd.concat([people_with_disease_df, df[df['41204-0.0'].str.contains(first_disease_code_pattern) & df['41202-0.0'].str.contains(second_disease_code_pattern)]])

    return people_with_disease_df

In [61]:
sum = 0
for num in range(10):
    pancreatic_cancer_pattern = r'C25' + re.escape(str(num))
    people_with_pancreatic_cancer_df = get_people_with_disease(df_filled, all_diseased_column,pancreatic_cancer_pattern)
    sum += people_with_pancreatic_cancer_df.shape[0]
sum

2851

###    1.3 Get patients with both diabetes and pancreatic cancer dataset

In [None]:
poeple_with_both_diseases = pd.merge([people_with_diabetes_df, people_with_pancreatic_cancer_df], how='inner')

###    1.4 Get test group dataset

In [None]:
test_group_df = df.sample(n=100000)
train_group_df = df.drop(test_group_df.index)

In [59]:
diabetes_pattern = r'E11'
people_with_diabetes_df = get_people_with_disease(test_group_df, all_diseased_column, diabetes_pattern)
print(people_with_diabetes_df.shape)

  people_with_disease_df = df[all_diseased_column.str.contains(disease_code_pattern)]


(8651, 82)


  people_with_disease_df = df[all_diseased_column.str.contains(disease_code_pattern)]


(382, 82)

In [None]:
pancreatic_cancer_pattern = r'C25'
people_with_pancreatic_cancer_df = get_people_with_disease(test_group_df, all_diseased_column,pancreatic_cancer_pattern)
people_with_pancreatic_cancer_df.shape

###    1.5 Merge and label datasets

In [55]:
people_with_diabetes_df['label'] = 1
people_with_pancreatic_cancer_df['label'] = 2
people_with_both_diseases['label'] = 3

KeyError: '20116-0.0'

# 2. FEATURE REPRESENTATION

### 2.1 Feature Preprocessing

todo!!

In [None]:
with open('/tmp/pycharm_project_366/features_types.json') as f:
    features_types = json.load(f)
numerical_features = features_types['numerical_features']
categorical_features = features_types['categorical_features']

### 2.1.1 Fill nans for numerical values

In [None]:
mean_imputer = SimpleImputer(strategy='mean')
train_df[numerical_features] = mean_imputer.fit_transform(train_df[numerical_features])

### 2.1.2 Fill nans for categorical values

In [None]:
categorical_imputer = SimpleImputer(strategy='most_frequent')
train_df[categorical_features] = categorical_imputer.fit_transform(train_df[categorical_features])

### 2.1.3 Extract information from diagnoses

In [None]:
diseases_patterns = [
    ('Diabetes', r'E11'),
    ('Pancreatic Cancer', r'C25'),
    ('Obesity', r'E66'),
    ('Acute Pancreatitis', r'K85'),
    ('Alcoholic Liver Disease', r'K70'),
    ('Cirrhosis', r'K74'),
    ('Acute Hepatitis A', r'B15'),
    ('Acute Hepatitis B', r'B16'),
    ('Acute Hepatitis C', r'B171'),
    ('Toxic Liver Disease', r'K71'),
    ('Cushings Syndrome', r'E24'),
    ('Hyperthyroidism', r'E05'),
    ('Intestinal Malabsorption', r'K90'),
    ('Arterial Embolism and Thrombosis', r'I74')
]

In [None]:
def classify_disease(diseases_column, disease_pattern):
    return diseases_column.str.contains(disease_pattern)

In [None]:
patient_diseases = train_df['Diagnoses']
for disease, disease_pattern in diseases_patterns[:2]:
    train_df[f'Has {disease}'] = classify_disease(patient_diseases, disease_pattern)

In [None]:
train_df = train_df.drop(columns=['Diagnoses'])

### 2.1.4 Extract information from family

In [None]:
father_diagnosis_codes = [f'Illnesses of father - {i}' for i in range(10)]
mother_diagnosis_codes = [f'Illnesses of mother - {i}' for i in range(11)]
siblings_diagnosis_codes = [f'Illnesses of siblings - {i}' for i in range(12)]

In [None]:
father_diseases = train_df[father_diagnosis_codes]
mother_diseases = train_df[mother_diagnosis_codes]
siblings_diseases = train_df[siblings_diagnosis_codes]

In [None]:
for disease, disease_pattern in diseases_patterns:
    train_df[f'Father has {disease}'] = classify_disease(father_diseases, disease_pattern)

In [None]:
for disease, disease_pattern in diseases_patterns:
    train_df[f'Mother has {disease}'] = classify_disease(mother_diseases, disease_pattern)

In [None]:
for disease, disease_pattern in diseases_patterns:
    train_df[f'Siblings have {disease}'] = classify_disease(siblings_diseases, disease_pattern)

In [None]:
train_df = train_df.drop(columns=(father_diagnosis_codes + mother_diagnosis_codes + siblings_diagnosis_codes))

### 2.2 Feature analysis

### 2.2.1 feature plots

In [None]:
Label_dictionary = {0:'Control group', 1:'Pancreatic Cancer patients', 2:'Diabetes patients'}

def plot_categorical_feature(data):
    categorical_labels = data.iloc[:,1].unique()
    grouped_data = data.groupby([data.columns[0], data.colomns[1]]).size().unstack(fill_value=0)
    grouped_data = grouped_data.index.setnames(Label_dictionary)
    
    grouped_data.columns = categorical_labels
    
    grouped_data = grouped_data.reset_index()
    
    grouped_data.plot(x=data.columns[0], kind='bar', stacked=False, figsize=(10, 6), color=['skyblue', 'salmon'])
    
    plt.xlabel(data.colomns[1].capitalize())
    plt.ylabel('Number of People')
    plt.title(f'Number of People by {data.colomns[0].capitalize()} and {data.colomns[1].capitalize()}')
    plt.xticks(rotation=0)
    
    plt.legend(title='Category', labels=categorical_labels)
    plt.show()

In [None]:
def plot_continuous_feature(data, agg_func='mean'):
    
    grouped_data = data.groupby(data.columns[0])[data.colomns[1]].agg(agg_func).reset_index()
    
    grouped_data.columns = [data.columns[0].capitalize(), f'{agg_func.capitalize()} of {data.colomns[1].capitalize()}']
    
    plt.figure(figsize=(10, 6))
    sns.barplot(x=data.colomns[1].capitalize(), y=f'{agg_func.capitalize()} of {data.colomns[1].capitalize()}', data=grouped_data, palette='viridis')
    
    plt.xlabel(data.columns[0].capitalize())
    plt.ylabel(f'{agg_func.capitalize()} of {data.colomns[1].capitalize()}')
    plt.title(f'{agg_func.capitalize()} of {data.colomns[1].capitalize()} by {data.columns[0].capitalize()}')
    
    plt.show()

### 2.2.1.1 Quality of life

In [None]:
categorical_features_to_plot = ['Smoking Status', 'Processed meat intake', 'Processed meat intake', 'Alcohol intake frequency']
continuous_features_to_plot = ['Time spent watching television (TV)', 'Time spent using computer', 'Duration of moderate activity','Overall quality of sleep in past month']

for feature in categorical_features_to_plot:
    plot_categorical_feature(train_group_df[['Label', feature]])
for feature in continuous_features_to_plot:
    plot_categorical_feature(train_group_df[['Label', feature]])

In [None]:


diabetes_smoking = people_with_diabetes_df['20116-0.0'].value_counts()
pancreatic_cancer_smoking = people_with_pancreatic_cancer_df['20116-0.0'].value_counts()

x_axis = np.arange(len(diabetes_smoking['20116-0.0']))

plt.bar(x_axis - 0.2, diabetes_smoking['counts'], 0.4, label='diabetes')
plt.bar(x_axis + 0.2, pancreatic_cancer_smoking['counts'], 0.4, label='pancreatic cancer')

plt.show()

In [None]:
people_with_diabetes_df['20116-0.0'].value_counts().plot(kind='bar')
plt.show()
people_with_pancreatic_cancer_df['20116-0.0'].value_counts().plot(kind='bar')
plt.show()

In [None]:
people_with_diabetes_and_pancreatic_cancer_df = get_people_with_2_diseases(df_filled, diabetes_pattern, pancreatic_cancer_pattern)
people_with_diabetes_and_pancreatic_cancer_df

In [None]:
temp = pd.read_csv(biobank_path, usecols=features_code_list)