In [1]:
import sys

import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import re
from collections import defaultdict
from sklearn.impute import SimpleImputer
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
import pickle

# 1. DATASET CREATION

### 1.1 Dataset extraction

In [2]:
# Read the project configuration once. The three keys below are looked up
# eagerly, so a missing entry fails here rather than further down the notebook.
with open('/tmp/pycharm_project_366/config.json', mode='r', encoding='utf-8') as config_file:
    config = json.load(config_file)

# features_code_lists = config['features_code_lists']  (disabled)
features_name_list, train_path, test_path = (
    config[key] for key in ('features_name_list', 'train_path', 'test_path')
)

In [None]:
# Load the training split. low_memory=False makes pandas parse the file in one
# pass instead of in chunks, which avoids mixed-dtype inference per column.
train_df = pd.read_csv(train_path, low_memory=False)

In [None]:
# Sanity check: (rows, columns) of the raw training frame.
train_df.shape

In [3]:
# Mapping from raw column codes ("<code>-0.<i>") to readable column names
# ("<feature name> - <i>"), used to rename the CSV columns.
features_code_dict = {}

# Number of array slots per feature code; codes that are not listed in
# features_with_arrays.txt default to a single slot.
features_with_array = defaultdict(lambda: 1)
with open('/tmp/pycharm_project_366/features_with_arrays.txt', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            # Tolerate blank / trailing empty lines instead of failing on unpacking.
            continue
        feature_code, array_size = line.split()
        features_with_array[feature_code] = int(array_size)

with open('/tmp/pycharm_project_366/features.txt', encoding='utf-8') as features_file:
    for line in features_file:
        if not line.strip():
            continue
        # Split on the first tab only, so a tab inside the name cannot break unpacking.
        feature_code, feature_name = line.rstrip('\n').split('\t', 1)
        size = features_with_array[feature_code]
        for i in range(size):
            new_feature_code = f'{feature_code}-0.{i}'
            new_feature_name = f'{feature_name} - {i}'
            features_code_dict[new_feature_code] = new_feature_name

In [None]:
# Inspect the code -> readable-name mapping built above.
features_code_dict

In [None]:
# Swap raw column codes for readable names; columns without an entry in the
# mapping keep their original label.
train_df = train_df.rename(columns=features_code_dict)

In [None]:
# List every column with its dtype and non-null count to see where values are missing.
train_df.info(verbose=True, show_counts=True)

# 2. FEATURE REPRESENTATION

### 2.1 Feature Preprocessing

**TODO:** describe the preprocessing steps applied in this section.

In [4]:
# Feature names grouped by type. Only the first array slot (" - 0") of each
# listed feature is used.
# NOTE(review): this path is relative to the working directory, unlike the
# absolute paths used for the other input files — confirm this is intended.
with open('features_types.json') as types_file:
    features_types = json.load(types_file)

numerical_features, categorical_features = (
    [f'{feature} - 0' for feature in features_types[group]]
    for group in ('numerical_features', 'categorical_features')
)

### 2.1.1 Fill nans for numerical values

In [None]:
# Imputer that replaces missing values with the column mean.
mean_imputer = SimpleImputer(strategy='mean')

In [None]:
# Learn the per-column means on the training data and fill the gaps in place.
train_df[numerical_features] = mean_imputer.fit_transform(train_df[numerical_features])

### 2.1.2 Fill nans for categorical values

In [None]:
# Missing categorical values are replaced with the most frequent value of each
# column, learned on the training data.
categorical_imputer = SimpleImputer(strategy='most_frequent')
train_df[categorical_features] = categorical_imputer.fit_transform(train_df[categorical_features])

### 2.1.3 Extract information from diagnoses

In [5]:
# (display name, regex) pairs; the regexes look like diagnosis code prefixes
# (presumably ICD-10, given the 'Diagnoses - ICD10 - <i>' columns — confirm).
# Order matters: consumers slice this list with [2:] to leave out the first
# two entries, so Diabetes and Pancreatic Cancer must stay first.
diseases_patterns = list({
    'Diabetes': r'E11',
    'Pancreatic Cancer': r'C25',
    'Obesity': r'E66',
    'Acute Pancreatitis': r'K85',
    'Alcoholic Liver Disease': r'K70',
    'Cirrhosis': r'K74',
    'Acute Hepatitis A': r'B15',
    'Acute Hepatitis B': r'B16',
    'Acute Hepatitis C': r'B171',
    'Toxic Liver Disease': r'K71',
    'Cushings Syndrome': r'E24',
    'Hyperthyroidism': r'E05',
    'Intestinal Malabsorption': r'K90',
    'Arterial Embolism and Thrombosis': r'I74',
}.items())

In [6]:
def classify_disease(diseases_column, disease_pattern):
    """Flag the rows whose diagnosis text matches a disease code pattern.

    Args:
        diseases_column: pandas Series of strings holding diagnosis codes.
        disease_pattern: Regular expression searched for in each entry.

    Returns:
        Boolean Series aligned with ``diseases_column``. Missing entries count
        as "no match" (False) instead of propagating NaN, so the result can be
        used directly as a boolean feature column.
    """
    return diseases_column.str.contains(disease_pattern, regex=True, na=False)

In [None]:
# One boolean "Has <disease>" column per pattern. The first two patterns
# (Diabetes, Pancreatic Cancer) are skipped — presumably because they are the
# classes being predicted; confirm.
patient_diseases = train_df['Diagnoses']
for disease_name, code_pattern in diseases_patterns[2:]:
    flag_column = f'Has {disease_name}'
    train_df[flag_column] = classify_disease(patient_diseases, code_pattern)

In [None]:
# The raw diagnosis text and its 100 per-slot columns are no longer needed once
# the "Has <disease>" flags exist.
icd10_columns = [f'Diagnoses - ICD10 - {slot}' for slot in range(100)]
train_df = train_df.drop(columns=['Diagnoses'])
train_df = train_df.drop(columns=icd10_columns)

In [None]:
# Inspect the frame after the diagnosis columns were replaced by flags.
train_df

### 2.1.4 Extract information from family

In [7]:
# Column names of the family illness slots: 10 for the father, 11 for the
# mother and 12 for the siblings.
father_diagnosis_codes, mother_diagnosis_codes, siblings_diagnosis_codes = (
    [f'Illnesses of {relative} - {slot}' for slot in range(slot_count)]
    for relative, slot_count in (('father', 10), ('mother', 11), ('siblings', 12))
)

In [None]:
# Collapse each relative's illness slots into one comma-separated string per
# row. astype(str) turns missing slots into the literal text 'nan'.
father_diseases, mother_diseases, siblings_diseases = (
    train_df[relative_columns].astype(str).agg(', '.join, axis=1)
    for relative_columns in (
        father_diagnosis_codes,
        mother_diagnosis_codes,
        siblings_diagnosis_codes,
    )
)

In [None]:
# One boolean column per disease for the father's illness text.
for disease_name, code_pattern in diseases_patterns:
    flag_column = f'Father has {disease_name}'
    train_df[flag_column] = classify_disease(father_diseases, code_pattern)

In [None]:
# One boolean column per disease for the mother's illness text.
for disease_name, code_pattern in diseases_patterns:
    flag_column = f'Mother has {disease_name}'
    train_df[flag_column] = classify_disease(mother_diseases, code_pattern)

In [None]:
# One boolean column per disease for the siblings' illness text.
for disease_name, code_pattern in diseases_patterns:
    flag_column = f'Siblings have {disease_name}'
    train_df[flag_column] = classify_disease(siblings_diseases, code_pattern)

In [None]:
# The raw family illness slots are replaced by the boolean flags created above.
train_df = train_df.drop(columns=(father_diagnosis_codes + mother_diagnosis_codes + siblings_diagnosis_codes))

### 2.1.5 One Hot Encoding

In [None]:
# Dense one-hot encoder that returns a DataFrame; categories unseen during
# fitting are encoded as all zeros instead of raising.
one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
one_hot_encoder = one_hot_encoder.set_output(transform='pandas')  # set_output returns the estimator itself

# Swap the original categorical columns for their one-hot expansion, which is
# appended after the remaining columns.
one_hot_encoding = one_hot_encoder.fit_transform(train_df[categorical_features])
train_df = pd.concat(
    [train_df.drop(columns=categorical_features), one_hot_encoding],
    axis=1,
)

In [None]:
# Inspect the frame after one-hot encoding.
train_df

### 2.2 Feature analysis

### 2.2.1 feature plots

In [None]:
# Display names for the values 0, 1 and 2 (presumably the values of the 'Label'
# column — confirm). Not referenced by any active code in this section.
Label_dictionary = {0:'Control group', 1:'Pancreatic Cancer patients', 2:'Diabetes patients'}

def plot_categorical_feature(data):
    """Draw grouped bars with the number of rows per group and category.

    Args:
        data: Two-column DataFrame. Column 0 is the grouping variable shown on
            the x-axis (one cluster of bars per value); column 1 is the
            categorical feature (one bar per category inside each cluster).
            Both column names must be strings.
    """
    group_column, feature_column = data.columns[0], data.columns[1]

    # Rows: groups, columns: categories. `unstack` yields the categories in
    # sorted order and the legend is taken from these column labels, so every
    # bar keeps its true category. Overwriting the columns with `unique()`
    # (first-appearance order, NaN included) would mislabel the bars or fail
    # with a length mismatch.
    counts = data.groupby([group_column, feature_column]).size().unstack(fill_value=0)

    ax = counts.reset_index().plot(x=group_column, kind='bar', stacked=False, figsize=(10, 6))

    # The x-axis carries the grouping variable, not the feature.
    ax.set_xlabel(group_column.capitalize())
    ax.set_ylabel('Number of People')
    ax.set_title(f'Number of People by {group_column.capitalize()} and {feature_column.capitalize()}')
    ax.tick_params(axis='x', rotation=0)

    ax.legend(title='Category')
    plt.show()

In [None]:
def plot_continuous_feature(data, agg_func='mean'):
    """Draw one bar per group with an aggregate of a continuous feature.

    Args:
        data: Two-column DataFrame. Column 0 is the grouping variable, column 1
            the continuous feature. Both column names must be strings.
        agg_func: Name of the aggregation passed to ``agg`` (e.g. 'mean',
            'median'); it must be a string because it is also used in labels.
    """
    group_column, feature_column = data.columns[0], data.columns[1]
    group_label = group_column.capitalize()
    value_label = f'{agg_func.capitalize()} of {feature_column.capitalize()}'

    grouped_data = data.groupby(group_column)[feature_column].agg(agg_func).reset_index()
    grouped_data.columns = [group_label, value_label]

    plt.figure(figsize=(10, 6))
    # x must name the renamed group column of `grouped_data`; the capitalized
    # feature name is not a column of that frame and made the call fail.
    sns.barplot(x=group_label, y=value_label, data=grouped_data, palette='viridis')

    plt.xlabel(group_label)
    plt.ylabel(value_label)
    plt.title(f'{value_label} by {group_label}')

    plt.show()

### 2.2.1.1 Quality of life

In [None]:
# Features to plot against the label; each name is listed once so no plot is
# drawn twice.
# NOTE(review): this cell runs after the one-hot encoding step, which drops the
# columns in `categorical_features`. Any feature below that is in that list no
# longer has a ' - 0' column here and the lookup raises KeyError — confirm the
# intended cell order.
categorical_features_to_plot = ['Smoking Status', 'Processed meat intake', 'Alcohol intake frequency']
#continuous_features_to_plot = ['Time spent watching television (TV)', 'Time spent using computer', 'Duration of moderate activity','Overall quality of sleep in past month']

for feature in categorical_features_to_plot:
    plot_categorical_feature(train_df[['Label', f'{feature} - 0']])
#for feature in continuous_features_to_plot:
#    plot_categorical_feature(train_group_df[['Label', feature]])

In [None]:
# Final check of column dtypes and non-null counts before training.
train_df.info(verbose=True, show_counts=True)

# 3 Training the Model

In [None]:
# Features: every column except the target ('Label') and the 'eid' identifier
# column. Target: the 'Label' column.
x = train_df.drop(columns=['Label', 'eid'])
y = train_df['Label']

In [ ]:
# Hyper-parameter grid: 3 * 4 * 3 * 3 * 2 = 216 combinations, each fitted on
# 3 cross-validation folds.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}
# Fixed seed so that repeated runs of the search build the same forests and
# select the same configuration.
rf_model = RandomForestClassifier(random_state=42)
# NOTE(review): plain accuracy is dominated by the majority class when the
# labels are imbalanced — consider a macro-averaged metric for model selection.
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')

In [None]:
# Run the full grid search and keep the estimator refitted with the best
# parameters.
# NOTE(review): `best_model` is not persisted in this cell, while the
# evaluation section loads its model from a pickle file — confirm where that
# file is written.
grid_search.fit(x,y)
best_model = grid_search.best_estimator_

# 4 Evaluate Model

In [8]:
# Read the test split with the same parser settings as the training split, so
# column dtypes are inferred the same way for both frames.
test_df = pd.read_csv(test_path, low_memory=False)

In [9]:
# Apply the same code -> readable-name mapping that was used for the training frame.
test_df = test_df.rename(columns=features_code_dict)

In [10]:
def _load_pickle(path):
    """Unpickle one artifact and close its file handle afterwards."""
    with open(path, 'rb') as artifact_file:
        return pickle.load(artifact_file)


# NOTE(security): unpickling runs arbitrary code from the file; only load
# artifacts that were produced by this project.
# These assignments replace any encoder / imputers fitted earlier in the
# notebook with the versions stored on disk.
model = _load_pickle('/tmp/pycharm_project_366/Models/Best_Model.pk1')
one_hot_encoder = _load_pickle('/tmp/pycharm_project_366/Models/One_Hot_Encoder.pk1')
mean_imputer = _load_pickle('/tmp/pycharm_project_366/Models/Mean_Imputer.pk1')
categorical_imputer = _load_pickle('/tmp/pycharm_project_366/Models/Categorical_Imputer.pk1')

In [11]:
# Fill missing numerical values with the loaded imputer (transform only, no refit on test data).
test_df[numerical_features] = mean_imputer.transform(test_df[numerical_features])
# NOTE(review): after this `del`, re-running the cell raises NameError unless
# the imputer is loaded again.
del mean_imputer

In [12]:
# Fill missing categorical values with the loaded imputer (transform only, no refit on test data).
test_df[categorical_features] = categorical_imputer.transform(test_df[categorical_features])
# NOTE(review): after this `del`, re-running the cell raises NameError unless
# the imputer is loaded again.
del categorical_imputer

In [13]:
# Same "Has <disease>" flags as for the training frame; the first two patterns
# (Diabetes, Pancreatic Cancer) are skipped here as well.
patient_diseases = test_df['Diagnoses']
for disease_name, code_pattern in diseases_patterns[2:]:
    flag_column = f'Has {disease_name}'
    test_df[flag_column] = classify_disease(patient_diseases, code_pattern)

  test_df[f'Has {disease}'] = classify_disease(patient_diseases, disease_pattern)
  test_df[f'Has {disease}'] = classify_disease(patient_diseases, disease_pattern)
  test_df[f'Has {disease}'] = classify_disease(patient_diseases, disease_pattern)
  test_df[f'Has {disease}'] = classify_disease(patient_diseases, disease_pattern)
  test_df[f'Has {disease}'] = classify_disease(patient_diseases, disease_pattern)
  test_df[f'Has {disease}'] = classify_disease(patient_diseases, disease_pattern)
  test_df[f'Has {disease}'] = classify_disease(patient_diseases, disease_pattern)
  test_df[f'Has {disease}'] = classify_disease(patient_diseases, disease_pattern)
  test_df[f'Has {disease}'] = classify_disease(patient_diseases, disease_pattern)
  test_df[f'Has {disease}'] = classify_disease(patient_diseases, disease_pattern)
  test_df[f'Has {disease}'] = classify_disease(patient_diseases, disease_pattern)
  test_df[f'Has {disease}'] = classify_disease(patient_diseases, disease_pattern)


In [14]:
# Remove the raw diagnosis text and its 100 per-slot columns, then show the result.
icd10_columns = [f'Diagnoses - ICD10 - {slot}' for slot in range(100)]
test_df = test_df.drop(columns=['Diagnoses'])
test_df = test_df.drop(columns=icd10_columns)
test_df

Unnamed: 0,eid,Year of birth - 0,Duration of walks - 0,Number of days/week of moderate physical activity 10+ minutes - 0,Duration of moderate activity - 0,Number of days/week of vigorous physical activity 10+ minutes - 0,Duration of vigorous activity - 0,Duration of strenuous sports - 0,Time spent watching television (TV) - 0,Time spent using computer - 0,...,Has Alcoholic Liver Disease,Has Cirrhosis,Has Acute Hepatitis A,Has Acute Hepatitis B,Has Acute Hepatitis C,Has Toxic Liver Disease,Has Cushings Syndrome,Has Hyperthyroidism,Has Intestinal Malabsorption,Has Arterial Embolism and Thrombosis
0,1000048,1939.0,40.00000,7.0,20.000000,1.0,60.000000,3.0,2.0,1.0,...,False,False,False,False,False,False,False,False,False,False
1,1000123,1944.0,30.00000,0.0,59.181126,0.0,40.099832,3.0,4.0,-10.0,...,False,False,False,False,False,False,False,False,False,False
2,1000131,1955.0,52.79135,0.0,59.181126,0.0,40.099832,3.0,4.0,2.0,...,False,False,False,False,False,False,False,False,False,False
3,1000219,1962.0,15.00000,0.0,59.181126,0.0,40.099832,3.0,2.0,-10.0,...,False,False,False,False,False,False,False,False,False,False
4,1000238,1950.0,20.00000,1.0,20.000000,0.0,40.099832,3.0,3.0,-10.0,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100761,6023700,1958.0,30.00000,2.0,10.000000,1.0,-1.000000,3.0,0.0,3.0,...,False,False,False,False,False,False,False,False,False,False
100762,6023778,1950.0,60.00000,1.0,20.000000,0.0,40.099832,3.0,3.0,1.0,...,False,False,False,False,False,False,False,False,False,False
100763,6023994,1965.0,20.00000,7.0,30.000000,3.0,20.000000,3.0,2.0,2.0,...,False,False,False,False,False,False,False,False,False,False
100764,6024004,1960.0,30.00000,3.0,300.000000,2.0,120.000000,4.0,1.0,0.0,...,False,False,False,False,False,False,False,False,False,False


In [15]:
# Collapse each relative's illness slots into one comma-separated string per
# row (missing slots become the text 'nan').
father_diseases, mother_diseases, siblings_diseases = (
    test_df[relative_columns].astype(str).agg(', '.join, axis=1)
    for relative_columns in (
        father_diagnosis_codes,
        mother_diagnosis_codes,
        siblings_diagnosis_codes,
    )
)

# One boolean column per (relative, disease) pair, created in the order
# father -> mother -> siblings.
relatives = (
    ('Father has', father_diseases),
    ('Mother has', mother_diseases),
    ('Siblings have', siblings_diseases),
)
for column_prefix, relative_diseases in relatives:
    for disease, disease_pattern in diseases_patterns:
        test_df[f'{column_prefix} {disease}'] = classify_disease(relative_diseases, disease_pattern)

# The raw illness slots are no longer needed.
family_columns = father_diagnosis_codes + mother_diagnosis_codes + siblings_diagnosis_codes
test_df = test_df.drop(columns=family_columns)

  test_df[f'Father has {disease}'] = classify_disease(father_diseases, disease_pattern)
  test_df[f'Father has {disease}'] = classify_disease(father_diseases, disease_pattern)
  test_df[f'Father has {disease}'] = classify_disease(father_diseases, disease_pattern)
  test_df[f'Father has {disease}'] = classify_disease(father_diseases, disease_pattern)
  test_df[f'Father has {disease}'] = classify_disease(father_diseases, disease_pattern)
  test_df[f'Father has {disease}'] = classify_disease(father_diseases, disease_pattern)
  test_df[f'Father has {disease}'] = classify_disease(father_diseases, disease_pattern)
  test_df[f'Father has {disease}'] = classify_disease(father_diseases, disease_pattern)
  test_df[f'Father has {disease}'] = classify_disease(father_diseases, disease_pattern)
  test_df[f'Father has {disease}'] = classify_disease(father_diseases, disease_pattern)
  test_df[f'Father has {disease}'] = classify_disease(father_diseases, disease_pattern)
  test_df[f'Father has {disease}

In [16]:
# Encode the test categoricals with the loaded encoder (transform only) and
# swap the original columns for their one-hot expansion.
one_hot_encoding = one_hot_encoder.transform(test_df[categorical_features])
test_df = pd.concat(
    [test_df.drop(columns=categorical_features), one_hot_encoding],
    axis=1,
)

In [17]:
# Inspect the fully preprocessed test frame.
test_df

Unnamed: 0,eid,Year of birth - 0,Duration of walks - 0,Number of days/week of moderate physical activity 10+ minutes - 0,Duration of moderate activity - 0,Number of days/week of vigorous physical activity 10+ minutes - 0,Duration of vigorous activity - 0,Time spent watching television (TV) - 0,Time spent using computer - 0,Time spent driving - 0,...,Processed meat intake - 0_-3.0,Processed meat intake - 0_-1.0,Processed meat intake - 0_0.0,Processed meat intake - 0_1.0,Processed meat intake - 0_2.0,Processed meat intake - 0_3.0,Processed meat intake - 0_4.0,Processed meat intake - 0_5.0,Antigen assay QC indicator - 0_1.0,Antigen assay QC indicator - 0_2.0
0,1000048,1939.0,40.00000,7.0,20.000000,1.0,60.000000,2.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1000123,1944.0,30.00000,0.0,59.181126,0.0,40.099832,4.0,-10.0,-10.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1000131,1955.0,52.79135,0.0,59.181126,0.0,40.099832,4.0,2.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1000219,1962.0,15.00000,0.0,59.181126,0.0,40.099832,2.0,-10.0,-10.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1000238,1950.0,20.00000,1.0,20.000000,0.0,40.099832,3.0,-10.0,-10.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100761,6023700,1958.0,30.00000,2.0,10.000000,1.0,-1.000000,0.0,3.0,-10.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
100762,6023778,1950.0,60.00000,1.0,20.000000,0.0,40.099832,3.0,1.0,-10.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
100763,6023994,1965.0,20.00000,7.0,30.000000,3.0,20.000000,2.0,2.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
100764,6024004,1960.0,30.00000,3.0,300.000000,2.0,120.000000,1.0,0.0,-10.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [18]:
# Split the test frame into target and features; 'eid' is dropped together
# with the target, mirroring the training split.
y_test = test_df['Label']
x_test = test_df.drop(columns=['Label', 'eid'])

In [23]:
# Show the hyper-parameters of the loaded model.
print(model)

RandomForestClassifier(bootstrap=False, max_depth=20, min_samples_split=5,
                       n_estimators=300)


In [19]:
# Predict a class label for every test row.
y_predictions = model.predict(x_test)

In [20]:
# Fraction of test rows whose predicted label equals the true label.
test_score = accuracy_score(y_test, y_predictions)
print(f'Accuracy on test set: {test_score}')

Accuracy on test set: 0.8556953734394538


In [21]:
# Rows are true labels, columns are predicted labels. The result gets its own
# name: rebinding `confusion_matrix` would shadow the imported sklearn function
# and make this cell fail with TypeError when it is run a second time.
test_confusion_matrix = confusion_matrix(y_test, y_predictions)
print(test_confusion_matrix)

[[79070     0 12659]
 [  265     0   124]
 [ 1493     0  7155]]


In [22]:
# Per-class precision / recall / F1 on the test set.
# NOTE(review): target_names are applied to the labels in sorted order, so this
# assumes the label values sort as Healthy, Pancreatic Cancer, T2D — confirm.
classification_report_str = classification_report(y_test, y_predictions, target_names=['Healthy', 'Pancreatic Cancer', 'T2D'])
print("Classification Report:\n", classification_report_str)

Classification Report:
                    precision    recall  f1-score   support

          Healthy       0.98      0.86      0.92     91729
Pancreatic Cancer       0.00      0.00      0.00       389
              T2D       0.36      0.83      0.50      8648

         accuracy                           0.86    100766
        macro avg       0.45      0.56      0.47    100766
     weighted avg       0.92      0.86      0.88    100766


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
