<h1>Frequency selection on the lung cancer dataset</h1>

In [51]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
import networkx as nx
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, classification_report 
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

data = pd.read_csv("data/synthetic_data_lung_cancer.csv")
print(data.columns)

Index(['SUBJECT_ID', 'DEFINITION_ID', 'TIME'], dtype='object')


This function calculates the performance of a base-model on a passed data-set.

In [52]:
def calculateAUCWithBaseModel(data):
    label_encoder = LabelEncoder()
    data['DEFINITION_ID_encoded'] = label_encoder.fit_transform(data['DEFINITION_ID'])
    data['condition'] = data['DEFINITION_ID'].apply(lambda x: 'condition' in x)
    data['procedure'] = data['DEFINITION_ID'].apply(lambda x: 'procedure' in x)
    data['drug'] = data['DEFINITION_ID'].apply(lambda x: 'drug' in x)
    data['observation'] = data['DEFINITION_ID'].apply(lambda x: 'observation' in x)
    data['measurement'] = data['DEFINITION_ID'].apply(lambda x: 'measurement' in x)
    data.drop(columns=['DEFINITION_ID'], inplace=True)
    data['time_since_last'] = data.groupby('SUBJECT_ID')['TIME'].diff().fillna(0)
    columns_to_convert = ['condition', 'procedure', 'drug', 'observation', 'measurement']
    for column in columns_to_convert:
        data[column] = data[column].astype(int)
    time_threshold = 1 
    data['DEATH'] = data['time_since_last'].apply(lambda x: 1 if x <= time_threshold else 0)
    X = data[['SUBJECT_ID', 'TIME', 'DEFINITION_ID_encoded', 'condition', 'procedure', 'drug', 'observation', 'measurement']]
    y = data['DEATH']
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    classifier = RandomForestClassifier(class_weight='balanced', random_state=42)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"ROC AUC: {roc_auc}")
    return roc_auc

Here the frequency map of every DEFENITION_ID is created and a set of threshholds is iterated ofer in order to get the best model performance over a frequency-trimmed data-set.

In [53]:
# Calculate frequency of occurrence for each category in 'DEFINITION_ID' column
frequency_map = data['DEFINITION_ID'].value_counts(normalize=True).to_dict()

# Define thresholds for low and high frequencies
min_frequency = min(frequency_map.values())
max_frequency = max(frequency_map.values())

low_thresholds=[0,0.1,0.2,0.3]
best_data = None
best_AUC = 0
best_threshold = 0
low_threshold = 0
high_threshold = 1
for i in range(len(low_thresholds)):
    low_threshold = low_thresholds[i]
    
    low_threshold_normalized =  min_frequency + (max_frequency - min_frequency) * low_threshold
    high_threshold_normalized = min_frequency + (max_frequency - min_frequency) * high_threshold
    
    # Get categories below the low threshold and above the high threshold
    infrequent_categories = [category for category, freq in frequency_map.items() if freq < low_threshold_normalized]
    frequent_categories = [category for category, freq in frequency_map.items() if freq > high_threshold_normalized]
    
    # Filter rows based on frequency thresholds
    filtered_data = data[~data['DEFINITION_ID'].isin(infrequent_categories + frequent_categories)]
    new_AUC = calculateAUCWithBaseModel(filtered_data.copy())
    if(new_AUC>best_AUC):
        best_data=filtered_data
        best_threshold=low_threshold
        best_AUC=new_AUC
        print(len(filtered_data))
print(best_AUC)
print(best_threshold)


Accuracy: 0.9996880431391774
Precision: 0.9996879040527888
Recall: 1.0
ROC AUC: 0.7941176470588236
560971
Accuracy: 0.9995730588601826
Precision: 0.9995728222594239
Recall: 1.0
ROC AUC: 0.7823529411764706
Accuracy: 0.9995284882313211
Precision: 0.999528151501626
Recall: 1.0
ROC AUC: 0.8010752688172043
392351
Accuracy: 0.9991539035163196
Precision: 0.9991819512615173
Recall: 0.9999712738606495
ROC AUC: 0.7385177470220678
0.8010752688172043
0.2


The best data constellation is saved to a specific file. This path has to be used in workflow.ipynb .

In [41]:
#save filtered data
best_data.to_csv('data/bestFrequencyFiltered_t-low-'+str(low_threshold)+'_t-high-'+str(high_threshold)+'.csv')
data = best_data


Index(['condition', 'death', 'drug', 'measurement', 'observation',
       'procedure'],
      dtype='object', name='CATEGORY')


Index(['SUBJECT_ID', 'TIME', 'condition', 'death', 'drug', 'measurement',
       'observation', 'procedure'],
      dtype='object')
[2400, 419, 225, 491]
[False, False, False, False]


Unnamed: 0,SUBJECT_ID,TIME,condition,death,drug,measurement,observation,procedure
0,1,0.004807,-1,0,217,-1,-1,-1
1,1,0.008643,1922,0,-1,-1,-1,-1
2,1,0.027792,785,0,-1,-1,-1,-1
3,1,0.032515,-1,0,49,-1,-1,-1
4,1,0.056765,-1,0,-1,132,-1,-1


[263, 727]

0    0
1    0
2    0
3    0
4    0
Name: death, dtype: int32


Unnamed: 0,SUBJECT_ID,TIME,condition,drug,measurement,observation,procedure
0,1,0.004807,-1,217,-1,-1,-1
1,1,0.008643,1922,-1,-1,-1,-1
2,1,0.027792,785,-1,-1,-1,-1
3,1,0.032515,-1,49,-1,-1,-1
4,1,0.056765,-1,-1,132,-1,-1
