In [40]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
import networkx as nx
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, classification_report 
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

data = pd.read_csv("data/synthetic_data_lung_cancer.csv")
print(data.columns)

Index(['SUBJECT_ID', 'DEFINITION_ID', 'TIME'], dtype='object')


In [30]:
def calculateAUCWithBaseModel(data):
    label_encoder = LabelEncoder()
    data['DEFINITION_ID_encoded'] = label_encoder.fit_transform(data['DEFINITION_ID'])
    data['condition'] = data['DEFINITION_ID'].apply(lambda x: 'condition' in x)
    data['procedure'] = data['DEFINITION_ID'].apply(lambda x: 'procedure' in x)
    data['drug'] = data['DEFINITION_ID'].apply(lambda x: 'drug' in x)
    data['observation'] = data['DEFINITION_ID'].apply(lambda x: 'observation' in x)
    data['measurement'] = data['DEFINITION_ID'].apply(lambda x: 'measurement' in x)
    data.drop(columns=['DEFINITION_ID'], inplace=True)
    data['time_since_last'] = data.groupby('SUBJECT_ID')['TIME'].diff().fillna(0)
    columns_to_convert = ['condition', 'procedure', 'drug', 'observation', 'measurement']
    for column in columns_to_convert:
        data[column] = data[column].astype(int)
    time_threshold = 1 
    data['DEATH'] = data['time_since_last'].apply(lambda x: 1 if x <= time_threshold else 0)
    X = data[['SUBJECT_ID', 'TIME', 'DEFINITION_ID_encoded', 'condition', 'procedure', 'drug', 'observation', 'measurement']]
    y = data['DEATH']
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    classifier = RandomForestClassifier(class_weight='balanced', random_state=42)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"ROC AUC: {roc_auc}")
    return roc_auc

In [36]:
# Calculate frequency of occurrence for each category in 'DEFINITION_ID' column
frequency_map = data['DEFINITION_ID'].value_counts(normalize=True).to_dict()

# Define thresholds for low and high frequencies
min_frequency = min(frequency_map.values())
max_frequency = max(frequency_map.values())

low_thresholds=[0,0.1,0.2,0.3]
best_data = None
best_AUC = 0
best_threshold = 0
low_threshold = 0
high_threshold = 1
for i in range(len(low_thresholds)):
    low_threshold = low_thresholds[i]
    
    low_threshold_normalized =  min_frequency + (max_frequency - min_frequency) * low_threshold
    high_threshold_normalized = min_frequency + (max_frequency - min_frequency) * high_threshold
    
    # Get categories below the low threshold and above the high threshold
    infrequent_categories = [category for category, freq in frequency_map.items() if freq < low_threshold_normalized]
    frequent_categories = [category for category, freq in frequency_map.items() if freq > high_threshold_normalized]
    
    # Filter rows based on frequency thresholds
    filtered_data = data[~data['DEFINITION_ID'].isin(infrequent_categories + frequent_categories)]
    new_AUC = calculateAUCWithBaseModel(filtered_data.copy())
    if(new_AUC>best_AUC):
        best_data=filtered_data
        best_threshold=low_threshold
        best_AUC=new_AUC
        print(len(filtered_data))
print(best_AUC)
print(best_threshold)


Accuracy: 0.9996880431391774
Precision: 0.9996879040527888
Recall: 1.0
ROC AUC: 0.7941176470588236
560971
Accuracy: 0.9995730588601826
Precision: 0.9995728222594239
Recall: 1.0
ROC AUC: 0.7823529411764706
Accuracy: 0.9995284882313211
Precision: 0.999528151501626
Recall: 1.0
ROC AUC: 0.8010752688172043
392351
Accuracy: 0.9991539035163196
Precision: 0.9991819512615173
Recall: 0.9999712738606495
ROC AUC: 0.7385177470220678
0.8010752688172043
0.2


In [28]:
#save filtered data
#filtered_data.to_csv('data/frequencyFiltered_t-low-'+str(low_threshold)+'_t-high-'+str(high_threshold)+'.csv')
#data = best_data


In [41]:
# Splitting 'DEFINITION_ID' into multiple columns
new_cols = data['DEFINITION_ID'].str.split('_', expand=True)

# Naming the new columns
new_cols.columns = ['CATEGORY', 'CATEGORY_ID']

# Pivoting the DataFrame
pivot_new_cols = new_cols.pivot_table(index=data.index, columns='CATEGORY', values='CATEGORY_ID', aggfunc=lambda x: ', '.join(str(v) for v in x)).fillna('-1')

print(pivot_new_cols.columns)

Index(['condition', 'death', 'drug', 'measurement', 'observation',
       'procedure'],
      dtype='object', name='CATEGORY')


In [42]:
# Concatenating the new columns to the original DataFrame
df = pd.concat([data, pivot_new_cols], axis=1)

# Dropping the original 'DEFINITION_ID' column
df.drop(columns=['DEFINITION_ID'], inplace=True)

# Filling the death column 
df['death'] = df['death'].replace('None', '1')
df['death'] = df['death'].replace('-1', '0')
# Converting column to boolean dtype
df['death'] = df['death'].astype(int)

print(df.columns)
#df.loc[(df['condition']=='')&(df['drug']=='')&(df['measurement']=='')&(df['observation']=='')&(df['procedure']=='')]
print([len(df['condition'].unique()),
len(df['drug'].unique()),
len(df['observation'].unique()),
len(df['procedure'].unique())])

print(['0' in (df['condition'].unique()),
'0' in (df['drug'].unique()),
'0' in (df['observation'].unique()),
'0' in (df['procedure'].unique())])

df.head()

Index(['SUBJECT_ID', 'TIME', 'condition', 'death', 'drug', 'measurement',
       'observation', 'procedure'],
      dtype='object')
[2400, 419, 225, 491]
[False, False, False, False]


Unnamed: 0,SUBJECT_ID,TIME,condition,death,drug,measurement,observation,procedure
0,1,0.004807,-1,0,217,-1,-1,-1
1,1,0.008643,1922,0,-1,-1,-1,-1
2,1,0.027792,785,0,-1,-1,-1,-1
3,1,0.032515,-1,0,49,-1,-1,-1
4,1,0.056765,-1,0,-1,132,-1,-1


In [50]:
#dummies for categorical analysis

#df = pd.get_dummies(df,columns=['condition','drug','measurement','observation','procedure'])
#df.head()
[len(df.loc[df['death']==1]),len(df.loc[df['death']==0, 'SUBJECT_ID'].unique())]


[263, 727]

In [5]:
from sklearn.model_selection import train_test_split

# First, create a list of unique SUBJECT_IDs
unique_subject_ids = df['SUBJECT_ID'].unique()

# Then, split the unique_subject_ids into train and test IDs
train_ids, test_ids = train_test_split(unique_subject_ids, test_size=0.2, random_state=2)

# Filter the main DataFrame into train and test sets based on the selected IDs
train_data = df[df['SUBJECT_ID'].isin(train_ids)]
test_data = df[df['SUBJECT_ID'].isin(test_ids)]

# Separate the features (X) and target variable (y) for train and test sets
X_train = train_data.drop('death',axis=1)#.drop(['condition_-1','drug_-1','measurement_-1','observation_-1','procedure_-1'],axis=1)  # Drop 'SUBJECT_ID' and target column
y_train = pd.Series(train_data.death)

X_test = test_data.drop('death',axis=1)#.drop(['condition_-1','drug_-1','measurement_-1','observation_-1','procedure_-1'],axis=1)  # Drop 'SUBJECT_ID' and target column
y_test = pd.Series(test_data.death)
#old way:
#X = df.drop('death',axis=1).drop(['condition_-1','drug_-1','measurement_-1','observation_-1','procedure_-1'],axis=1)
#y = pd.Series(df.death)
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

print(y_train.head())
X_train.head()

0    0
1    0
2    0
3    0
4    0
Name: death, dtype: int32


Unnamed: 0,SUBJECT_ID,TIME,condition,drug,measurement,observation,procedure
0,1,0.004807,-1,217,-1,-1,-1
1,1,0.008643,1922,-1,-1,-1,-1
2,1,0.027792,785,-1,-1,-1,-1
3,1,0.032515,-1,49,-1,-1,-1
4,1,0.056765,-1,-1,132,-1,-1


In [6]:
#no sampling

In [7]:
#feature selection

from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression  # Replace with an appropriate estimator
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import SelectKBest, f_classif
from scipy.sparse import csr_matrix

#X_train_sparse = csr_matrix(X_train)

# Initialize your estimator 
lr=LogisticRegression(penalty='l1',solver='liblinear')
svc=SVC(kernel="linear")
nb=GaussianNB()

estimator = lr  #change to whatever works best

# Initialize RFECV with the estimator and scoring method
rfecv=RFECV(estimator=estimator, cv=5)
kbest=SelectKBest(score_func=f_classif, k=1000)
selector = kbest

# Fit the selector on your training data
X_train_selected = selector.fit_transform(X_train, y_train)


In [8]:

# Get selected feature indices
selected_feature_indices = selector.get_support(indices=True)

# Get the names of selected features
selected_feature_names = X_train.columns[selected_feature_indices]

# Create DataFrame with selected features
X_train_selected_df = X_train[selected_feature_names]

X_train_selected_df.columns
X_train_selected_df.head()

# no scaling necessary

In [9]:
#save most important features to csv

In [10]:
# do other funstuff