In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

lung_cancer_data = pd.read_csv('synthetic_data_lung_cancer.csv')

lung_cancer_data['condition'] = lung_cancer_data['DEFINITION_ID'].apply(lambda x: 'condition' in x)
lung_cancer_data['procedure'] = lung_cancer_data['DEFINITION_ID'].apply(lambda x: 'procedure' in x)
lung_cancer_data['drug'] = lung_cancer_data['DEFINITION_ID'].apply(lambda x: 'drug' in x)
lung_cancer_data['observation'] = lung_cancer_data['DEFINITION_ID'].apply(lambda x: 'observation' in x)
lung_cancer_data['measurement'] = lung_cancer_data['DEFINITION_ID'].apply(lambda x: 'measurement' in x)
lung_cancer_data.drop(columns=['DEFINITION_ID'], inplace=True)


In [2]:
max_time_per_patient = lung_cancer_data.groupby('SUBJECT_ID')['TIME'].max()
lung_cancer_data['DEATH'] = lung_cancer_data['SUBJECT_ID'].apply(lambda x: 1 if max_time_per_patient[x] < 1 else 0)
death_distribution = lung_cancer_data['DEATH'].value_counts(normalize=True)
print(death_distribution)

0    0.983156
1    0.016844
Name: DEATH, dtype: float64


In [3]:
pip install -U scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
scaler = StandardScaler()
features = lung_cancer_data.drop(['SUBJECT_ID', 'DEATH'], axis=1)  
features_scaled = scaler.fit_transform(features)
X_train, X_test, y_train, y_test = train_test_split(features_scaled, lung_cancer_data['DEATH'], test_size=0.2, random_state=42)
pca = PCA(n_components=0.95)  
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
rf_classifier = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_classifier.fit(X_train_pca, y_train)
y_pred = rf_classifier.predict(X_test_pca)
print("Classification Report:")
print(classification_report(y_test, y_pred))
auc_score = roc_auc_score(y_test, y_pred)
print(f"AUC Score: {auc_score}")


Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99    110304
           1       0.18      0.18      0.18      1891

    accuracy                           0.97    112195
   macro avg       0.58      0.58      0.58    112195
weighted avg       0.97      0.97      0.97    112195

AUC Score: 0.5839492600958812


In [9]:
lung_cancer_data.head()

Unnamed: 0,SUBJECT_ID,TIME,condition,procedure,drug,observation,measurement,DEATH
0,1,0.004807,False,False,True,False,False,0
1,1,0.008643,True,False,False,False,False,0
2,1,0.027792,True,False,False,False,False,0
3,1,0.032515,False,False,True,False,False,0
4,1,0.056765,False,False,False,False,True,0


In [13]:
display(lung_cancer_data.iloc[3702])

SUBJECT_ID            9
TIME           0.002243
condition         False
procedure         False
drug              False
observation       False
measurement        True
DEATH                 1
Name: 3702, dtype: object

In [12]:
death_rows = lung_cancer_data[lung_cancer_data['DEATH'] == 1]
display(death_rows)

Unnamed: 0,SUBJECT_ID,TIME,condition,procedure,drug,observation,measurement,DEATH
3698,9,0.000214,False,False,False,False,True,1
3699,9,0.000299,False,False,False,False,True,1
3700,9,0.001029,False,False,False,False,True,1
3701,9,0.001126,False,False,False,False,True,1
3702,9,0.002243,False,False,False,False,True,1
...,...,...,...,...,...,...,...,...
560966,984,0.027321,False,False,False,False,True,1
560967,984,0.028739,False,False,False,True,False,1
560968,984,0.030802,False,False,False,False,True,1
560969,984,0.035081,False,False,False,False,True,1
