In [None]:
import pandas as pd
import pyreadstat
from sklearn.impute import KNNImputer
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.svm import LinearSVC

In [2]:
# Read the SAS file
df, meta = pyreadstat.read_sas7bdat("respir.sas7bdat")

# Save to CSV
output_file = "resp_file.csv"
df.to_csv(output_file, index=False)

print("File successfully converted to CSV.")


File successfully converted to CSV.


In [3]:
df = pd.read_csv("resp_file.csv")
df

  df = pd.read_csv("resp_file.csv")


Unnamed: 0,CASENUM,REG,MAR_STAT,RACE,ORIGIN,NHIA,SEX,AGE_DX,YR_BRTH,SEQ_NUM,...,ADJM_6VALUE,ADJAJCCSTG,CS7SITE,CS9SITE,CS12SITE,HER2,BRST_SUB,PLC_BRTH_CNTRY,PLC_BRTH_STATE,ANNARBOR
0,1,1501,5,2,0,0,1,63,1922.0,2,...,,,,,,9,9,USA,TX,8
1,6,1501,1,11,0,0,1,47,1945.0,0,...,,,,,,9,9,LAO,XX,8
2,34,1501,1,1,0,0,1,57,1927.0,1,...,,,,,,9,9,USA,CA,8
3,39,1501,5,1,0,0,2,63,1944.0,2,...,,,,,,9,9,USA,CA,8
4,43,1501,2,1,0,0,2,39,1952.0,0,...,,,,,,9,9,USA,CA,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1058248,98849613,1547,5,1,0,0,1,80,1931.0,0,...,,,,,,9,9,USA,GA,8
1058249,98849622,1547,2,1,0,0,1,86,1925.0,0,...,,,,,,9,9,USA,GA,8
1058250,98849624,1547,5,1,0,0,2,106,1905.0,0,...,,,,,,9,9,USA,AL,8
1058251,98849630,1547,5,1,0,0,1,78,1928.0,0,...,,,,,,9,9,USA,GA,8


In [4]:

# Calculate missing value counts
missing_counts = df.isnull().sum()

# Filter columns with missing values and sort in descending order
missing_counts = missing_counts[missing_counts > 0].sort_values(ascending=False)

# Convert to DataFrame for better readability
missing_df = missing_counts.to_frame(name='Missing_Values')

# Display the result
display(missing_df)


Unnamed: 0,Missing_Values
CS12SITE,1058253
CS_SSF13,1058253
CS_SSF15,1058253
CS_SSF16,1058253
VASINV,1058253
...,...
HST_STGA,191921
EOD10_NE,183462
EOD10_PN,183462
IHS,167112


In [5]:
total_rows = len(df)

# Calculate missing percentage
missing_percent = (missing_counts / total_rows) * 100

# Combine into a summary DataFrame
missing_summary = pd.DataFrame({
    'Missing_Values': missing_counts,
    'Missing_%': missing_percent.round(2)
})

# Filter and sort columns with missing values
missing_summary = missing_summary[missing_summary['Missing_Values'] > 0]
missing_summary = missing_summary.sort_values(by='Missing_Values', ascending=False)

# Display
display(missing_summary)

Unnamed: 0,Missing_Values,Missing_%
CS12SITE,1058253,100.00
RECONST,1058253,100.00
CS_SSF11,1058253,100.00
CS9SITE,1058253,100.00
EOD10_PE,1058253,100.00
...,...,...
HST_STGA,191921,18.14
EOD10_NE,183462,17.34
EOD10_PN,183462,17.34
IHS,167112,15.79


In [6]:
total_rows = len(df)
missing_percent = (df.isnull().sum() / total_rows) * 100

# Step 2: Drop columns with more than 70% missing values
columns_to_drop = missing_percent[missing_percent > 70].index.tolist()
df_cleaned = df.drop(columns=columns_to_drop)
df_cleaned.head()


Unnamed: 0,CASENUM,REG,MAR_STAT,RACE,ORIGIN,NHIA,SEX,AGE_DX,YR_BRTH,SEQ_NUM,...,CSSCHEMA,SRV_TIME_MON,SRV_TIME_MON_FLAG,SRV_TIME_MON_PA,SRV_TIME_MON_FLAG_PA,HER2,BRST_SUB,PLC_BRTH_CNTRY,PLC_BRTH_STATE,ANNARBOR
0,1,1501,5,2,0,0,1,63,1922.0,2,...,47,84,1,84,1,9,9,USA,TX,8
1,6,1501,1,11,0,0,1,47,1945.0,0,...,47,147,1,147,1,9,9,LAO,XX,8
2,34,1501,1,1,0,0,1,57,1927.0,1,...,47,86,1,86,1,9,9,USA,CA,8
3,39,1501,5,1,0,0,2,63,1944.0,2,...,48,0,1,0,1,9,9,USA,CA,8
4,43,1501,2,1,0,0,2,39,1952.0,0,...,47,20,1,20,1,9,9,USA,CA,8


In [7]:
# Updated column list — no TNM or overall staging fields
columns_to_keep = [
    'AGE_DX', 'SEX', 'RACE','YR_BRTH',           
    'GRADE', 'HISTO3V', 'BEHO3V',                                   
    'TUMOR_1V', 'TUMOR_2V', 'TUMOR_3V',                              # Tumor groupings
                                          
    'SURGPRIM', 'NO_SURG', 'SS_SURG',                               # Surgery
    'RADIATN', 'RAD_SURG',                                          # Radiation
    'TYPEFUP', 'NUMPRIMS', 'FIRSTPRM',                              # Follow-up, recurrence
    'STAT_REC', 'DTH_CLASS', 'O_DTH_CLASS',                         # Death status
    'EXTEVAL', 'NODEEVAL', 'METSEVAL',                              # Disease evaluation status
    'INTPRIM', 'ICDOT10V',                                          # Coding, integration
    'SRV_TIME_MON'                                                 # Survival duration for target variable
]

# Keep only these columns in your DataFrame (replace `resp_df` with your actual DataFrame name)
resp_df = df_cleaned[columns_to_keep]


In [8]:
resp_df


Unnamed: 0,AGE_DX,SEX,RACE,YR_BRTH,GRADE,HISTO3V,BEHO3V,TUMOR_1V,TUMOR_2V,TUMOR_3V,...,FIRSTPRM,STAT_REC,DTH_CLASS,O_DTH_CLASS,EXTEVAL,NODEEVAL,METSEVAL,INTPRIM,ICDOT10V,SRV_TIME_MON
0,63,1,2,1922.0,9,8140,3,9.0,9.0,9.0,...,0,4,9,9,,,,1,C341,84
1,47,1,11,1945.0,3,8140,3,9.0,9.0,9.0,...,1,4,0,1,,,,1,C343,147
2,57,1,1,1927.0,2,8140,3,9.0,9.0,9.0,...,1,4,1,0,,,,1,C343,86
3,63,2,1,1944.0,1,8801,3,,,,...,0,4,9,9,3.0,0.0,0.0,1,C381,0
4,39,2,1,1952.0,1,8012,3,9.0,9.0,9.0,...,1,4,1,0,,,,1,C343,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1058248,80,1,1,1931.0,9,8000,3,,,,...,1,4,1,0,9.0,9.0,9.0,1,C349,9999
1058249,86,1,1,1925.0,9,8000,3,,,,...,1,4,1,0,9.0,9.0,9.0,1,C349,9999
1058250,106,2,1,1905.0,9,8000,3,,,,...,1,4,1,0,9.0,9.0,9.0,1,C349,9999
1058251,78,1,1,1928.0,9,8000,3,,,,...,1,4,1,0,0.0,0.0,0.0,1,C341,42


In [9]:
columns_to_drop = ['HER2', 'BRST_SUB', 'PLC_BRTH_CNTRY', 'PLC_BRTH_STATE', 'ANNARBOR']

# Drop those columns from your cleaned DataFrame
resp_df = resp_df.drop(columns=columns_to_drop, errors='ignore')


In [10]:
resp_df

Unnamed: 0,AGE_DX,SEX,RACE,YR_BRTH,GRADE,HISTO3V,BEHO3V,TUMOR_1V,TUMOR_2V,TUMOR_3V,...,FIRSTPRM,STAT_REC,DTH_CLASS,O_DTH_CLASS,EXTEVAL,NODEEVAL,METSEVAL,INTPRIM,ICDOT10V,SRV_TIME_MON
0,63,1,2,1922.0,9,8140,3,9.0,9.0,9.0,...,0,4,9,9,,,,1,C341,84
1,47,1,11,1945.0,3,8140,3,9.0,9.0,9.0,...,1,4,0,1,,,,1,C343,147
2,57,1,1,1927.0,2,8140,3,9.0,9.0,9.0,...,1,4,1,0,,,,1,C343,86
3,63,2,1,1944.0,1,8801,3,,,,...,0,4,9,9,3.0,0.0,0.0,1,C381,0
4,39,2,1,1952.0,1,8012,3,9.0,9.0,9.0,...,1,4,1,0,,,,1,C343,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1058248,80,1,1,1931.0,9,8000,3,,,,...,1,4,1,0,9.0,9.0,9.0,1,C349,9999
1058249,86,1,1,1925.0,9,8000,3,,,,...,1,4,1,0,9.0,9.0,9.0,1,C349,9999
1058250,106,2,1,1905.0,9,8000,3,,,,...,1,4,1,0,9.0,9.0,9.0,1,C349,9999
1058251,78,1,1,1928.0,9,8000,3,,,,...,1,4,1,0,0.0,0.0,0.0,1,C341,42


In [11]:
resp_df = resp_df.dropna(subset = ['YR_BRTH'])
resp_df

Unnamed: 0,AGE_DX,SEX,RACE,YR_BRTH,GRADE,HISTO3V,BEHO3V,TUMOR_1V,TUMOR_2V,TUMOR_3V,...,FIRSTPRM,STAT_REC,DTH_CLASS,O_DTH_CLASS,EXTEVAL,NODEEVAL,METSEVAL,INTPRIM,ICDOT10V,SRV_TIME_MON
0,63,1,2,1922.0,9,8140,3,9.0,9.0,9.0,...,0,4,9,9,,,,1,C341,84
1,47,1,11,1945.0,3,8140,3,9.0,9.0,9.0,...,1,4,0,1,,,,1,C343,147
2,57,1,1,1927.0,2,8140,3,9.0,9.0,9.0,...,1,4,1,0,,,,1,C343,86
3,63,2,1,1944.0,1,8801,3,,,,...,0,4,9,9,3.0,0.0,0.0,1,C381,0
4,39,2,1,1952.0,1,8012,3,9.0,9.0,9.0,...,1,4,1,0,,,,1,C343,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1058248,80,1,1,1931.0,9,8000,3,,,,...,1,4,1,0,9.0,9.0,9.0,1,C349,9999
1058249,86,1,1,1925.0,9,8000,3,,,,...,1,4,1,0,9.0,9.0,9.0,1,C349,9999
1058250,106,2,1,1905.0,9,8000,3,,,,...,1,4,1,0,9.0,9.0,9.0,1,C349,9999
1058251,78,1,1,1928.0,9,8000,3,,,,...,1,4,1,0,0.0,0.0,0.0,1,C341,42


In [12]:
# Make a copy of the DataFrame to avoid altering the original
df_encoded = resp_df.copy()

# Identify columns you want to impute
categorical_like_cols = ['TUMOR_1V', 'TUMOR_2V', 'TUMOR_3V', 'SURGPRIM', 'EXTEVAL', 'NODEEVAL', 'METSEVAL','SS_SURG']

# Apply label encoding (since KNNImputer works with numeric data)
encoders = {}
for col in categorical_like_cols:
    if col in df_encoded.columns:
        le = LabelEncoder()
        # Only fit non-null values
        df_encoded[col] = df_encoded[col].astype(str)  # Handle NaNs as 'nan' strings temporarily
        encoders[col] = le.fit(df_encoded[col])
        df_encoded[col] = le.transform(df_encoded[col])


In [13]:
###IMPUTING MISSING VALUES USING KNN-IMPUTATION
# Apply KNN Imputer
imputer = KNNImputer(n_neighbors=10)
imputed_array = imputer.fit_transform(df_encoded[categorical_like_cols])

# Round to nearest integer to restore category labels
df_encoded[categorical_like_cols] = np.round(imputed_array).astype(int)



In [14]:
df_encoded

Unnamed: 0,AGE_DX,SEX,RACE,YR_BRTH,GRADE,HISTO3V,BEHO3V,TUMOR_1V,TUMOR_2V,TUMOR_3V,...,FIRSTPRM,STAT_REC,DTH_CLASS,O_DTH_CLASS,EXTEVAL,NODEEVAL,METSEVAL,INTPRIM,ICDOT10V,SRV_TIME_MON
0,63,1,2,1922.0,9,8140,3,0,0,0,...,0,4,9,9,8,8,8,1,C341,84
1,47,1,11,1945.0,3,8140,3,0,0,0,...,1,4,0,1,8,8,8,1,C343,147
2,57,1,1,1927.0,2,8140,3,0,0,0,...,1,4,1,0,8,8,8,1,C343,86
3,63,2,1,1944.0,1,8801,3,1,1,1,...,0,4,9,9,3,0,0,1,C381,0
4,39,2,1,1952.0,1,8012,3,0,0,0,...,1,4,1,0,8,8,8,1,C343,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1058248,80,1,1,1931.0,9,8000,3,1,1,1,...,1,4,1,0,7,7,7,1,C349,9999
1058249,86,1,1,1925.0,9,8000,3,1,1,1,...,1,4,1,0,7,7,7,1,C349,9999
1058250,106,2,1,1905.0,9,8000,3,1,1,1,...,1,4,1,0,7,7,7,1,C349,9999
1058251,78,1,1,1928.0,9,8000,3,1,1,1,...,1,4,1,0,0,0,0,1,C341,42


In [15]:
missing_counts = df_encoded.isnull().sum()
missing_counts

AGE_DX          0
SEX             0
RACE            0
YR_BRTH         0
GRADE           0
HISTO3V         0
BEHO3V          0
TUMOR_1V        0
TUMOR_2V        0
TUMOR_3V        0
SURGPRIM        0
NO_SURG         0
SS_SURG         0
RADIATN         0
RAD_SURG        0
TYPEFUP         0
NUMPRIMS        0
FIRSTPRM        0
STAT_REC        0
DTH_CLASS       0
O_DTH_CLASS     0
EXTEVAL         0
NODEEVAL        0
METSEVAL        0
INTPRIM         0
ICDOT10V        0
SRV_TIME_MON    0
dtype: int64

In [16]:
df_encoded

Unnamed: 0,AGE_DX,SEX,RACE,YR_BRTH,GRADE,HISTO3V,BEHO3V,TUMOR_1V,TUMOR_2V,TUMOR_3V,...,FIRSTPRM,STAT_REC,DTH_CLASS,O_DTH_CLASS,EXTEVAL,NODEEVAL,METSEVAL,INTPRIM,ICDOT10V,SRV_TIME_MON
0,63,1,2,1922.0,9,8140,3,0,0,0,...,0,4,9,9,8,8,8,1,C341,84
1,47,1,11,1945.0,3,8140,3,0,0,0,...,1,4,0,1,8,8,8,1,C343,147
2,57,1,1,1927.0,2,8140,3,0,0,0,...,1,4,1,0,8,8,8,1,C343,86
3,63,2,1,1944.0,1,8801,3,1,1,1,...,0,4,9,9,3,0,0,1,C381,0
4,39,2,1,1952.0,1,8012,3,0,0,0,...,1,4,1,0,8,8,8,1,C343,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1058248,80,1,1,1931.0,9,8000,3,1,1,1,...,1,4,1,0,7,7,7,1,C349,9999
1058249,86,1,1,1925.0,9,8000,3,1,1,1,...,1,4,1,0,7,7,7,1,C349,9999
1058250,106,2,1,1905.0,9,8000,3,1,1,1,...,1,4,1,0,7,7,7,1,C349,9999
1058251,78,1,1,1928.0,9,8000,3,1,1,1,...,1,4,1,0,0,0,0,1,C341,42


In [17]:
df_encoded.to_csv("respiratory_cleaned_enc.csv", index=False)


In [18]:
#df_encoded['SRV_TIME_MON'].unique()
df_encoded['GRADE'].unique()
#df_encoded['SURGPRIM'].unique() 
#df_encoded['SS_SURG'].unique() 
#df_encoded['RADIATN'].unique()
#df_encoded['ICDOT10V'].unique()


array([9, 3, 2, 1, 4])

In [19]:
df_encoded['NO_SURG'].unique()

array([0, 6, 1, 2, 9, 7, 8, 5])

In [20]:
df_encoded = df_encoded[df_encoded['SRV_TIME_MON'] != 9999]
df_encoded = df_encoded[df_encoded['GRADE'] != 9]


In [21]:
df_encoded

Unnamed: 0,AGE_DX,SEX,RACE,YR_BRTH,GRADE,HISTO3V,BEHO3V,TUMOR_1V,TUMOR_2V,TUMOR_3V,...,FIRSTPRM,STAT_REC,DTH_CLASS,O_DTH_CLASS,EXTEVAL,NODEEVAL,METSEVAL,INTPRIM,ICDOT10V,SRV_TIME_MON
1,47,1,11,1945.0,3,8140,3,0,0,0,...,1,4,0,1,8,8,8,1,C343,147
2,57,1,1,1927.0,2,8140,3,0,0,0,...,1,4,1,0,8,8,8,1,C343,86
3,63,2,1,1944.0,1,8801,3,1,1,1,...,0,4,9,9,3,0,0,1,C381,0
4,39,2,1,1952.0,1,8012,3,0,0,0,...,1,4,1,0,8,8,8,1,C343,20
9,72,2,1,1917.0,4,8000,3,0,0,0,...,1,4,1,0,8,8,8,1,C341,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1058107,56,1,1,1955.0,3,8070,3,1,1,1,...,1,1,0,0,1,0,0,1,C321,3
1058108,73,2,2,1938.0,1,8140,3,1,1,1,...,1,1,0,0,3,3,0,1,C343,0
1058133,73,2,1,1938.0,3,8140,3,1,1,1,...,1,1,0,0,3,3,0,1,C343,1
1058181,62,1,1,1948.0,2,8070,3,1,1,1,...,1,1,0,0,0,0,0,1,C341,3


In [22]:
respiratory_df = df_encoded
respiratory_df['YR_BRTH'] = respiratory_df['YR_BRTH'].astype(int)


In [23]:
respiratory_df

Unnamed: 0,AGE_DX,SEX,RACE,YR_BRTH,GRADE,HISTO3V,BEHO3V,TUMOR_1V,TUMOR_2V,TUMOR_3V,...,FIRSTPRM,STAT_REC,DTH_CLASS,O_DTH_CLASS,EXTEVAL,NODEEVAL,METSEVAL,INTPRIM,ICDOT10V,SRV_TIME_MON
1,47,1,11,1945,3,8140,3,0,0,0,...,1,4,0,1,8,8,8,1,C343,147
2,57,1,1,1927,2,8140,3,0,0,0,...,1,4,1,0,8,8,8,1,C343,86
3,63,2,1,1944,1,8801,3,1,1,1,...,0,4,9,9,3,0,0,1,C381,0
4,39,2,1,1952,1,8012,3,0,0,0,...,1,4,1,0,8,8,8,1,C343,20
9,72,2,1,1917,4,8000,3,0,0,0,...,1,4,1,0,8,8,8,1,C341,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1058107,56,1,1,1955,3,8070,3,1,1,1,...,1,1,0,0,1,0,0,1,C321,3
1058108,73,2,2,1938,1,8140,3,1,1,1,...,1,1,0,0,3,3,0,1,C343,0
1058133,73,2,1,1938,3,8140,3,1,1,1,...,1,1,0,0,3,3,0,1,C343,1
1058181,62,1,1,1948,2,8070,3,1,1,1,...,1,1,0,0,0,0,0,1,C341,3


In [24]:
respiratory_df['Survival_5yr'] = np.where(respiratory_df['SRV_TIME_MON'] >= 60, 1, 0)
respiratory_df
respiratory_df = respiratory_df.drop(columns = ['ICDOT10V'])


In [25]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

In [26]:

leakage_columns = ['SRV_TIME_MON', 'TYPEFUP', 'DTH_CLASS', 'O_DTH_CLASS', 'RAD_SURG', 'RADIATN']
X = respiratory_df.drop(columns=['Survival_5yr'] + leakage_columns)
y = respiratory_df['Survival_5yr']

x_train, x_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42,
    stratify=y
)

# 2. Build a pipeline: scaling + logistic regression
pipeline = make_pipeline(
    StandardScaler(),
    LogisticRegression(
        solver='lbfgs', 
        C=10.0,
        max_iter=1000,
        random_state=42,
        class_weight = 'balanced'
    )
)

# 3. Perform cross-validation on training data
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
cv_results = cross_validate(
    pipeline, x_train, y_train, 
    cv=5, 
    scoring=scoring, 
    return_train_score=True
)

# 4. Fit the model on full training data
pipeline.fit(x_train, y_train)

# 5. Predict on test data
y_pred = pipeline.predict(x_test)

# 6. Evaluate
print("Training Accuracy (fit):", pipeline.score(x_train, y_train))
print("Testing Accuracy:", pipeline.score(x_test, y_test))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# 7. Cross-validation summary
cv_df = pd.DataFrame(cv_results)
print("\nCross-Validation Results (on Training Data):")
print(cv_df[['train_accuracy', 'test_accuracy', 'test_precision', 'test_recall', 'test_f1', 'test_roc_auc']].mean())


Training Accuracy (fit): 0.7870311861130729
Testing Accuracy: 0.7880812548786381

Confusion Matrix:
 [[71615 19255]
 [ 3550 13192]]

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.79      0.86     90870
           1       0.41      0.79      0.54     16742

    accuracy                           0.79    107612
   macro avg       0.68      0.79      0.70    107612
weighted avg       0.87      0.79      0.81    107612


Cross-Validation Results (on Training Data):
train_accuracy    0.787087
test_accuracy     0.786934
test_precision    0.404581
test_recall       0.783311
test_f1           0.533562
test_roc_auc      0.865770
dtype: float64


In [27]:
leakage_columns = ['SRV_TIME_MON', 'TYPEFUP', 'DTH_CLASS', 'O_DTH_CLASS', 'RAD_SURG', 'RADIATN']
X = respiratory_df.drop(columns=['Survival_5yr'] + leakage_columns)
y = respiratory_df['Survival_5yr']

x_train, x_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42,
    stratify=y
)

# Step 2: Build pipeline (scaling is optional for RF, can be removed)
pipeline_rf = make_pipeline(
    StandardScaler(),  # Optional for tree-based models
    RandomForestClassifier(
        n_estimators=100,
        max_depth=None,
        class_weight='balanced',  # Handles class imbalance
        random_state=42
    )
)

# Step 3: Cross-validation
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
cv_results_rf = cross_validate(
    pipeline_rf, x_train, y_train,
    cv=5,
    scoring=scoring,
    return_train_score=True
)

# Step 4: Train on full training set
pipeline_rf.fit(x_train, y_train)

# Step 5: Predict & evaluate on test set
y_pred_rf = pipeline_rf.predict(x_test)

print("=== Random Forest Results ===")
print("Training Accuracy:", pipeline_rf.score(x_train, y_train))
print("Testing Accuracy:", pipeline_rf.score(x_test, y_test))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))

# Step 6: Show CV metrics
cv_df_rf = pd.DataFrame(cv_results_rf)
print("\nCross-Validation Averages:")
print(cv_df_rf[['train_accuracy', 'test_accuracy', 'test_precision', 'test_recall', 'test_f1', 'test_roc_auc']].mean())

=== Random Forest Results ===
Training Accuracy: 0.9732023380292161
Testing Accuracy: 0.8874567892056648

Confusion Matrix:
 [[85813  5057]
 [ 7054  9688]]

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.94      0.93     90870
           1       0.66      0.58      0.62     16742

    accuracy                           0.89    107612
   macro avg       0.79      0.76      0.77    107612
weighted avg       0.88      0.89      0.88    107612


Cross-Validation Averages:
train_accuracy    0.976384
test_accuracy     0.886971
test_precision    0.660895
test_recall       0.561799
test_f1           0.607301
test_roc_auc      0.887709
dtype: float64


In [28]:
from lightgbm import LGBMClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd

# Step 1: Define leakage columns to drop
leakage_columns = ['SRV_TIME_MON', 'TYPEFUP', 'DTH_CLASS', 'O_DTH_CLASS', 'RAD_SURG', 'RADIATN']

# Step 2: Define X and y
X = respiratory_df.drop(columns=['Survival_5yr'] + leakage_columns)
y = respiratory_df['Survival_5yr']

# Step 3: Train-test split
x_train, x_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# Step 4: Create LightGBM pipeline (no scaling needed for trees)
pipeline_lgbm = make_pipeline(
    LGBMClassifier(class_weight='balanced', random_state=42)
)

# Step 5: Cross-validation
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

cv_results_lgbm = cross_validate(
    pipeline_lgbm, x_train, y_train,
    cv=5,
    scoring=scoring,
    return_train_score=True
)

# Step 6: Fit model on full training data
pipeline_lgbm.fit(x_train, y_train)

# Step 7: Predict on test set
y_pred_lgbm = pipeline_lgbm.predict(x_test)

# Step 8: Evaluation
print("\n=== LightGBM Results ===")
print("Training Accuracy:", pipeline_lgbm.score(x_train, y_train))
print("Testing Accuracy:", pipeline_lgbm.score(x_test, y_test))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_lgbm))
print("\nClassification Report:\n", classification_report(y_test, y_pred_lgbm))

# Step 9: Show cross-validation metrics
cv_df_lgbm = pd.DataFrame(cv_results_lgbm)
print("\nCross-Validation Averages:")
print(cv_df_lgbm[['train_accuracy', 'test_accuracy', 'test_precision', 'test_recall', 'test_f1', 'test_roc_auc']].mean())


[LightGBM] [Info] Number of positive: 53574, number of negative: 290784
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013381 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 525
[LightGBM] [Info] Number of data points in the train set: 344358, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 53573, number of negative: 290785
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011097 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 528
[LightGBM] [Info] Number of data points in the train set: 344358, number of used features: 20
[LightGBM] [Info] 

In [29]:
from xgboost import XGBClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd

# Step 1: Define leakage columns to exclude
leakage_columns = ['SRV_TIME_MON', 'TYPEFUP', 'DTH_CLASS', 'O_DTH_CLASS', 'RAD_SURG', 'RADIATN']

# Step 2: Define features and target
X = respiratory_df.drop(columns=['Survival_5yr'] + leakage_columns)
y = respiratory_df['Survival_5yr']

# Step 3: Train-test split
x_train, x_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# Step 4: Create XGBoost pipeline
# Use scale_pos_weight to help with class imbalance
scale_weight = (y_train == 0).sum() / (y_train == 1).sum()

pipeline_xgb = make_pipeline(
    XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        scale_pos_weight=scale_weight,
        random_state=42
    )
)

# Step 5: Define scoring metrics
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

# Step 6: Cross-validation
cv_results_xgb = cross_validate(
    pipeline_xgb, x_train, y_train,
    cv=5,
    scoring=scoring,
    return_train_score=True
)

# Step 7: Train on full training set
pipeline_xgb.fit(x_train, y_train)

# Step 8: Predict on test set
y_pred_xgb = pipeline_xgb.predict(x_test)

# Step 9: Evaluation
print("\n=== XGBoost Results ===")
print("Training Accuracy:", pipeline_xgb.score(x_train, y_train))
print("Testing Accuracy:", pipeline_xgb.score(x_test, y_test))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))
print("\nClassification Report:\n", classification_report(y_test, y_pred_xgb))

# Step 10: Show cross-validation averages
cv_df_xgb = pd.DataFrame(cv_results_xgb)
print("\nCross-Validation Averages:")
print(cv_df_xgb[['train_accuracy', 'test_accuracy', 'test_precision', 'test_recall', 'test_f1', 'test_roc_auc']].mean())


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.




=== XGBoost Results ===
Training Accuracy: 0.8461904806155447
Testing Accuracy: 0.8427591718395718

Confusion Matrix:
 [[75941 14929]
 [ 1992 14750]]

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.84      0.90     90870
           1       0.50      0.88      0.64     16742

    accuracy                           0.84    107612
   macro avg       0.74      0.86      0.77    107612
weighted avg       0.90      0.84      0.86    107612


Cross-Validation Averages:
train_accuracy    0.846846
test_accuracy     0.843245
test_precision    0.497864
test_recall       0.878298
test_f1           0.635491
test_roc_auc      0.935543
dtype: float64


In [30]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd

# Step 1: Drop leakage columns
leakage_columns = ['SRV_TIME_MON', 'TYPEFUP', 'DTH_CLASS', 'O_DTH_CLASS', 'RAD_SURG', 'RADIATN']
X = respiratory_df.drop(columns=['Survival_5yr'] + leakage_columns)
y = respiratory_df['Survival_5yr']

# Step 2: Train-test split
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Step 3: Build the pipeline
pipeline_svm = make_pipeline(
    StandardScaler(),  # SVMs need scaled data
    LinearSVC(class_weight='balanced', max_iter=10000, random_state=42)
)

# Step 4: Cross-validation
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
cv_results = cross_validate(
    pipeline_svm, x_train, y_train,
    cv=5, scoring=scoring, return_train_score=True
)

# Step 5: Train full model
pipeline_svm.fit(x_train, y_train)
y_pred = pipeline_svm.predict(x_test)

# Step 6: Evaluation
print("\n=== LinearSVC (SVM) Results ===")
print("Training Accuracy:", pipeline_svm.score(x_train, y_train))
print("Testing Accuracy:", pipeline_svm.score(x_test, y_test))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Step 7: Cross-validation results
cv_df = pd.DataFrame(cv_results)
print("\nCross-Validation Averages:")
print(cv_df[['train_accuracy', 'test_accuracy', 'test_precision', 'test_recall', 'test_f1', 'test_roc_auc']].mean())



=== LinearSVC (SVM) Results ===
Training Accuracy: 0.78582081923949
Testing Accuracy: 0.7868546258781548

Confusion Matrix:
 [[71489 19381]
 [ 3556 13186]]

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.79      0.86     90870
           1       0.40      0.79      0.53     16742

    accuracy                           0.79    107612
   macro avg       0.68      0.79      0.70    107612
weighted avg       0.87      0.79      0.81    107612


Cross-Validation Averages:
train_accuracy    0.785915
test_accuracy     0.785795
test_precision    0.403055
test_recall       0.783296
test_f1           0.532230
test_roc_auc      0.864848
dtype: float64
