In [1]:
import pandas as pd
import numpy as np

In [2]:
# Location of the cleaned dataset on disk.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists.
cancer_csv_path = r"D:\Personal Projects\cancer_issue_cleaned.csv"

# Load the dataset and preview its first rows.
cancer = pd.read_csv(cancer_csv_path)
cancer.head()

Unnamed: 0,Age,Gender,Race/Ethnicity,BMI,SmokingStatus,FamilyHistory,CancerType,Stage,TumorSize,TreatmentType,TreatmentResponse,SurvivalMonths,Recurrence,GeneticMarker,HospitalRegion
0,80,Female,Other,23.3,Smoker,Yes,Breast,II,1.7,Combination Therapy,No Response,103,Yes,Not Taken,South
1,76,Male,Caucasian,22.4,Former Smoker,Yes,Colon,IV,4.7,Surgery,No Response,14,Yes,BRCA1,West
2,69,Male,Asian,21.5,Smoker,Yes,Breast,III,8.3,Combination Therapy,Complete Remission,61,Yes,BRCA1,West
3,77,Male,Asian,30.4,Former Smoker,Yes,Prostate,II,1.7,Radiation,Partial Remission,64,No,KRAS,South
4,89,Male,Caucasian,20.9,Smoker,Yes,Lung,IV,7.4,Radiation,No Response,82,Yes,KRAS,South


In [3]:
# Column that the model is asked to predict.
target_variable = 'Recurrence'

# Text-valued feature columns that are one-hot encoded further down.
categorical_variables = [
    'Gender',
    'Race/Ethnicity',
    'SmokingStatus',
    'FamilyHistory',
    'CancerType',
    'Stage',
    'TreatmentType',
    'TreatmentResponse',
    'GeneticMarker',
    'HospitalRegion',
]

## Encoding Variables

#### I am using one-hot encoding since most of the data are of nominal type. For the output feature, I am using label encoding.

In [4]:
from sklearn.preprocessing import OneHotEncoder,LabelEncoder

In [7]:
# One-hot encode every column listed in `categorical_variables`.
# drop='first' omits the first category of each variable, and
# sparse_output=False yields a dense array that can be wrapped in a DataFrame.
one_hot_encoder = OneHotEncoder(sparse_output=False, drop='first')
one_hot_encoded = pd.DataFrame(
    one_hot_encoder.fit_transform(cancer[categorical_variables]),
    columns=one_hot_encoder.get_feature_names_out(categorical_variables),
    # Carry over the source row labels: the encoded frame is later joined to
    # `cancer` column-wise, which aligns rows by index label. Without this the
    # frame gets a fresh 0..n-1 index and rows would misalign (producing NaNs)
    # whenever `cancer` does not have a default index.
    index=cancer.index,
)

In [10]:
# Replace the text labels of the target column with integer codes
# (in the preview that follows, 'No' appears as 0 and 'Yes' as 1).
# The column is addressed through `target_variable` so that the encoding and
# the later X / y split always refer to the same column.
label_encoder = LabelEncoder()
cancer[target_variable] = label_encoder.fit_transform(cancer[target_variable])

In [12]:
# Remove the original text columns, then append their dummy-column
# replacements on the right (column-wise concat).
without_categoricals = cancer.drop(columns=categorical_variables)
cancer = pd.concat([without_categoricals, one_hot_encoded], axis=1)

In [13]:
# Preview the first rows now that the categorical columns have been replaced by dummy columns.
cancer.head()

Unnamed: 0,Age,BMI,TumorSize,SurvivalMonths,Recurrence,Gender_Male,Race/Ethnicity_Asian,Race/Ethnicity_Caucasian,Race/Ethnicity_Hispanic,Race/Ethnicity_Other,...,TreatmentType_Radiation,TreatmentType_Surgery,TreatmentResponse_No Response,TreatmentResponse_Partial Remission,GeneticMarker_EGFR,GeneticMarker_KRAS,GeneticMarker_Not Taken,HospitalRegion_North,HospitalRegion_South,HospitalRegion_West
0,80,23.3,1.7,103,1,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,76,22.4,4.7,14,1,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,69,21.5,8.3,61,1,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,77,30.4,1.7,64,0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
4,89,20.9,7.4,82,1,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


#### All the categorical variables are one-hot encoded, and the first column of each variable is dropped to avoid the dummy variable trap.

In [44]:
# Write the current `cancer` frame to a CSV file; index=False leaves the row index out of the file.
# NOTE(review): this cell's execution count (44) is higher than that of the cells that follow it,
# so it was last run after them. Confirm the saved file holds the encoded-only data rather than
# the normalized frame produced further down.
cancer.to_csv('cancer_encoded_dataset.csv', index = False)

## Normalize Numeric Columns

In [14]:
# Continuous feature columns that are rescaled in the cells that follow.
numeric_columns = [
    'Age',
    'BMI',
    'TumorSize',
    'SurvivalMonths',
]

In [23]:
from sklearn.preprocessing import MinMaxScaler

In [28]:
# Rescale the numeric columns with a min-max scaler and keep the result as a
# DataFrame that shares the row index and column names of the source data.
# NOTE(review): the scaler is fit on every row before the train/test split
# that happens later in the notebook, so the held-out rows influence the
# scaling. Consider fitting on the training rows only.
normalize = MinMaxScaler()
scaled_values = normalize.fit_transform(cancer[numeric_columns])
normalize_numeric = pd.DataFrame(
    scaled_values,
    index=cancer.index,
    columns=numeric_columns,
)

In [29]:
# Replace the raw numeric columns with their scaled counterparts; the scaled
# columns are appended on the right, so they end up last in the frame.
without_raw_numerics = cancer.drop(columns=numeric_columns)
cancer = pd.concat([without_raw_numerics, normalize_numeric], axis=1)

In [30]:
# Preview the first rows after the numeric columns were swapped for their scaled versions.
cancer.head()

Unnamed: 0,Recurrence,Gender_Male,Race/Ethnicity_Asian,Race/Ethnicity_Caucasian,Race/Ethnicity_Hispanic,Race/Ethnicity_Other,SmokingStatus_Non-Smoker,SmokingStatus_Smoker,FamilyHistory_Yes,CancerType_Colon,...,GeneticMarker_EGFR,GeneticMarker_KRAS,GeneticMarker_Not Taken,HospitalRegion_North,HospitalRegion_South,HospitalRegion_West,Age,BMI,TumorSize,SurvivalMonths
0,1,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.861111,0.223256,0.077778,0.857143
1,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.805556,0.181395,0.411111,0.109244
2,1,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.708333,0.139535,0.811111,0.504202
3,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.819444,0.553488,0.077778,0.529412
4,1,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.986111,0.111628,0.711111,0.680672


## Feature Selection

In [32]:
from sklearn.ensemble import RandomForestClassifier  # You can replace this with another estimator
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split

In [33]:
# Separate the target column from the predictor columns.
y = cancer[target_variable]
X = cancer.drop(columns=[target_variable])

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [37]:
# How many features recursive feature elimination should keep.
n_features_to_select = 15

# Seeded random forest used as the estimator inside RFE.
model = RandomForestClassifier(random_state=42)

# Run the elimination on the training split only.
rfe = RFE(
    estimator=model,
    n_features_to_select=n_features_to_select,
)
rfe.fit(X_train, y_train)

# rfe.support_ is a boolean mask over the training columns; index with it to
# get the names of the columns that were kept, and display them.
selected_features = X_train.columns[rfe.support_]
selected_features

Index(['Gender_Male', 'Race/Ethnicity_Other', 'SmokingStatus_Smoker',
       'FamilyHistory_Yes', 'Stage_II', 'Stage_IV', 'TreatmentType_Surgery',
       'TreatmentResponse_No Response', 'TreatmentResponse_Partial Remission',
       'GeneticMarker_EGFR', 'HospitalRegion_South', 'Age', 'BMI', 'TumorSize',
       'SurvivalMonths'],
      dtype='object')

In [41]:
# Pair every training column with the rank RFE assigned to it, then order the
# table by ascending rank and show its first rows.
ranking_table = pd.DataFrame({'Feature': X_train.columns, 'Ranking': rfe.ranking_})
rfe_ranking = ranking_table.sort_values(by='Ranking')

rfe_ranking.head()

Unnamed: 0,Feature,Ranking
0,Gender_Male,1
28,BMI,1
27,Age,1
25,HospitalRegion_South,1
21,GeneticMarker_EGFR,1


#### The **Recursive Feature Elimination (RFE)** technique is used for feature selection because it eliminates features based on their importance to the chosen model. I selected the top 15 features to reduce model complexity and enhance performance, ensuring a balance between accuracy and efficiency.