In [1]:
data_dir_path = '/content/drive/MyDrive/Lung_cancer'

In [3]:
import os
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve, auc
from sklearn.preprocessing import OneHotEncoder, label_binarize, StandardScaler
import matplotlib.pyplot as plt
import json
import joblib
import seaborn as sns

target_column = 'Lung Cancer Occurrence'
merged_df = pd.read_csv(f'{data_dir_path}/dataset.csv')

In [None]:
import sklearn
sklearn.__version__

'1.2.2'

In [None]:
joblib.__version__

'1.3.2'

In [None]:
merged_df.columns

Index(['Age', 'Gender', 'COPD History', 'Genetic Markers',
       'Air Pollution Exposure', 'Last Weight', 'Current Weight',
       'Start Smoking', 'Taken Bronchodilators', 'Frequency of Tiredness',
       'Dominant Hand', 'Lung Cancer Occurrence'],
      dtype='object')

In [4]:


#merged_df.drop('Unnamed: 0', axis=1, inplace=True)

print(len(merged_df.columns.tolist()))
print(list(merged_df.columns))
X = merged_df.drop(target_column, axis=1)
y = merged_df[target_column]

y_binarized = label_binarize(y, classes=np.unique(y))

categorical_cols = X.select_dtypes(include=['object', 'category']).columns
numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns

with open('/content/drive/MyDrive/Lung_cancer/config.json', 'r') as file:
    param_grid = json.load(file)
def apply_ohe(X):
    ohe = OneHotEncoder(handle_unknown='ignore')
    X_encoded = ohe.fit_transform(X[categorical_cols]).toarray()
    joblib.dump(ohe, f'{data_dir_path}/ohe.pkl')
    return pd.DataFrame(X_encoded, columns=ohe.get_feature_names_out(categorical_cols))

def apply_standardization(X, X_encoded=None):
    if X_encoded is not None:
        X_numerical = X[numerical_cols].reset_index(drop=True)
        X_combined = pd.concat([X_numerical, X_encoded], axis=1)
    else:
        X_combined = X[numerical_cols]
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_combined)
    joblib.dump(scaler, f'{data_dir_path}/scaler.pkl')
    return pd.DataFrame(X_scaled, columns=X_combined.columns)

def get_train_test_data(ohe_required, normalization_required):
    X_processed = X.copy()
    X_encoded = None
    if ohe_required:
        X_encoded = apply_ohe(X)
        X_processed = pd.concat([X[numerical_cols], X_encoded], axis=1)
    if normalization_required:
        X_processed = apply_standardization(X, X_encoded)
    cols = list(X_processed.columns)
    train_cols = []
    with open(f'{data_dir_path}/features.txt', 'r') as fp:
      train_cols = fp.readlines()
      train_cols = [col.strip() for col in train_cols]
    return X_processed

12
['Age', 'Gender', 'COPD History', 'Genetic Markers', 'Air Pollution Exposure', 'Last Weight', 'Current Weight', 'Start Smoking', 'Taken Bronchodilators', 'Frequency of Tiredness', 'Dominant Hand', 'Lung Cancer Occurrence']


In [5]:
rf_estimator = param_grid['RandomForest']
X_test = get_train_test_data(rf_estimator['ohe'], rf_estimator['standardize'])
model = joblib.load(f'{data_dir_path}/RandomForest_model.pkl')
results = {}
y_pred = model.predict(X_test)
accuracy = accuracy_score(y, y_pred)
class_report = classification_report(y, y_pred)
conf_matrix = confusion_matrix(y, y_pred)

results['RandomForest'] = {
            'Accuracy': accuracy,
            'Classification Report': class_report,
            'Confusion Matrix': conf_matrix.tolist()
}

In [7]:
import pandas as pd
pd.__version__

'1.5.3'

In [6]:
results

{'RandomForest': {'Accuracy': 0.8685737147429486,
  'Classification Report': '              precision    recall  f1-score   support\n\n           0       0.88      0.83      0.85      4559\n           1       0.86      0.90      0.88      5439\n\n    accuracy                           0.87      9998\n   macro avg       0.87      0.87      0.87      9998\nweighted avg       0.87      0.87      0.87      9998\n',
  'Confusion Matrix': [[3765, 794], [520, 4919]]}}