<a href="https://colab.research.google.com/github/Ronydavison007/ml_mini_projects/blob/main/Lung%20Cancer%20Diagnosis/Lung_Cancer_Diagnosis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the dataset path used when
# loading the CSV lives underneath this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
# Location of the CSV on the mounted Google Drive.
data_file = '/content/drive/My Drive/Colab Notebooks/Lung Cancer Diagnosis/dataset_med.csv'

# Parse the CSV, then wrap the parsed table in the DataFrame the rest of the
# notebook works on (`load_data` keeps referring to the parsed result).
load_data = pd.read_csv(filepath_or_buffer=data_file)
data = pd.DataFrame(data=load_data)

In [4]:
# Quick look at the loaded table: first rows, schema, and summary statistics.
print(data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the cell output.
data.info()
print(data.describe())

   id   age  gender      country diagnosis_date cancer_stage family_history  \
0   1  64.0    Male       Sweden     2016-04-05      Stage I            Yes   
1   2  50.0  Female  Netherlands     2023-04-20    Stage III            Yes   
2   3  65.0  Female      Hungary     2023-04-05    Stage III            Yes   
3   4  51.0  Female      Belgium     2016-02-05      Stage I             No   
4   5  37.0    Male   Luxembourg     2023-11-29      Stage I             No   

   smoking_status   bmi  cholesterol_level  hypertension  asthma  cirrhosis  \
0  Passive Smoker  29.4                199             0       0          1   
1  Passive Smoker  41.2                280             1       1          0   
2   Former Smoker  44.0                268             1       1          0   
3  Passive Smoker  43.0                241             1       1          0   
4  Passive Smoker  19.7                178             0       0          0   

   other_cancer treatment_type end_treatment_date 

In [5]:
# Report how many values are missing in each column before imputing anything.
missing_per_column = data.isna().sum()
print(missing_per_column)

# Continuous features: replace gaps with that column's median.
for numeric_col in ('bmi', 'cholesterol_level'):
    column_median = data[numeric_col].median()
    data[numeric_col] = data[numeric_col].fillna(column_median)

# Categorical feature: a missing value becomes its own 'unknown' category
# (using the column's mode would be the alternative).
data['smoking_status'] = data['smoking_status'].fillna(value='unknown')


id                    0
age                   0
gender                0
country               0
diagnosis_date        0
cancer_stage          0
family_history        0
smoking_status        0
bmi                   0
cholesterol_level     0
hypertension          0
asthma                0
cirrhosis             0
other_cancer          0
treatment_type        0
end_treatment_date    0
survived              0
dtype: int64


In [6]:
# Convert both date columns to datetime so they can be subtracted.
for date_col in ('diagnosis_date', 'end_treatment_date'):
    data[date_col] = pd.to_datetime(data[date_col])

# Treatment duration = whole days between diagnosis and end of treatment.
elapsed = data['end_treatment_date'].sub(data['diagnosis_date'])
data['treatment_duration'] = elapsed.dt.days


In [7]:
# Columns expanded into indicator (dummy) columns. drop_first=True omits the
# first level of each column so the remaining indicators are not redundant.
categorical_columns = [
    'gender',
    'smoking_status',
    'cancer_stage',
    'treatment_type',
    'family_history',
    'hypertension',
    'asthma',
    'cirrhosis',
    'other_cancer',
]

# Columns removed from the table after encoding instead of being encoded.
unused_columns = ['id', 'diagnosis_date', 'end_treatment_date', 'country']

data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)
data = data.drop(columns=unused_columns)


In [8]:
target_column = 'survived'

# The label is the 'survived' column; the features are every other column.
y = data[target_column]
x = data.drop(columns=target_column)

# Echo the feature table and then the label series, one after the other.
print(x, y, sep='\n')


         age   bmi  cholesterol_level  treatment_duration  gender_Male  \
0       64.0  29.4                199                 523         True   
1       50.0  41.2                280                 424        False   
2       65.0  44.0                268                 370        False   
3       51.0  43.0                241                 443        False   
4       37.0  19.7                178                 406         True   
...      ...   ...                ...                 ...          ...   
889995  40.0  44.8                243                 237         True   
889996  62.0  21.6                240                 631        False   
889997  48.0  38.6                242                 302        False   
889998  67.0  18.6                194                 721        False   
889999  55.0  42.8                250                 450        False   

        smoking_status_Former Smoker  smoking_status_Never Smoked  \
0                              False      

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
split_options = {'test_size': 0.2, 'random_state': 42}
x_train, x_test, y_train, y_test = train_test_split(x, y, **split_options)

In [10]:
# Random forest classifier. The names `dt` / `dt_pred` are kept as-is because
# the evaluation cell reads `dt_pred`.
#
# random_state pins the forest's internal randomness (bootstrap samples and
# per-split feature subsets) so the fitted model, and therefore the reported
# metrics, are reproducible on Restart & Run All; it reuses the seed already
# used for the train/test split.
# n_jobs=-1 fits and predicts with all available cores; for a fixed
# random_state this does not change the resulting model.
dt = RandomForestClassifier(random_state=42, n_jobs=-1)
# train the model
dt.fit(x_train, y_train)
# make predictions on the held-out rows
dt_pred = dt.predict(x_test)

In [11]:
# Each entry: (label printed before the score, metric function, extra kwargs).
# Precision, recall and F1 are averaged across classes weighted by support.
# Note: support-weighted recall is mathematically equal to overall accuracy,
# so those two lines always report the same number.
weighted = {'average': 'weighted'}
metric_table = (
    ("Accuracy of random forest classifier: ", accuracy_score, {}),
    ("Precision of random forest classifier: ", precision_score, weighted),
    ("Recall of random forest classifier: ", recall_score, weighted),
    ("F1-Score of random forest classifier: ", f1_score, weighted),
)

for label, metric_fn, options in metric_table:
    print(label, metric_fn(y_test, dt_pred, **options))

Accuracy of random forest classifier:  0.7775393258426966
Precision of random forest classifier:  0.6537422336833931
Recall of random forest classifier:  0.7775393258426966
F1-Score of random forest classifier:  0.6823186396437132
