In [None]:

import tensorflow as tf
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, f1_score,confusion_matrix, mean_absolute_error , r2_score , mean_squared_error, mean_absolute_percentage_error
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import compute_class_weight
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, LeakyReLU,BatchNormalization
from keras import regularizers
from keras.callbacks import EarlyStopping,ReduceLROnPlateau
# from tensorflow.keras.callbacks import EarlyStopping
import warnings
warnings.filterwarnings("ignore")

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import classification_report, confusion_matrix



In [None]:
# Load the patient dataset from the local CSV and preview the first rows.
df = pd.read_csv("./datasets/cancer_patient.csv")
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Column names, non-null counts and dtypes.
df.info()

In [None]:
# Per-column dtypes.
df.dtypes

In [None]:
# Summary statistics for every column, one block per column separated by blank lines.
for column_name in df.columns:
    print(df[column_name].describe(), end="\n\n\n")

In [None]:
# True for each column that contains at least one missing value.
df.isnull().any()

In [None]:
# Number of fully duplicated rows.
df.duplicated().sum()

In [None]:
# Frequency table of the distinct values in every column.
for _, column_values in df.items():
    print(column_values.value_counts())

## Data Processing

In [None]:
# Human-readable names for the ordinal codes 0..9; the position in the list is the code.
labels = dict(enumerate([
    "extremely low",
    "very low",
    "low",
    "below medium average",
    "medium",
    "above medium average",
    "high",
    "very high",
    "extremely high",
    "Maximum",
]))

In [None]:
# Bar chart of how many patients there are at each age.
# The bar height must be the per-age count; using df.shape[0] would draw every
# bar at the total row count and show nothing about the age distribution.
age_counts = df["Age"].value_counts().sort_index()
plt.title("Age Range in Data")
plt.xlabel("Age")
plt.ylabel("Number of patients")
plt.bar(age_counts.index, height=age_counts.values, width=.5)

In [None]:
# Kernel density estimate of the Age column.
plt.figure(figsize = (7,4))
# `fill` is the current seaborn keyword; the older `shade` alias is deprecated.
sns.kdeplot(df.Age, fill = True, color = "g")
plt.title("Age Count", fontsize = 18)
plt.show()


In [None]:
# Feature-only copy: the target and the two identifier columns are removed.
df1 = df.drop(columns=["Level", "Patient Id", "index"])

### Features correlation

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of df1.
corr_fig, corr_ax = plt.subplots(figsize=(24, 24))
sns.heatmap(df1.corr(), annot=True, ax=corr_ax)


In [None]:
# Report every pair of distinct columns whose correlation exceeds 0.8.
# The correlation matrix is computed once and only the pairs above the diagonal
# are visited, so each unordered pair is reported exactly once. De-duplicating
# by the correlation value itself would silently drop a genuinely different
# pair that happens to share the same coefficient.
corr_matrix = df1.corr()
feature_names = list(corr_matrix.columns)

correlationC = []
for position, col in enumerate(feature_names):
    for column in feature_names[position + 1:]:
        a = corr_matrix.loc[col, column]
        if a > 0.8:
            correlationC.append(a)
            print(col + " column has high correlation with column " + column)
            print(a)
            print("\n")

In [None]:
# Strongest correlations first.
correlationC.sort(reverse=True)
correlationC

### Decoding the data

In [None]:
# Replace the integer codes in `df` with readable names (mutates `df` in place;
# later cells that need the numeric codes re-read the CSV).
# NOTE(review): the 1 -> "Female", 2 -> "Male" mapping is taken as-is; confirm
# it against the dataset's data dictionary.
df["Gender"] = df["Gender"].replace([1, 2], ["Female", "Male"])

# Highest code that is decoded for each ordinal column (codes start at 1).
max_code_by_column = {
    "Air Pollution": 8,
    "Dust Allergy": 8,
    "OccuPational Hazards": 8,
    "Genetic Risk": 7,
    "chronic Lung Disease": 7,
    "Fatigue": 9,
    "Weight Loss": 8,
    "Shortness of Breath": 9,
    "Wheezing": 8,
    "Swallowing Difficulty": 8,
    "Clubbing of Finger Nails": 9,
    "Frequent Cold": 7,
    "Dry Cough": 7,
    "Snoring": 7,
    "Obesity": 7,
    "Passive Smoker": 8,
}
for column_name, max_code in max_code_by_column.items():
    codes = list(range(1, max_code + 1))
    df[column_name] = df[column_name].replace(codes, [labels[code] for code in codes])
df

In [None]:
# Row count per decoded "Passive Smoker" level.
df["Passive Smoker"].value_counts()

In [None]:
# Row count per target class.
df.Level.value_counts()

In [None]:
# Pie chart of the percentage of rows in each Level class.
level_share = (df["Level"].value_counts() / df.shape[0] * 100).round(2)
plt.figure(figsize=(6, 7))
plt.title("Level Column Values")
level_share.plot.pie(autopct='%2.1f%%')

### Relation between Occupational Hazards and Genetic Risk

In [None]:
# Scatter of the decoded (string-valued) columns, so both axes are categorical.
f = plt.figure(figsize=(18, 6))
scatter_ax = f.add_subplot()
scatter_ax.set_xlabel("OccuPational Hazards")
scatter_ax.set_ylabel("Genetic Risk")
scatter_ax.set_title("Relation between OccuPational Hazards and Genetic Risk")
scatter_ax.scatter(
    df["OccuPational Hazards"],
    df["Genetic Risk"],
    alpha=0.7,
    c="red",
    linewidths=7,
)


#### Gender

In [None]:
# Side-by-side ("dodge") bars of occupational hazard level, split by gender.
gender_fig, gender_ax = plt.subplots(figsize=(20, 7))
hazard_hist = sns.histplot(
    data=df,
    x=df["OccuPational Hazards"],
    hue=df["Gender"],
    multiple="dodge",
    shrink=.8,
    palette=["red", "blue"],
    ax=gender_ax,
)
hazard_hist.set(title='Relation Between OccuPational Hazards and Gender')

In [None]:
# Row count per decoded gender.
df.Gender.value_counts()

#### Smokers

In [None]:
# Count of rows at each decoded "Passive Smoker" level.
smoker_fig, smoker_ax = plt.subplots(figsize=(20, 7))
smoker_ax.set_title("Number of Passive Smokers")
sns.histplot(data=df, x=df["Passive Smoker"], color="darkcyan", ax=smoker_ax)

#### Alcohol

In [None]:
# Bucket patients into age ranges and plot alcohol use per bucket.
df3 = df.copy()

# Half-open bins [14, 18), [18, 45), [45, 75): every integer age from 14 to 74
# lands in exactly one bucket. Strict inequalities on both sides would leave the
# boundary ages (14, 18, 45, 74) without a label and drop them from the plot.
# Ages outside 14..74 stay unlabelled (NaN).
df3['AgeRange'] = pd.cut(
    df3['Age'],
    bins=[14, 18, 45, 75],
    right=False,
    labels=["Teenager", "Adult", "senile"],
)

sns.catplot(data=df3, x="AgeRange", y="Alcohol use",aspect=20/10,height=5).set(title='Relation Between Age and Alcohol use')

### Relation between Genetic Risk and Chronic Lung Disease

In [None]:
# Overlaid histograms of two decoded columns of `df`.
hist_fig, hist_ax = plt.subplots(figsize=(20, 7))

for column_name, bin_count, bar_color in (
    ('Genetic Risk', 25, 'red'),
    ('chronic Lung Disease', 50, 'blue'),
):
    hist_ax.hist(df[column_name], bins=bin_count, alpha=0.45, color=bar_color)

hist_ax.set_title("Relation Genetic Risk and chronic Lung Disease ")
hist_ax.legend(['Genetic Risk', 'chronic Lung Disease'])

In [None]:
# Re-read the CSV so `df6` holds the original numeric codes, then draw the same
# overlaid histograms on the numeric values.
df6 = pd.read_csv("./datasets/cancer_patient.csv", sep=",",encoding="UTF-8")

raw_hist_fig, raw_hist_ax = plt.subplots(figsize=(20, 7))

for column_name, bin_count, bar_color in (
    ('Genetic Risk', 25, 'red'),
    ('chronic Lung Disease', 50, 'blue'),
):
    raw_hist_ax.hist(df6[column_name], bins=bin_count, alpha=0.45, color=bar_color)

raw_hist_ax.set_title("Relation Genetic Risk and chronic Lung Disease ")
raw_hist_ax.legend(['Genetic Risk', 'chronic Lung Disease'])

In [None]:
# Same scatter as earlier, drawn from the numeric codes in `df6`.
f = plt.figure(figsize=(18, 6))
raw_scatter_ax = f.add_subplot()
raw_scatter_ax.set_xlabel("OccuPational Hazards")
raw_scatter_ax.set_ylabel("Genetic Risk")
raw_scatter_ax.set_title("Relation between OccuPational Hazards and Genetic Risk")
raw_scatter_ax.scatter(
    df6["OccuPational Hazards"],
    df6["Genetic Risk"],
    alpha=0.7,
    c="blue",
    linewidths=7,
)

### Relation between Obesity and Coughing of Blood

In [None]:
# Bar plot of "Coughing of Blood" for each decoded Obesity level in `df`.
obesity_fig = plt.figure(figsize=(18, 6))
obesity_ax = obesity_fig.add_subplot()
obesity_ax.set_title("Relation between Obesity and Coughing of Blood")
sns.barplot(data=df, x="Obesity", y="Coughing of Blood", palette="husl", ax=obesity_ax)

In [None]:
# Same bar plot drawn from the numeric Obesity codes in `df6`.
raw_obesity_fig = plt.figure(figsize=(18, 6))
raw_obesity_ax = raw_obesity_fig.add_subplot()
raw_obesity_ax.set_title("Relation between Obesity and Coughing of Blood")
sns.barplot(data=df6, x="Obesity", y="Coughing of Blood", palette="husl", ax=raw_obesity_ax)

In [None]:
# One box plot per column of df6, arranged on a 5x5 grid.
df6.plot(kind='box', subplots=True, layout=(5,5), figsize=(18,15))
plt.show()

In [None]:
# Boxen (letter-value) plot of the Age column.
plt.title("Age Column Box Plot")
sns.boxenplot(x=df.Age)


In [None]:
# dtypes of df6 before encoding.
df6.dtypes

## Encoding

In [None]:
# Integer-encode the two string columns of df6.
le = LabelEncoder()

# `le` stays fitted on Level so its classes_ / inverse_transform can still map
# the encoded target back to the original class names.
df6['Level'] = le.fit_transform(df6['Level'])
# Patient Id gets its own throwaway encoder; refitting `le` here would
# overwrite the Level classes.
df6['Patient Id'] = LabelEncoder().fit_transform(df6['Patient Id'])

In [None]:
# dtypes of df6 after encoding.
df6.dtypes

In [None]:
# Feature matrix and target vector, both taken from the same frame (df6) so the
# rows are aligned by construction and the label-encoded Level is what the
# models see. Reading the target from a separately loaded, separately mutated
# frame only works while both frames happen to share the same row order.
x = df6.drop(["Level","Patient Id","index"],axis=1).values
y = df6["Level"].values

### Splitting the data

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
x_train, x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [None]:
# Training features.
x_train

In [None]:
# Test features.
x_test

In [None]:
# Training targets.
y_train

In [None]:
# Test targets.
y_test

In [None]:
# Standardise the features; the scaler is fitted on the training rows only and
# then applied unchanged to the test rows.
s_scaler = StandardScaler()
x_train_scaled = s_scaler.fit_transform(x_train)
x_test_scaled = s_scaler.transform(x_test)

In [None]:
def resultsSummarizer(y_true, y_pred, class_names=None):
    """Plot the confusion matrix and print macro-averaged classification scores.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels, in the same order as ``y_true``.
    class_names : sequence of str, optional
        Tick labels for the matrix, one per class in sorted label order.
        Defaults to the string form of the labels found in the data.

    Returns
    -------
    None
        The figure is shown and the scores are printed.
    """
    # Sorted union of the labels that actually occur; this is also the row and
    # column order confusion_matrix is asked to use, so ticks and cells line up.
    # The module-level `labels` dict names feature-intensity levels, not the
    # target classes, so it must not be used for the tick labels.
    class_ids = np.union1d(y_true, y_pred)
    if class_names is None:
        class_names = [str(class_id) for class_id in class_ids]

    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, average='macro')
    rec = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')

    plt.figure(figsize=(10,8))

    sns.heatmap(cm,
                annot=True,
                fmt='d',  # integer counts instead of the default 2-significant-digit format
                cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names
               )

    plt.title('Confusion Matrix')
    plt.xlabel('Predicted Activity')
    plt.ylabel('Actual Activity')
    plt.show()

    print(f'Accuracy Score: {acc:.4%}')
    print(f'Precision Score: {prec:.4%}')
    print(f'Recall Score: {rec:.4%}')
    print(f'F_1 Score: {f1:.4%}')

## Logistic Regression

In [None]:
# Logistic regression with default settings, fitted on the unscaled training features.
# NOTE(review): warnings are silenced in the import cell, so a non-convergence
# warning from the solver would not be visible here — confirm the fit converges.
lr = LogisticRegression()
lr.fit(x_train , y_train)

In [None]:
# Mean accuracy on the training rows.
lr.score(x_train , y_train)

In [None]:
# Mean accuracy on the held-out rows.
lr.score(x_test , y_test)

In [None]:
# Side-by-side view of actual vs. predicted labels for the first 20 test rows.
y_pred = lr.predict(x_test)
data = pd.DataFrame({'y_Test  ': y_test, 'y_pred  ': y_pred})
data.iloc[:20]

In [None]:
# Confusion matrix of the logistic-regression predictions held in `y_pred`.
cm = confusion_matrix(y_test, y_pred)
cm_fig, cm_ax = plt.subplots(figsize=(7, 6))
sns.heatmap(cm, annot=True, cmap='Blues', ax=cm_ax)
cm_ax.set_title('Confusion Matrix')
cm_ax.set_xlabel('Predicted Activity')
cm_ax.set_ylabel('Actual Activity')
plt.show()


## KNN

In [None]:
# 5-nearest-neighbours classifier fitted on the unscaled training features.
knn=KNeighborsClassifier(n_neighbors=5)
knn.fit(x_train,y_train)

In [None]:
# Mean accuracy on the training rows.
knn.score(x_train,y_train)

In [None]:
# Mean accuracy on the held-out rows.
knn.score(x_test,y_test)

In [None]:
# Side-by-side view of actual vs. predicted labels for the first 20 test rows.
y_pred = knn.predict(x_test)
data = pd.DataFrame({'y_Test  ': y_test, 'y_pred  ': y_pred})
data.iloc[:20]

In [None]:
# Second 5-NN model, this time fitted and evaluated on the standardised features.
knn2 = KNeighborsClassifier(n_neighbors=5).fit(x_train_scaled, y_train)
y_pred_knn = knn2.predict(x_test_scaled)

In [None]:
# Confusion matrix and macro scores for the scaled KNN model.
resultsSummarizer(y_test, y_pred_knn)

In [None]:
# Confusion matrix of the scaled KNN predictions.
cm = confusion_matrix(y_test, y_pred_knn)
cm_fig, cm_ax = plt.subplots(figsize=(7, 6))
sns.heatmap(cm, annot=True, cmap='Blues', ax=cm_ax)
cm_ax.set_title('Confusion Matrix')
cm_ax.set_xlabel('Predicted Activity')
cm_ax.set_ylabel('Actual Activity')
plt.show()

## Random Forest

In [None]:
# Random forest of 10 trees (seeded), fitted and evaluated on the standardised features.
rfst = RandomForestClassifier(n_estimators=10, random_state=42).fit(x_train_scaled, y_train)
y_pred_rfst = rfst.predict(x_test_scaled)

In [None]:
# Ground truth goes first: resultsSummarizer(y_true, y_pred). Swapping the two
# exchanges precision with recall and transposes the confusion matrix.
resultsSummarizer(y_test, y_pred_rfst)

In [None]:
# Confusion matrix of the random-forest predictions.
cm = confusion_matrix(y_test, y_pred_rfst)
cm_fig, cm_ax = plt.subplots(figsize=(7, 6))
sns.heatmap(cm, annot=True, cmap='Blues', ax=cm_ax)
cm_ax.set_title('Confusion Matrix')
cm_ax.set_xlabel('Predicted Activity')
cm_ax.set_ylabel('Actual Activity')
plt.show()


## Grid Search

In [None]:
# Exhaustive cross-validated search over C and gamma for an RBF-kernel SVC,
# run on (at most) the first 1000 unscaled training rows.
C_grid = 0.02*np.arange(1,20)
gamma_grid = 0.02*np.arange(1,50)
print(C_grid,gamma_grid)

parameters = dict(C=C_grid, gamma=gamma_grid)
gridCV = GridSearchCV(SVC(kernel='rbf'), parameters, n_jobs=-1)
gridCV.fit(x_train[:1000],y_train[:1000])

best_params = gridCV.best_params_
best_C = best_params['C']
best_gamma = best_params['gamma']

print(f"Best C {best_C}")
print(f"Best Gamma {best_gamma}")

## SVM

In [None]:
## Linear Kernel
# `gamma` is not passed: it only parameterises the rbf/poly/sigmoid kernels and
# is ignored by the linear kernel, so supplying it wrongly suggested it had an effect.
# NOTE(review): best_C was selected by a grid search over an RBF kernel; confirm
# it is also a sensible value for the linear kernel.
lin = SVC(kernel='linear',C=best_C)
lin.fit(x_train, y_train)

In [None]:
# Per-class precision / recall / F1 of the linear SVC on the test rows.
y_pred_Linear = lin.predict(x_test)
print(classification_report(y_test, y_pred_Linear))

In [None]:
# Confusion matrix of the linear-SVC predictions.
cm = confusion_matrix(y_test, y_pred_Linear)
cm_fig, cm_ax = plt.subplots(figsize=(7, 6))
sns.heatmap(cm, annot=True, cmap='Blues', ax=cm_ax)
cm_ax.set_title('Confusion Matrix')
cm_ax.set_xlabel('Predicted Activity')
cm_ax.set_ylabel('Actual Activity')
plt.show()

In [None]:
# Ground truth goes first: resultsSummarizer(y_true, y_pred). Swapping the two
# exchanges precision with recall and transposes the confusion matrix.
resultsSummarizer(y_test, y_pred_Linear)

In [None]:
# RBF Kernel
# Use the C / gamma selected by the RBF grid search above instead of hard-coded
# values, so the tuning step actually drives the model it was run for.
rbf = SVC(kernel='rbf',C=best_C,gamma=best_gamma)
rbf.fit(x_train, y_train)

y_pred_RBF = rbf.predict(x_test)
print(classification_report(y_test, y_pred_RBF))

In [None]:
# Confusion matrix of the RBF-SVC predictions.
cm = confusion_matrix(y_test, y_pred_RBF)
cm_fig, cm_ax = plt.subplots(figsize=(7, 6))
sns.heatmap(cm, annot=True, cmap='Blues', ax=cm_ax)
cm_ax.set_title('Confusion Matrix')
cm_ax.set_xlabel('Predicted Activity')
cm_ax.set_ylabel('Actual Activity')
plt.show()

In [None]:
# Confusion matrix and macro scores for the RBF SVC.
resultsSummarizer(y_test, y_pred_RBF)

## Naive Bayes
### Gaussian Naive Bayes

In [None]:
# Gaussian Naive Bayes: fit on the training rows, then predict the held-out rows.
nb_model = GaussianNB().fit(x_train, y_train)
y_pred_gnb = nb_model.predict(x_test)

In [None]:
# Accuracy, per-class report and confusion matrix for Gaussian Naive Bayes.
print("Naive Bayes Performance Metrics:")
print("Accuracy:", accuracy_score(y_test, y_pred_gnb))
print("\nClassification Report:\n", classification_report(y_test, y_pred_gnb))

cm = confusion_matrix(y_test, y_pred_gnb)
cm_fig, cm_ax = plt.subplots(figsize=(7, 6))
sns.heatmap(cm, annot=True, cmap='Blues', ax=cm_ax)
cm_ax.set_title('Confusion Matrix')
cm_ax.set_xlabel('Predicted Activity')
cm_ax.set_ylabel('Actual Activity')
plt.show()

In [None]:
# Ground truth goes first: resultsSummarizer(y_true, y_pred). Swapping the two
# exchanges precision with recall and transposes the confusion matrix.
resultsSummarizer(y_test, y_pred_gnb)

### Categorical Naive Bayes

In [None]:
# Initialize the Categorical Naive Bayes model.
# CategoricalNB sizes its per-feature tables from the largest category index
# seen during fit. A test row holding a larger index than any training row
# (plausible for a wide-range column such as Age) would then raise an
# IndexError in predict. Declaring the category count per feature from the
# feature values of both splits avoids that; no target information is used.
n_categories = np.concatenate([x_train, x_test]).max(axis=0).astype(int) + 1
nb_model_cat = CategoricalNB(min_categories=n_categories)

# Train the model on the training data
nb_model_cat.fit(x_train, y_train)

# Predict on the test data
y_pred_cnb = nb_model_cat.predict(x_test)

In [None]:
# Accuracy, per-class report and confusion matrix for Categorical Naive Bayes.
print("Categorical Naive Bayes Performance Metrics:")
print("Accuracy:", accuracy_score(y_test, y_pred_cnb))
print("\nClassification Report:\n", classification_report(y_test, y_pred_cnb))

cm = confusion_matrix(y_test, y_pred_cnb)
cm_fig, cm_ax = plt.subplots(figsize=(7, 6))
sns.heatmap(cm, annot=True, cmap='Blues', ax=cm_ax)
cm_ax.set_title('Confusion Matrix')
cm_ax.set_xlabel('Predicted Activity')
cm_ax.set_ylabel('Actual Activity')
plt.show()

In [None]:
# Ground truth goes first: resultsSummarizer(y_true, y_pred). Swapping the two
# exchanges precision with recall and transposes the confusion matrix.
resultsSummarizer(y_test, y_pred_cnb)

# Deep Learning

In [None]:
# Fresh copy of the raw CSV for the neural-network section.
df10 = pd.read_csv("./datasets/cancer_patient.csv", sep=",",encoding="UTF-8")
df10.head()

In [None]:
# Map the three Level names to ordered integer codes in a single pass.
level_codes = {"Low": 0, "Medium": 1, "High": 2}
df10["Level"] = df10["Level"].replace(level_codes).astype("int64")

In [None]:
# Features and one-hot target for the dense network, both from df10 so that the
# target uses the Level column of this frame rather than a column of the
# separately loaded `df`.
x = df10.drop(["Level","Patient Id","index"], axis = 1)
# Explicit float dtype: the dtype get_dummies produces (uint8 vs. bool) depends
# on the pandas version, and the loss expects numeric targets.
y = pd.get_dummies(df10["Level"]).astype("float32")

In [None]:
# Small fully connected classifier with a 3-way softmax output.
model = Sequential([
    Dense(8, activation="relu", input_dim=x.shape[1]),
    Dense(16, activation="relu"),
    Dropout(0.1),
    Dense(8, activation="relu"),
    Dense(3, activation="softmax"),
])

model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
# 30% of the rows are set aside by Keras for validation via validation_split.
history = model.fit(x, y, epochs=50, validation_split=0.3)

In [None]:
# Layer-by-layer overview of the network.
model.summary()

In [None]:
# Training vs. validation accuracy per epoch, read from `history`.
acc_fig, acc_ax = plt.subplots(figsize=(12, 8))
acc_ax.set_xlabel("Number of Epochs")
acc_ax.set_ylabel("Accuracy of Data")
for history_key, curve_label, curve_color in (
    ("accuracy", "Training accuracy", "darkblue"),
    ("val_accuracy", "Validation accuracy", "r"),
):
    acc_ax.plot(history.history[history_key], label=curve_label, marker="o", color=curve_color)
acc_ax.set_title("Training VS Validation Accuracy", fontsize=18)
acc_ax.legend()
plt.show()

In [None]:
# Training vs. validation loss per epoch, read from `history`.
loss_fig, loss_ax = plt.subplots(figsize=(12, 8))
loss_ax.set_xlabel("Number of Epochs")
loss_ax.set_ylabel("Loss in Data")
for history_key, curve_label, curve_color in (
    ("loss", "Training loss", "darkblue"),
    ("val_loss", "Validation loss", "r"),
):
    loss_ax.plot(history.history[history_key], label=curve_label, marker="o", color=curve_color)
loss_ax.set_title("Training VS Validation loss", fontsize=18)
loss_ax.legend()
plt.show()

## ANN

In [None]:
# Fresh copy of the raw CSV for the ANN section.
df11 = pd.read_csv("./datasets/cancer_patient.csv", sep=",",encoding="UTF-8")
df11.head()

In [None]:
# Remove the two identifier columns. Re-running this cell without reloading
# df11 raises a KeyError because the columns are already gone.
df11 = df11.drop(columns=['index', 'Patient Id'])

In [None]:
# Integer-encode the target; this rebinds the notebook-level name `le`.
le = LabelEncoder()
df11['Level'] = le.fit_transform(df11['Level'])

In [None]:
# Feature frame and encoded target series.
X11 = df11.drop(columns=['Level'])
Y11 = df11['Level']

In [None]:
# Standardise every feature column.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-row statistics influence the scaling — confirm this is acceptable.
scaler1 = StandardScaler()
X11 = scaler1.fit_transform(X11)

In [None]:
# 80/20 split with a fixed seed. This rebinds y_train / y_test, which earlier
# sections also use.
X_train, X_test, y_train, y_test = train_test_split(X11, Y11, test_size=0.2, random_state=42)

In [None]:
def preprocess_data(df):
    """Turn the raw patient frame into scaled features and an encoded target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the 'index', 'Patient Id', 'Level' and 'Gender'
        columns alongside the feature columns.

    Returns
    -------
    tuple
        ``(X_scaled, y, le)``: the standardised feature array, the
        integer-encoded 'Level' values, and the fitted LabelEncoder.
    """
    # Everything except the identifiers and the target is a feature.
    features = df.drop(columns=['index', 'Patient Id', 'Level'])
    features['Gender'] = features['Gender'].astype(int)

    # The scaler is fitted on every row passed in.
    X_scaled = StandardScaler().fit_transform(features)

    le = LabelEncoder()
    y = le.fit_transform(df['Level'])

    return X_scaled, y, le

In [None]:
def create_model(input_dim):
    """Build and compile the three-stage dense classifier.

    Each stage is an L2-regularised linear Dense layer, batch normalisation, a
    ReLU Dense layer of the same width, and dropout. The stages are 256, 128
    and 64 units wide and feed a 3-unit softmax output.

    Parameters
    ----------
    input_dim : int
        Number of input features.

    Returns
    -------
    keras.models.Sequential
        Model compiled with Adam (learning rate 0.001), sparse categorical
        cross-entropy loss and the accuracy metric.
    """
    l2_strength = 0.0001
    model = Sequential()

    # First stage: the only one that declares the input width.
    model.add(Dense(256, input_dim=input_dim,
                    kernel_regularizer=regularizers.l2(l2_strength)))
    model.add(BatchNormalization())
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.3))

    # Remaining stages as (units, dropout rate).
    for units, dropout_rate in ((128, 0.3), (64, 0.2)):
        model.add(Dense(units, kernel_regularizer=regularizers.l2(l2_strength)))
        model.add(BatchNormalization())
        model.add(Dense(units, activation='relu'))
        model.add(Dropout(dropout_rate))

    model.add(Dense(3, activation='softmax'))

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [None]:
# Training with cross-validation
def train_model(X, y, n_splits=5):
    """Train a fresh model on each stratified fold and report validation accuracy.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Integer class labels, one per row of ``X``.
    n_splits : int, default 5
        Number of stratified folds.

    Returns
    -------
    tuple
        ``(model, history)`` of the LAST fold only; earlier folds' models are
        discarded after their score is recorded.
    """
    from sklearn.model_selection import StratifiedKFold

    # Positional indexing below (X[train_idx]) is only correct on arrays; a
    # pandas Series or DataFrame would index by label instead.
    X = np.asarray(X)
    y = np.asarray(y)

    # Key each weight by its actual class label rather than by its position, so
    # the mapping stays right even when the labels are not exactly 0..k-1.
    classes = np.unique(y)
    class_weights = compute_class_weight('balanced',
                                         classes=classes,
                                         y=y)
    class_weight_dict = {
        int(class_label): float(weight)
        for class_label, weight in zip(classes, class_weights)
    }

    # Cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_scores = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f'\nFold {fold + 1}/{n_splits}')

        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        model = create_model(X.shape[1])

        # New callback objects per fold so no state carries over between folds.
        early_stopping = EarlyStopping(
            monitor='val_loss',
            patience=15,
            restore_best_weights=True,
            verbose=1
        )
        reduce_lr = ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=7,
            min_lr=1e-6,
            verbose=1
        )

        history = model.fit(
            X_train, y_train,
            epochs=150,
            batch_size=32,
            validation_data=(X_val, y_val),
            callbacks=[early_stopping, reduce_lr],
            class_weight=class_weight_dict,
            verbose=1
        )

        # Evaluate: score[1] is the accuracy metric the model was compiled with.
        score = model.evaluate(X_val, y_val, verbose=0)
        fold_scores.append(score[1])
        print(f'Fold {fold + 1} Validation Accuracy: {score[1]:.4f}')

    print(f'\nMean CV Accuracy: {np.mean(fold_scores):.4f} (+/- {np.std(fold_scores):.4f})')
    return model, history

In [None]:
df11 = pd.read_csv("./datasets/cancer_patient.csv", sep=",",encoding="UTF-8")

X_scaled, y, label_encoder = preprocess_data(df11)

# Set the evaluation rows aside BEFORE training. Cross-validating on every row
# and then scoring on X_test would evaluate the model on rows it was trained
# on, which inflates the reported test metrics.
# NOTE(review): preprocess_data still fits its scaler on all rows, so the
# test rows influence the feature scaling.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)
model, history = train_model(X_train, y_train)

In [None]:
# Layer-by-layer overview of the model returned by train_model.
model.summary()

In [None]:
# Class probabilities for the test rows, then the most probable class per row.
y1_pred = model.predict(X_test)
y1_pred_classes = np.argmax(y1_pred, axis=1)

In [None]:
# Confusion matrix and macro scores for the ANN predictions.
resultsSummarizer(y_test, y1_pred_classes)

In [None]:

# Per-class precision / recall / F1 for the ANN predictions.
class_report = classification_report(y_test, y1_pred_classes)
print('Classification Report:', class_report, sep='\n')

In [None]:
# Training vs. validation accuracy per epoch, read from `history`.
acc_fig, acc_ax = plt.subplots(figsize=(12, 8))
acc_ax.set_xlabel("Number of Epochs")
acc_ax.set_ylabel("Accuracy of Data")
for history_key, curve_label, curve_color in (
    ("accuracy", "Training accuracy", "darkblue"),
    ("val_accuracy", "Validation accuracy", "r"),
):
    acc_ax.plot(history.history[history_key], label=curve_label, marker="o", color=curve_color)
acc_ax.set_title("Training VS Validation Accuracy", fontsize=18)
acc_ax.legend()
plt.show()

In [None]:
# Training vs. validation loss per epoch, read from `history`.
loss_fig, loss_ax = plt.subplots(figsize=(12, 8))
loss_ax.set_xlabel("Number of Epochs")
loss_ax.set_ylabel("Loss in Data")
for history_key, curve_label, curve_color in (
    ("loss", "Training loss", "darkblue"),
    ("val_loss", "Validation loss", "r"),
):
    loss_ax.plot(history.history[history_key], label=curve_label, marker="o", color=curve_color)
loss_ax.set_title("Training VS Validation loss", fontsize=18)
loss_ax.legend()
plt.show()