<a href="https://colab.research.google.com/github/Thesis-AfaanOromooChatGPT2025/MedPromptX/blob/main/Medical_Text_Classification_BiLSTM_%7C_BiGRU_%7CConv1D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
falgunipatel19_biomedical_text_publication_classification_path = kagglehub.dataset_download('falgunipatel19/biomedical-text-publication-classification')

print('Data source import complete.')


# Data Overview

For biomedical text document classification, abstracts and short full papers (six pages or fewer) are readily available and commonly used. This dataset instead focuses on long research papers of more than six pages.
- Dataset includes cancer documents to be classified into 3 categories: 'Thyroid_Cancer', 'Colon_Cancer', 'Lung_Cancer'.
- Total publications: 7569, spread across 3 class labels.
- Number of samples in each category:
    - colon cancer=2579,
    - lung cancer=2180,
    - thyroid cancer=2810

# Import Libraries

In [None]:
import pandas as pd
import numpy as np
from numpy import random
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import os
import time
import re

from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model, Sequential, load_model
from tensorflow.keras.layers import Layer
from tensorflow.keras.layers import (Dense, Flatten, SimpleRNN, Conv1D,
                                     Bidirectional, GRU, LSTM,
                                     BatchNormalization, Dropout, Input, MaxPooling1D,
                                     Embedding, TextVectorization)
from tensorflow.keras.losses import (CategoricalCrossentropy,
                                     SparseCategoricalCrossentropy)
from tensorflow.keras.metrics import Accuracy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import plot_model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud

# Functions

In [None]:
import warnings
warnings.filterwarnings('ignore')

plt.style.use('fivethirtyeight')

In [None]:
# Function to plot loss, accuracy and best epoch

def learning_curves_plot(tr_data, start_epoch):
    """Draw loss and accuracy curves side by side and mark the best epoch on each.

    Args:
        tr_data: Object exposing a ``history`` mapping with the keys
            'accuracy', 'loss', 'val_accuracy' and 'val_loss'
            (e.g. the value returned by ``model.fit``).
        start_epoch: Number of epochs already completed before this run;
            the x-axis is numbered from ``start_epoch + 1``.
    """
    history = tr_data.history
    train_acc, train_loss = history['accuracy'], history['loss']
    val_acc, val_loss = history['val_accuracy'], history['val_loss']

    # 1-based epoch numbers, shifted by the epochs that were run earlier.
    last_epoch = len(train_acc) + start_epoch
    epochs = list(range(start_epoch + 1, last_epoch + 1))

    # Best epoch = lowest validation loss / highest validation accuracy.
    best_loss_idx = np.argmin(val_loss)
    best_acc_idx = np.argmax(val_acc)
    best_loss_epoch = best_loss_idx + 1 + start_epoch
    best_acc_epoch = best_acc_idx + 1 + start_epoch

    plt.style.use('fivethirtyeight')

    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 8))

    # One entry per panel:
    # (axis, train series, train label, val series, val label,
    #  best epoch, best value, title, y-axis label)
    panels = (
        (axes[0], train_loss, 'Training loss', val_loss, 'Validation loss',
         best_loss_epoch, val_loss[best_loss_idx],
         'Training and Validation Loss', 'Loss'),
        (axes[1], train_acc, 'Training Accuracy', val_acc, 'Validation Accuracy',
         best_acc_epoch, val_acc[best_acc_idx],
         'Training and Validation Accuracy', 'Accuracy'),
    )
    for (ax, train_series, train_label, val_series, val_label,
         best_epoch, best_value, title, y_label) in panels:
        ax.plot(epochs, train_series, 'r', label=train_label)
        ax.plot(epochs, val_series, 'g', label=val_label)
        ax.scatter(best_epoch, best_value, s=150, c='blue',
                   label='best epoch= ' + str(best_epoch))
        ax.set_title(title)
        ax.set_xlabel('Epochs')
        ax.set_ylabel(y_label)
        ax.legend()

    plt.tight_layout()
    plt.show()

# Data Exploration

In [None]:
# Read the CSV from the directory kagglehub downloaded the dataset into, instead of a
# hard-coded /kaggle/input path, so the notebook also runs outside Kaggle (e.g. on Colab).
# The file is latin1-encoded.
data = pd.read_csv(
    os.path.join(falgunipatel19_biomedical_text_publication_classification_path,
                 "alldata_1_for_kaggle.csv"),
    encoding="latin1")

In [None]:
data.head()

In [None]:
data.isna().sum()

In [None]:
data.duplicated().sum()

In [None]:
data.columns

In [None]:
data = data.drop('Unnamed: 0', axis =1)

In [None]:
data.info()

In [None]:
data = data.rename(columns={"0": "Target", "a":"Text"})

In [None]:
data["Text"].iloc[0]

In [None]:
data["Target"].value_counts()

In [None]:
# Class frequencies, shown as a bar chart (left) and a pie chart (right).
counts = data["Target"].value_counts()

fig, axs = plt.subplots(1, 2, figsize=(14, 6))

ax1 = sns.barplot(x=counts.index, y=counts.values, ax=axs[0])

# Write the sample count on top of each bar.
for bar, n_samples in zip(ax1.patches, counts.values):
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax1.annotate(str(n_samples),
                 (bar_centre, bar.get_height()),
                 ha='center', va='bottom',
                 fontsize=10)

axs[0].set(title='Number of Samples per Class',
           xlabel='Classes',
           ylabel='Number of Samples')

axs[1].pie(counts.values, labels=counts.index, autopct="%.1f%%")
axs[1].set_title('Distribution of Cancer Types')

plt.tight_layout()
plt.show()

In [None]:
# Draw one word cloud per cancer type. The three per-class cells were identical apart
# from the class name, so they are produced by a single loop here.
for cancer_type in ('Thyroid_Cancer', 'Colon_Cancer', 'Lung_Cancer'):
    # Concatenate every document of this class into one string for WordCloud.
    class_text = ' '.join(data[data['Target'] == cancer_type]['Text'].values)

    plt.figure(figsize=(10, 8))
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(class_text)
    plt.imshow(wordcloud, interpolation='bilinear')
    # 'Thyroid_Cancer' -> 'Word Cloud for Thyroid Cancer'
    plt.title('Word Cloud for ' + cancer_type.replace('_', ' '))
    plt.axis('off')
    plt.show()

# Data Preprocessing

In [None]:
vocab_size = 10000  # Size of the vocabulary
embedding_dim = 128  # Dimension of the word embeddings
max_length = 200  # Maximum length of the sequences

In [None]:
text = data["Text"].values
labels = data["Target"].values

In [None]:
# Build the word index and convert every document to a fixed-length integer sequence.
# NOTE(review): the tokenizer is fitted on the full corpus, before the
# train/validation/test split, so vocabulary statistics from held-out documents leak
# into preprocessing. Consider splitting the raw text first and fitting on the
# training portion only.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(data["Text"])

# Replace each document by the list of its word indices.
sequences = tokenizer.texts_to_sequences(data["Text"])

# Pad/truncate every sequence to max_length tokens.
# NOTE(review): no padding/truncating arguments are passed, so the Keras defaults
# apply (presumably 'pre' for both, i.e. long documents keep their last tokens) - confirm.
padded_sequences = pad_sequences(sequences, maxlen=max_length)

# Data Split

In [None]:
# Stratified 60/20/20 split: hold out 20% for testing, then take 25% of the
# remaining 80% (= 20% of the whole) for validation.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    padded_sequences, labels,
    test_size=0.2, random_state=42, stratify=labels)

X_train, X_valid, y_train, y_valid = train_test_split(
    X_trainval, y_trainval,
    test_size=0.25, random_state=42, stratify=y_trainval)

for split_name, split in (('Train', X_train), ('Validation', X_valid), ('Test', X_test)):
    print(f'{split_name} Set Shape: ', split.shape)

In [None]:
# Map the string class names to integer ids; the mapping is fitted on the training
# labels and then reused unchanged for the validation and test labels.
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_valid = encoder.transform(y_valid)
y_test = encoder.transform(y_test)

# One-hot encode for categorical cross-entropy. The width comes from the fitted
# encoder rather than a hard-coded 3, so it always matches the labels actually seen.
num_classes = len(encoder.classes_)
y_train = to_categorical(y_train, num_classes=num_classes)
y_valid = to_categorical(y_valid, num_classes=num_classes)
y_test = to_categorical(y_test, num_classes=num_classes)

# Modeling

## SimpleRNN

In [None]:
# Baseline: embedding -> single SimpleRNN layer -> softmax over the 3 classes.
# The sequence length is declared once by the Input layer; Embedding's `input_length`
# argument is deprecated in Keras 3 and redundant here, so it is not passed.
rnn_model = Sequential()
rnn_model.add(Input(shape=(max_length,)))
rnn_model.add(Embedding(vocab_size, embedding_dim))
rnn_model.add(SimpleRNN(128))
rnn_model.add(Dense(3, activation='softmax'))

In [None]:
rnn_model.summary()

In [None]:
rnn_model.compile(loss="categorical_crossentropy",
              optimizer="adam",
              metrics=['accuracy'])

In [None]:
early_stopping = EarlyStopping(monitor='val_loss',
                               patience=6,
                               restore_best_weights=True)

reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                              factor=0.2,
                              verbose=1,
                              patience=2,
                              min_lr=1e-6)

callbacks = [early_stopping, reduce_lr]

In [None]:
rnn_history=rnn_model.fit(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    epochs=15,
    batch_size=32,
    callbacks=callbacks)

In [None]:
learning_curves_plot(rnn_history, start_epoch=0)

### Evaluation

In [None]:
rnn_loss, rnn_accuracy = rnn_model.evaluate(X_test, y_test, verbose=0)
print("RNN Loss: ", rnn_loss)
print("RNN Test Accuracy: ", rnn_accuracy)

In [None]:
# Class names in the order of the integer ids produced by the fitted LabelEncoder.
# LabelEncoder sorts its classes, so a hand-written list starting with 'Thyroid_Cancer'
# would attach the wrong names to the report rows and confusion-matrix axes.
class_labels = list(encoder.classes_)

In [None]:
y_pred_rnn = np.argmax(rnn_model.predict(X_test), axis=-1)
y_true = np.argmax(y_test, axis=-1)

print(classification_report(y_true, y_pred_rnn, target_names= class_labels))

In [None]:
conf_matrix_rnn = confusion_matrix(y_true, y_pred_rnn)

plot_confusion_matrix(conf_matrix_rnn,
                      class_names= class_labels,
                      show_normed=True,
                      figsize=(8,6),
                      colorbar=True)

plt.title('RNN Confusion Matrix')
plt.show()

## LSTM

In [None]:
# Embedding -> single LSTM layer -> softmax over the 3 classes.
# The sequence length is declared once by the Input layer; Embedding's `input_length`
# argument is deprecated in Keras 3 and redundant here, so it is not passed.
lstm_model = Sequential()
lstm_model.add(Input(shape=(max_length,)))
lstm_model.add(Embedding(vocab_size, embedding_dim))
lstm_model.add(LSTM(128))
lstm_model.add(Dense(3, activation='softmax'))

In [None]:
lstm_model.compile(optimizer='adam',
                   loss='categorical_crossentropy',
                   metrics=['accuracy'])

In [None]:
lstm_model.summary()

In [None]:
# Train with the same early-stopping / learning-rate callbacks as the SimpleRNN run, so
# every model is compared under one training regime and evaluated on its best weights.
lstm_history = lstm_model.fit(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    epochs=15,
    batch_size=32,
    callbacks=callbacks)

In [None]:
learning_curves_plot(lstm_history, start_epoch=0)

### Evaluation

In [None]:
lstm_loss, lstm_accuracy = lstm_model.evaluate(X_test, y_test)
print("LSTM Loss: ", lstm_loss)
print("LSTM Test Accuracy: ", lstm_accuracy)

In [None]:
y_pred_lstm = np.argmax(lstm_model.predict(X_test), axis=-1)
y_true = np.argmax(y_test, axis=-1)

print(classification_report(y_true, y_pred_lstm, target_names= class_labels))

In [None]:
conf_matrix_lstm = confusion_matrix(y_true, y_pred_lstm)

plot_confusion_matrix(conf_matrix_lstm,
                      class_names= class_labels,
                      show_normed=True,
                      figsize=(8,6),
                      colorbar=True)

plt.title('LSTM Confusion Matrix')
plt.show()

## LSTM (MultiLayer and Bidirectional)

In [None]:
# Embedding -> two stacked bidirectional LSTM layers -> softmax over the 3 classes.
# The first recurrent layer returns the full sequence so the second one can consume it.
# The sequence length is declared once by the Input layer; Embedding's `input_length`
# argument is deprecated in Keras 3 and redundant here, so it is not passed.
bilstm_model = Sequential()
bilstm_model.add(Input(shape=(max_length,)))
bilstm_model.add(Embedding(vocab_size, embedding_dim))
bilstm_model.add(Bidirectional(LSTM(128, return_sequences=True)))
bilstm_model.add(Bidirectional(LSTM(64)))
bilstm_model.add(Dense(3, activation='softmax'))

In [None]:
bilstm_model.compile(optimizer='adam',
                     loss='categorical_crossentropy',
                     metrics=['accuracy'])

In [None]:
# Train with the same early-stopping / learning-rate callbacks as the SimpleRNN run, so
# every model is compared under one training regime and evaluated on its best weights.
bilstm_history = bilstm_model.fit(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    epochs=15,
    batch_size=32,
    callbacks=callbacks)

In [None]:
learning_curves_plot(bilstm_history, start_epoch=0)

### Evaluation

In [None]:
bilstm_loss, bilstm_accuracy = bilstm_model.evaluate(X_test, y_test)
print("Bi-LSTM Loss: ", bilstm_loss)
print("Bi-LSTM Test Accuracy: ", bilstm_accuracy)

In [None]:
y_pred_bilstm = np.argmax(bilstm_model.predict(X_test), axis=-1)
y_true = np.argmax(y_test, axis=-1)

print(classification_report(y_true, y_pred_bilstm, target_names= class_labels))

In [None]:
conf_matrix_bilstm = confusion_matrix(y_true, y_pred_bilstm)

plot_confusion_matrix(conf_matrix_bilstm,
                      class_names= class_labels,
                      show_normed=True,
                      figsize=(8,6),
                      colorbar=True)

plt.title('Bi-LSTM Confusion Matrix')
plt.show()

## GRU (MultiLayer and Bidirectional)

In [None]:
# Embedding -> two stacked bidirectional GRU layers -> softmax over the 3 classes.
# The first recurrent layer returns the full sequence so the second one can consume it.
# The sequence length is declared once by the Input layer; Embedding's `input_length`
# argument is deprecated in Keras 3 and redundant here, so it is not passed.
bigru_model = Sequential()
bigru_model.add(Input(shape=(max_length,)))
bigru_model.add(Embedding(vocab_size, embedding_dim))
bigru_model.add(Bidirectional(GRU(128, return_sequences=True)))
bigru_model.add(Bidirectional(GRU(64)))
bigru_model.add(Dense(3, activation='softmax'))

In [None]:
bigru_model.compile(optimizer='adam',
                    loss='categorical_crossentropy',
                    metrics=['accuracy'])

In [None]:
bigru_model.summary()

In [None]:
# Train with the same early-stopping / learning-rate callbacks as the SimpleRNN run, so
# every model is compared under one training regime and evaluated on its best weights.
bigru_history = bigru_model.fit(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    epochs=15,
    batch_size=32,
    callbacks=callbacks)

In [None]:
learning_curves_plot(bigru_history, start_epoch=0)

### Evaluation

In [None]:
bigru_loss, bigru_accuracy = bigru_model.evaluate(X_test, y_test)
print("Bi-GRU Loss: ", bigru_loss)
print("Bi-GRU Test Accuracy: ", bigru_accuracy)

In [None]:
y_pred_bigru = np.argmax(bigru_model.predict(X_test), axis=-1)
y_true = np.argmax(y_test, axis=-1)

print(classification_report(y_true, y_pred_bigru, target_names= class_labels))

In [None]:
conf_matrix_bigru = confusion_matrix(y_true, y_pred_bigru)

plot_confusion_matrix(conf_matrix_bigru,
                      class_names= class_labels,
                      show_normed=True,
                      figsize=(8,6),
                      colorbar=True)

plt.title('Bi-GRU Confusion Matrix')
plt.show()

## Conv1D

In [None]:
# Embedding -> 1D convolution (128 filters, width 5) -> max-pooling -> flatten -> softmax.
# The sequence length is declared once by the Input layer, which also gives Flatten a
# fully defined shape; Embedding's `input_length` argument is deprecated in Keras 3 and
# redundant here, so it is not passed.
conv1d_model = Sequential()
conv1d_model.add(Input(shape=(max_length,)))
conv1d_model.add(Embedding(vocab_size, embedding_dim))
conv1d_model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
conv1d_model.add(MaxPooling1D(pool_size=2))
conv1d_model.add(Flatten())
conv1d_model.add(Dense(3, activation='softmax'))

In [None]:
conv1d_model.compile(optimizer='adam',
                     loss='categorical_crossentropy',
                     metrics=['accuracy'])

In [None]:
conv1d_model.summary()

In [None]:
# Train with the same early-stopping / learning-rate callbacks as the SimpleRNN run, so
# every model is compared under one training regime and evaluated on its best weights.
conv1d_history = conv1d_model.fit(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    epochs=15,
    batch_size=32,
    callbacks=callbacks)

In [None]:
learning_curves_plot(conv1d_history, start_epoch=0)

### Evaluation

In [None]:
conv1d_loss, conv1d_accuracy = conv1d_model.evaluate(X_test, y_test)
print("Conv1D Loss: ", conv1d_loss)
print("Conv1D Test Accuracy: ", conv1d_accuracy)

In [None]:
y_pred_conv1d = np.argmax(conv1d_model.predict(X_test), axis=-1)
y_true = np.argmax(y_test, axis=-1)

print(classification_report(y_true, y_pred_conv1d, target_names= class_labels))

In [None]:
conf_matrix_conv1d = confusion_matrix(y_true, y_pred_conv1d)

plot_confusion_matrix(conf_matrix_conv1d,
                      class_names= class_labels,
                      show_normed=True,
                      figsize=(8,6),
                      colorbar=True)

plt.title('Conv1D Confusion Matrix')
plt.show()

# Thank you