In [None]:
import numpy as np  
import pandas as pd 
import os  
from PIL import Image  
from tensorflow.keras.applications.densenet import DenseNet121, preprocess_input
from tensorflow.keras.models import Model
from tqdm import tqdm


# Data preprocessing

This dataset contains chest X-ray reports and associated metadata from Indiana University. It includes two key files:

indiana_reports.csv: Contains patient-level information with fields like:
uid: Unique identifier for each patient or study.
findings: Detailed descriptions of the X-ray observations.
impression: A summarized conclusion based on the findings.
indication: Clinical reason for performing the X-ray.
comparison: Notes on any prior studies for comparison.
Additional fields such as MeSH and Problems for medical annotations.
indiana_projections.csv: Lists X-ray image file names and their projections (e.g., frontal, lateral).

In [None]:
import pandas as pd

# Read the report-level table from the Kaggle input directory
data = pd.read_csv('/kaggle/input/chest-xrays-indiana-university/indiana_reports.csv')

# Gather the per-column statistics first, then assemble them into one overview table
unique_counts = []
sample_values = []
for column_name in data.columns:
    column = data[column_name]
    unique_counts.append(column.nunique())
    sample_values.append(column.dropna().iloc[0])  # first non-null entry of the column

summary = pd.DataFrame({
    'Column': data.columns,
    'Non-Null Count': data.notnull().sum().values,
    'Data Type': data.dtypes.values,
    'Unique Values': unique_counts,
    'Sample Value': sample_values,
})

print("Dataset Summary Table:")
display(summary)

print("\nExample Rows:")
display(data.head(8))



The data consist of images and their associated reports.
A report can apply to one or more images, because a patient may have several images taken (frontal/lateral).


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the per-image projection table
data = pd.read_csv('/kaggle/input/chest-xrays-indiana-university/indiana_projections.csv')

# Number of image files listed for each report (uid)
image_counts = data.groupby('uid')['filename'].count()

# Number of reports that have 1, 2, 3, ... images, ordered by image count
image_count_distribution = image_counts.value_counts().sort_index()

# Bar chart of that distribution, drawn through the explicit figure/axes interface
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(image_count_distribution.index, image_count_distribution.values, width=0.6, edgecolor='black')
ax.set_xlabel('Number of Images per Report', fontsize=14)
ax.set_ylabel('Number of Reports', fontsize=14)
ax.set_title('Distribution of Images Associated with Each Report', fontsize=16)
ax.set_xticks(image_count_distribution.index)
ax.tick_params(axis='x', labelsize=12)
ax.tick_params(axis='y', labelsize=12)
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()



## Overview
The dataset consists of chest X-ray images and the corresponding radiology reports. Each report is linked to one or more images, including projections such as frontal and lateral views. As the graph shows, most reports are linked to two images, and fewer to one, three or four. The goal is to associate each report with exactly two images:
- If there is only one image, it will be duplicated.
- If there are more than two images, we prioritize selecting one frontal and one lateral view. If both are not available, the first two images will be selected.

## Steps
1. **Merge Data**: Combine the `indiana_projections.csv` and `indiana_reports.csv` files using the `uid` column.
2. **Image Selection**:
   - For each report (`uid`), determine the available projections.
   - Select one frontal and one lateral view, if available.
   - If only one image exists, duplicate it.
   - If more than two images exist but only one type of projection, pick the first two images.
3. **Save Processed Dataset**: The resulting dataset is saved as `formatted_dataset.csv`.

## Code
The following code implements the above logic:

 


In [None]:
import pandas as pd
from IPython.display import display

# Read the per-report text table and the per-image projection table
reports_df = pd.read_csv('/kaggle/input/chest-xrays-indiana-university/indiana_reports.csv')
projections_df = pd.read_csv('/kaggle/input/chest-xrays-indiana-university/indiana_projections.csv')

# Join on 'uid' (pandas' default inner join) so every image row carries its report fields
data = projections_df.merge(reports_df, on='uid')

# Function to select exactly two images per report and format the output
def select_images_for_csv(group):
    """Reduce all image rows of one report to a single two-image record.

    Args:
        group: DataFrame holding the rows of one `uid`; the columns read are
            'uid', 'filename', 'projection' and 'findings'.

    Returns:
        dict with keys "Person_id", "Image1", "Image2" and "Report".
        - one row only: the same file name is used for both images;
        - at least one 'Frontal' and one 'Lateral' row: the first of each is used;
        - otherwise: the file names of the first two rows are used.
    """
    frontal_rows = group.loc[group['projection'].eq('Frontal')]
    lateral_rows = group.loc[group['projection'].eq('Lateral')]
    first_row = group.iloc[0]

    if len(group) == 1:
        # A single image is duplicated so every record has two image slots
        first_image = second_image = first_row["filename"]
    elif len(frontal_rows) > 0 and len(lateral_rows) > 0:
        first_image = frontal_rows.iloc[0]["filename"]
        second_image = lateral_rows.iloc[0]["filename"]
    else:
        # Only one projection type is present: fall back to the first two rows
        first_image = first_row["filename"]
        second_image = group.iloc[1]["filename"]

    return {
        "Person_id": first_row["uid"],
        "Image1": first_image,
        "Image2": second_image,
        "Report": first_row["findings"],
    }

# Build one output record per report.
# The groups are iterated directly instead of going through GroupBy.apply: recent pandas
# versions deprecate (and later exclude) the grouping column inside apply, which would
# remove 'uid' from each group and break the `group.iloc[0]["uid"]` lookup in
# select_images_for_csv. Iteration keeps 'uid' in every group and preserves the
# sorted-by-uid order that apply produced.
output_data = [select_images_for_csv(group) for _, group in data.groupby('uid')]
output_df = pd.DataFrame(output_data)

# Save the new CSV file
output_df.to_csv('formatted_dataset.csv', index=False)

print("Formatted dataset saved as 'formatted_dataset.csv'.")


# Load the dataset back from disk so the preview shows exactly what was written
formatted_data = pd.read_csv('formatted_dataset.csv')

# Display the first 10 rows of the dataset as a visually pleasing table
display(formatted_data.head(10))

### preprocess the data

We need to remove all numbers and stop words from the text data. We also convert all text to lowercase and expand contractions (decontraction) in each report.

In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords

# Load stop words
stop_words = set(stopwords.words('english'))

# Define preprocessing functions
def lowercase(text):
    '''Lower-cases every string entry; any non-string entry becomes an empty string.'''
    return [entry.lower() if isinstance(entry, str) else "" for entry in text]

def decontractions(text):
    '''Expands English contractions in every string entry.

    Non-string entries are passed through unchanged.
    '''
    # Applied strictly in this order: the irregular forms come first so the
    # generic suffix rules below cannot mangle them.
    rules = (
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        (r"couldn\'t", "could not"),
        (r"shouldn\'t", "should not"),
        (r"wouldn\'t", "would not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    expanded = []
    for phrase in text:
        if isinstance(phrase, str):
            for pattern, replacement in rules:
                phrase = re.sub(pattern, replacement, phrase)
        expanded.append(phrase)
    return expanded

def rem_punctuations(text):
    '''Strips the listed punctuation characters and collapses runs of whitespace.

    The full stop is not in the list, so sentence boundaries are kept.
    Non-string entries become an empty string.
    '''
    punctuations = '''!()-[]{};:'"\\,<>/?@#$%^&*~'''
    # Translation table that deletes every listed character in a single pass
    strip_table = str.maketrans('', '', punctuations)
    cleaned = []
    for line in text:
        if not isinstance(line, str):
            cleaned.append("")
            continue
        cleaned.append(' '.join(line.translate(strip_table).split()))
    return cleaned

def rem_numbers(text):
    '''Removes digits and placeholder runs of "x" (such as "xxxx") from each entry.

    Args:
        text: iterable of report strings; non-string entries become "".

    Returns:
        list of cleaned strings, same length as the input. Whitespace left behind
        by the removals is not collapsed here.

    Only runs of two or more consecutive "x" are removed. The earlier pattern
    r'x*' also matched single letters, so it deleted every "x" in the text and
    corrupted ordinary words ("pneumothorax" -> "pneumothora", "xray" -> "ray").
    '''
    new_text = []
    for line in text:
        if isinstance(line, str):  # Check if the line is a string
            temp = re.sub(r'x{2,}', '', line)
            new_text.append(re.sub(r'\d', '', temp))
        else:
            new_text.append("")
    return new_text

def rem_stopwords(text):
    '''Drops stop words from each entry while always keeping the negations "no" and "not".

    Relies on the module-level `stop_words` set. Non-string entries become "".
    '''
    negations = {'no', 'not'}

    def keep(word):
        # A word survives if it is a negation or is not a stop word
        return word in negations or word not in stop_words

    filtered = []
    for line in text:
        if not isinstance(line, str):
            filtered.append("")
            continue
        filtered.append(' '.join(filter(keep, line.split())))
    return filtered

def text_preprocessing(text):
    '''Runs the full cleaning pipeline over a collection of reports.

    Steps, in order: lowercase, decontractions, rem_punctuations,
    rem_numbers, rem_stopwords. Returns the list produced by the last step.
    '''
    pipeline = (lowercase, decontractions, rem_punctuations, rem_numbers, rem_stopwords)
    new_text = text
    for step in pipeline:
        new_text = step(new_text)
    return new_text

# Read the two-images-per-report table written by the previous step
df = pd.read_csv('/kaggle/working/formatted_dataset.csv')

# Missing findings become empty strings before the text pipeline runs
reports_filled = df['Report'].fillna("")

# Replace the 'Report' column with its cleaned version
df['Report'] = text_preprocessing(reports_filled)

# Persist the cleaned table
df.to_csv('preprocessed_dataset2.csv', index=False)

print("Preprocessing complete. Dataset saved as 'preprocessed_dataset2.csv'.")


#### we will remove rows with empty report field 

In [None]:
import pandas as pd

# Read the cleaned-text table
file_path = '/kaggle/working/preprocessed_dataset2.csv'
df = pd.read_csv(file_path)

# Row count before filtering
initial_count = len(df)

# A report is kept only if it is present and not blank after stripping whitespace
has_text = df['Report'].notnull() & (df['Report'].str.strip() != '')
df_cleaned = df[has_text]

# Row count after filtering, and the difference between the two
final_count = len(df_cleaned)
rows_removed = initial_count - final_count

print(f"Number of rows removed: {rows_removed}")

# Persist the filtered table
df_cleaned.to_csv('cleaned_preprocessed_dataset2.csv', index=False)


### Final step

For sequence-modelling tasks we add the tokens 'startseq' and 'endseq' to each report. They tell the model where a report starts and ends, which is critical for tasks such as text generation or captioning.

In [None]:
import pandas as pd

# Load the dataset
file_path = '/kaggle/working/cleaned_preprocessed_dataset2.csv'  # Path to your file
df = pd.read_csv(file_path)

# Define the remodelling function
def remodelling(x):
    '''Wraps the text form of `x` between the 'startseq' and 'endseq' markers.'''
    return ' '.join(('startseq', str(x), 'endseq'))

# Add the boundary tokens to every report
df['Report'] = df['Report'].apply(remodelling)

# Persist the token-wrapped table
df.to_csv('/kaggle/working/Ready_dataset2.csv', index=False)

print("Start and end tokens added. Modified dataset saved as '/kaggle/working/Ready_dataset2.csv'.")


We will split our data into training, validation and test sets (70% / 15% / 15%). The validation set is a fixed hold-out split, not k-fold cross-validation.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the token-wrapped dataset
df = pd.read_csv('/kaggle/working/Ready_dataset2.csv')

# First cut: 70% train, 30% held back.
# Second cut: the held-back part is halved into validation and test (15% each).
train, temp = train_test_split(df, test_size=0.3, random_state=42)
val, test = train_test_split(temp, test_size=0.5, random_state=42)

# Write each split to '<name>.csv' in the working directory
for split_name, split_frame in (('train', train), ('val', val), ('test', test)):
    split_frame.to_csv(f'{split_name}.csv', index=False)


We have now done all the text preprocessing needed and have our final training, validation and test splits.

Next, we make sure that the images are in a suitable form to feed to the model for feature extraction.

Implement an Encoder-Decoder architecture:
Encoder: Use a CNN (CheXNet) which produces a context vector by taking in our image features.
Decoder: Use an RNN ( LSTM or GRU) .
and use beam search to generate the reports

### Obtaining Image Features


Images and reports are the inputs to the model. We convert every image into a fixed-size vector that can be fed to the model, and we use transfer learning to do so.



We will use the pre-trained CheXNet model to extract features from the images. CheXNet is a 121-layer convolutional neural network trained on chest X-rays to classify 14 diseases. Our purpose, however, is not to classify the images but to obtain a feature vector for each image, so we discard the final classification layer.

In [None]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.applications import densenet
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from PIL import Image
from skimage.transform import resize
import pickle
import warnings

# Suppress warnings
warnings.filterwarnings('ignore')


### Load Datasets

In [None]:
# Read the three pre-computed splits from the uploaded input dataset
SPLIT_DIR = '/kaggle/input/chstx-extracted-features-and-other-files/train-test-cv_split/'
train_data = pd.read_csv(SPLIT_DIR + 'train.csv')
val_data = pd.read_csv(SPLIT_DIR + 'val.csv')
test_data = pd.read_csv(SPLIT_DIR + 'test.csv')

# Report the size of each split, then preview the training rows
for split_label, split_frame in (("Train", train_data), ("Validation", val_data), ("Test", test_data)):
    print(f"{split_label} Data Shape:", split_frame.shape)
train_data.head()


### preprocess Datasets

In [None]:
BASE_PATH = '/kaggle/input/chest-xrays-indiana-university/images/images_normalized/'


In [None]:
def load_image(img_path):
    """Load one X-ray file and return it as a preprocessed batch of size 1.

    Args:
        img_path: image file name relative to BASE_PATH.

    Returns:
        Array of shape (1, 224, 224, 3) after `densenet.preprocess_input`.

    Note:
        Features cached with the earlier version of this function were computed
        from doubly scaled pixels and should be regenerated.
    """
    # Prepend the base path to the given image path
    full_path = BASE_PATH + img_path
    # The context manager closes the file handle once the pixels are copied out
    with Image.open(full_path) as image:
        img_array = np.asarray(image.convert("RGB"))
    # preserve_range=True keeps pixels on the 0-255 scale. By default skimage
    # rescales uint8 input to [0, 1]; preprocess_input then divides by 255 again,
    # so the network would see values about 255 times too small.
    img_array = resize(img_array, (224, 224, 3), preserve_range=True)  # DenseNet input size
    img_array = np.expand_dims(img_array, axis=0)  # Add batch dimension
    return densenet.preprocess_input(img_array)


### download cheXnet weights

we have already downloaded the weights and saved it in the data set we created(to avoid repeating runs)

### Load Pretrained CheXNet Model

We add a global average pooling layer at the end of the backbone (`pooling="avg"`). Pooling averages each feature map over all spatial positions, so every image is reduced to a single fixed-size vector that describes its overall patterns rather than where they occur. This makes the extracted features more compact and flexible. Note that pooling acts within each image: during feature extraction the two image vectors are still concatenated in a fixed order (Image1, then Image2), so this layer does not by itself remove the ordering of the two images.

In [None]:
# Build the DenseNet-121 backbone without its ImageNet top and without pre-loaded
# weights (weights=None). pooling="avg" appends global average pooling, so the
# backbone outputs one vector per image.
chexNet = densenet.DenseNet121(include_top=False, weights=None, input_shape=(224, 224, 3), pooling="avg")

# Attach a 14-unit sigmoid "predictions" head. It is not used for feature extraction;
# presumably it is needed so the layer layout matches the CheXNet weight file loaded below.
X = chexNet.output
X = tf.keras.layers.Dense(14, activation="sigmoid", name="predictions")(X)
model = tf.keras.Model(inputs=chexNet.input, outputs=X)

# Load pre-trained weights for CheXNet
model.load_weights('/kaggle/input/chstx-extracted-features-and-other-files/CheXNet_Keras_weights.h5')

# Feature extractor: same input, output taken from the layer just before the
# "predictions" head (model.layers[-2]), so the classification head is dropped again.
feature_extractor = tf.keras.Model(inputs=model.input, outputs=model.layers[-2].output)
feature_extractor.summary()


### feature extraction

no need to run it each time(the features were saved and uploaded as input for faster run)

In [None]:
from tqdm.notebook import tqdm  

# Feature extraction process
def extract_features(data):
    """Compute one combined feature array per row of `data`.

    Each row is unpacked as (key, image 1 file name, image 2 file name, report).
    Both images go through `load_image` and `feature_extractor`, and the two
    outputs are concatenated along axis 1.

    Returns:
        dict mapping each row's key to its combined feature array.
    """
    features = {}
    shape_reported = False
    rows = tqdm(data.itertuples(index=False), total=len(data), desc="Processing Images")

    for key, img1, img2, _report in rows:
        # Image 1 is processed before image 2, and that order is kept in the output
        per_image = [
            feature_extractor.predict(load_image(file_name), verbose=0)
            for file_name in (img1, img2)
        ]
        combined_features = np.concatenate(per_image, axis=1)
        features[key] = combined_features

        # Report the shape once, for the first row only
        if not shape_reported:
            print(f"First Feature Vector Shape for Key '{key}': {combined_features.shape}")
            shape_reported = True

    return features

# Features of all three splits, keyed by 'train' / 'val' / 'test'
all_features = {}

# (dictionary key, wording used in the progress message, source frame)
for split_key, split_label, split_frame in (
    ('train', 'training', train_data),
    ('val', 'validation', val_data),
    ('test', 'test', test_data),
):
    print(f"Extracting features for the {split_label} set...")
    all_features[split_key] = extract_features(split_frame)

# Write everything into a single pickle file
print("Saving all extracted features into one file...")
with open('all_features2.pickle', 'wb') as f:
    pickle.dump(all_features, f)

print("Feature extraction and saving completed!")


verification for quality

In [None]:
import pickle
import numpy as np

# File paths
features_file = '/kaggle/input/chstx-extracted-features-and-other-files/all_features2.pickle'  
train_csv = '/kaggle/input/chstx-extracted-features-and-other-files/train-test-cv_split/train.csv'  
val_csv = '/kaggle/input/chstx-extracted-features-and-other-files/train-test-cv_split/val.csv'     
test_csv = '/kaggle/input/chstx-extracted-features-and-other-files/train-test-cv_split/test.csv'   

# Load the datasets
train_data = pd.read_csv(train_csv)
val_data = pd.read_csv(val_csv)
test_data = pd.read_csv(test_csv)

# Load the extracted features
with open(features_file, 'rb') as f:
    all_features = pickle.load(f)

# Quality checks
def check_features():
    """Sanity-check the loaded feature dictionaries against the split CSVs.

    Reads the module-level `all_features`, `train_data`, `val_data` and `test_data`.
    For each split it compares the number of stored keys with the number of CSV
    rows and inspects the first stored feature array (shape, all-zero, NaN).
    Only that first array is inspected, not every entry.

    Returns:
        dict mapping split name to a dict with keys 'num_keys', 'expected',
        'feature_shape' (None for an empty split) and 'issues' (list of the
        warning messages printed for that split). Earlier this function always
        returned an empty dict because the results were never recorded.
    """
    results = {}
    for dataset_name, data, csv_data in zip(
        ['train', 'val', 'test'],
        [all_features['train'], all_features['val'], all_features['test']],
        [train_data, val_data, test_data]
    ):
        print(f"\n--- Checking {dataset_name} dataset ---")
        issues = []

        def warn(message):
            # Print the warning and keep it for the returned summary
            issues.append(message)
            print(f"WARNING: {message}")

        # Check the number of keys
        num_keys = len(data)
        expected_samples = len(csv_data)
        print(f"Number of keys: {num_keys}, Expected: {expected_samples}")
        if num_keys != expected_samples:
            warn(f"Mismatch in number of keys for {dataset_name}!")

        feature_shape = None
        # Check the feature vector shape
        if num_keys > 0:
            key_to_check = list(data.keys())[0]
            feature_values = data[key_to_check]
            feature_shape = feature_values.shape
            print(f"Feature vector shape for key '{key_to_check}': {feature_shape}")
            if feature_shape != (1, 2048):
                warn(f"Feature vector shape is incorrect for {dataset_name}!")

            # Check feature vector values
            if np.all(feature_values == 0):
                warn(f"All feature values are zero for key '{key_to_check}'!")
            elif np.isnan(feature_values).any():
                warn(f"Feature values contain NaN for key '{key_to_check}'!")
            else:
                print(f"Feature values look normal for key '{key_to_check}'.")

        else:
            warn(f"{dataset_name} dataset is empty!")

        results[dataset_name] = {
            'num_keys': num_keys,
            'expected': expected_samples,
            'feature_shape': feature_shape,
            'issues': issues,
        }

    return results

# Run checks
check_features()


## prepare text data

Even though we have already preprocessed the text, we still need to convert it into vectors.

#### Segregate the Dataset

This step splits the data into:

X: image feature IDs (references to the extracted image features).

y: Medical reports (text).

In [None]:
# Inputs are the report IDs (keys into the feature dictionary); targets are the report texts
X_train, y_train = train_data['Person_id'], train_data['Report']
X_val, y_val = val_data['Person_id'], val_data['Report']
X_test, y_test = test_data['Person_id'], test_data['Report']

# Print the number of inputs and targets per split
for split_label, ids, reports in (
    ("Training", X_train, y_train),
    ("Validation", X_val, y_val),
    ("Test", X_test, y_test),
):
    print(f"{split_label} Set - X: {len(ids)}, y: {len(reports)}")


#### Tokenize the Text Data

Converts text reports into numeric tokens.
Vocabulary size: Total unique words in y_train

The fit_on_texts method reads all the text data and creates a vocabulary of unique tokens.

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Initialize and fit the tokenizer on the training set reports
tokenizer = Tokenizer(filters='!"#$%&()*+,-/:;<=>?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(y_train.values)

print(f"Vocabulary Size: {len(tokenizer.word_index)}")
print(f"Example Tokenized Report: {tokenizer.texts_to_sequences([y_train.values[0]])}")


##### Define Padding Size

Sets a maximum length for all sequences.

In [None]:
# Define maximum sequence length for padding
padding_size = 153  # Maximum report length
vocab_size = len(tokenizer.word_index.keys()) + 1  # Add 1 for padding token

print(f"Padding Size: {padding_size}, Vocabulary Size: {vocab_size}")

#### Load Pre-trained GloVe Vectors

Loads pre-trained GloVe embeddings (e.g., 300-dimensional vectors).

In [None]:
import pickle

# Load GloVe vectors
with open('/kaggle/input/chstx-extracted-features-and-other-files/glove_vectors300d.pickle', 'rb') as f:  
    glove_vectors = pickle.load(f)

print(f"Sample GloVe Vector for 'lungs': {glove_vectors.get('lungs', 'Not found')}")


#### Create the Embedding Matrix  

Embedding matrix: Maps tokens to their GloVe vectors.

In [None]:
import numpy as np

# One 300-dimensional row per token id; rows stay all-zero for words without a GloVe vector
embedding_matrix = np.zeros((vocab_size, 300))

# Copy the GloVe vector of every vocabulary word into the row given by its token id
for word, i in tokenizer.word_index.items():
    if word not in glove_vectors:
        continue
    embedding_matrix[i] = glove_vectors[word]

print(f"Shape of Embedding Matrix: {embedding_matrix.shape}")
lungs_row = tokenizer.word_index['lungs']
print(f"Sample Embedding Vector: {embedding_matrix[lungs_row]}")


save (Tokenizer, Padding Size, Vocabulary Size, and Embedding Matrix)for later use 

In [None]:
import pickle
import numpy as np

# Save Tokenizer, Padding Size, Vocab Size, and Embedding Matrix together
save_data = {
    'tokenizer': tokenizer,
    'padding_size': padding_size,
    'vocab_size': vocab_size,
    'embedding_matrix': embedding_matrix
}

with open('/kaggle/working/tokenizer_embedding_config.pickle', 'wb') as handle:
    pickle.dump(save_data, handle, protocol=pickle.HIGHEST_PROTOCOL)

print("Tokenizer, padding size, vocab size, and embedding matrix saved successfully.")


#### dataset generator

In [None]:
# Define the batch size
BATCH_SIZE = 12

In [None]:
# Load the extracted features from the pickle file
with open('/kaggle/input/chstx-extracted-features-and-other-files/all_features2.pickle', 'rb') as f:
    cheXnet_Features = pickle.load(f)

In [None]:
def load_image(id_, report, dataset_name="train"):
    """
    Look up the cached feature for one report ID in cheXnet_Features.

    Returns (first row of the stored feature array, report) when the ID is found
    in the `dataset_name` split, otherwise prints a warning and returns (None, None).

    Note: this definition replaces the earlier `load_image(img_path)` that read
    image files; after this cell runs, the name refers to the feature lookup.
    """
    id_int = int(id_)  # IDs are looked up as integers
    img_feature = cheXnet_Features[dataset_name].get(id_int)
    if img_feature is not None:
        return img_feature[0], report
    print(f"Warning: No feature found for ID: {id_int} in {dataset_name} dataset. Skipping...")
    return None, None

Verification(locate possible errors)

In [None]:
# Function to test load_image exhaustively
def test_load_image():
    """Smoke-test `load_image` on the first training rows and on two bad inputs.

    `load_image` reports a missing feature by returning (None, None); it does not
    raise. Each check therefore inspects the return value, and exceptions are
    caught only where the int() conversion inside `load_image` can raise.
    Results are printed; nothing is returned.
    """
    print("\n--- Verifying load_image Function ---")
    # Testing at most the first 5 IDs for brevity (fewer if the split is smaller)
    for idx in range(min(5, len(train_data))):
        sample_id = str(train_data['Person_id'].iloc[idx])  # Ensure the ID is treated as a string
        sample_report = train_data['Report'].iloc[idx]
        print(f"\nTesting ID: {sample_id}, Report: {sample_report[:50]}...")

        # Try loading the feature
        try:
            sample_feature, sample_report_returned = load_image(sample_id, sample_report)
        except ValueError as e:
            print(f"Error: {e}")
            continue
        if sample_feature is None:
            # Without this guard, `.shape` below would raise AttributeError
            print(f"Error: no feature returned for ID {sample_id}")
            continue
        print(f"Feature Shape: {sample_feature.shape}")
        print(f"Report Match: {sample_report == sample_report_returned}")

    # Test for a missing ID: the expected outcome is the (None, None) sentinel
    print("\nTesting for Missing ID...")
    invalid_id = "9999999"  # Non-existent ID
    missing_feature, missing_report = load_image(invalid_id, "Test Report")
    if missing_feature is None and missing_report is None:
        print("Handled Missing ID Correctly: load_image returned (None, None)")
    else:
        print(f"Unexpected: a feature was returned for ID {invalid_id}")

    # Test for an unexpected ID type
    print("\nTesting for Unexpected ID Type...")
    invalid_type_id = 123.456  # Float instead of string/int
    try:
        float_feature, _ = load_image(invalid_type_id, "Test Report")
    except Exception as e:
        print(f"Handled Unexpected ID Type: {e}")
    else:
        # int() truncates a float instead of raising, so no exception is expected here
        print(
            f"No exception raised: ID {invalid_type_id} was truncated to {int(invalid_type_id)} "
            f"(feature found: {float_feature is not None})"
        )

# Run the test
test_load_image()


In [None]:
def dataset_generator(img_name, caption, dataset_name, batch_size=2, shuffle_buffer=500):
    """
    Creates a tf.data.Dataset that pairs cached image features with their reports.

    Args:
        img_name: array of report IDs used to look up the features.
        caption: array of report texts, aligned with `img_name`.
        dataset_name: split whose features are used ("train", "val" or "test").
        batch_size: number of samples per batch. The default of 2 is the value
            that was previously hard-coded while investigating slow runs; pass
            another value (for example BATCH_SIZE) to change it.
        shuffle_buffer: size of the shuffle buffer (previously hard-coded to 500).

    Returns:
        A shuffled, batched and prefetched dataset of (features, reports).
        IDs without a cached feature are dropped.
    """

    def map_function(item1, item2):
        # Runs inside tf.numpy_function, so it receives and returns NumPy values
        img_feature, report = load_image(item1, item2, dataset_name)
        if img_feature is None:
            # Zero-length placeholder; removed again by the filter below
            return np.zeros((0,), dtype=np.float32), b""
        return img_feature, report

    dataset = tf.data.Dataset.from_tensor_slices((img_name, caption))

    # Look up the features in parallel
    dataset = dataset.map(
        lambda item1, item2: tf.numpy_function(map_function, [item1, item2], [tf.float32, tf.string]),
        num_parallel_calls=tf.data.experimental.AUTOTUNE
    )

    # Filter out invalid (empty) entries
    dataset = dataset.filter(lambda x, y: tf.not_equal(tf.size(x), 0))

    # Shuffle and batch the dataset
    dataset = (
        dataset
        .shuffle(shuffle_buffer)
        .batch(batch_size)
        .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    )

    return dataset


In [None]:
def verify_generator(generator, dataset_name, num_samples=3):
    """
    Prints the feature shape and the first report of the first `num_samples` batches.

    `generator` must provide `.take(n)` yielding (features, reports) pairs, where
    `reports.numpy()` returns UTF-8 encoded byte strings.
    """
    print(f"\n--- Verifying {dataset_name} Generator ---")
    batch_number = 0
    for features, reports in generator.take(num_samples):
        batch_number += 1
        print(f"Batch {batch_number}:")
        print(f" - Feature Shape: {features.shape}")
        first_report = reports.numpy()[0].decode('utf-8')
        print(f" - Sample Report: {first_report}")
    print(f"{dataset_name} generator verification completed.")


In [None]:
# Build one dataset per split from its ID and report columns
train_generator, val_generator, test_generator = (
    dataset_generator(split_frame['Person_id'].values, split_frame['Report'].values, split_key)
    for split_frame, split_key in ((train_data, "train"), (val_data, "val"), (test_data, "test"))
)

# Print a few batches from each dataset as a sanity check
for split_generator, split_label in (
    (train_generator, "train"),
    (val_generator, "validation"),
    (test_generator, "test"),
):
    verify_generator(split_generator, split_label)


Extra verification(irrelevant)

In [None]:
# Function to check tokenization consistency
def verify_tokenizer(tokenizer, sample_texts, num_samples=10):
    """Prints a round trip (text -> token ids -> text) for a few sample texts.

    Also prints the token id of a fixed list of words. Token ids that have no
    entry in the vocabulary are shown as "?" in the reconstructed text.
    """
    print("\n--- Tokenizer Verification ---")

    # Reverse mapping: token id back to word
    id_to_word = {token_id: word for word, token_id in tokenizer.word_index.items()}

    # Show each sample with its token sequence and the text rebuilt from it
    for text in sample_texts[:num_samples]:
        tokens = tokenizer.texts_to_sequences([text])[0]
        print(f"\nOriginal Text: {text}")
        print(f"Tokenized Sequence: {tokens}")
        reconstructed_text = ' '.join(id_to_word.get(token, "?") for token in tokens)
        print(f"Reconstructed Text: {reconstructed_text}")

    # Spot check for specific words
    print("\nSpecific Word Token Mapping:")
    for word in ('heart', 'lungs', 'effusion', 'normal'):
        token = tokenizer.word_index.get(word)
        if not token:
            print(f"Word '{word}' not found in tokenizer vocabulary!")
            continue
        print(f"'{word}' → Token ID: {token}")

# Sample text data for verification
sample_reports = y_train.sample(5).values  # Randomly sample 5 reports from the training set

# Run verification
verify_tokenizer(tokenizer, sample_reports)


##### Bytes to string

This function converts byte-encoded report text data (output by the dataset generator) back into standard string format.  
TensorFlow's `tf.numpy_function` outputs text data in bytes, which must be decoded to strings for tokenization and model input.

In [None]:
def bytes_to_string(arr):
    """Decodes every UTF-8 bytes element of `arr` in place and returns `arr` itself."""
    for position, raw in enumerate(arr):
        arr[position] = raw.decode('utf-8')  # overwrite the slot with the decoded text
    return arr


In [None]:
def convert(images, reports):
    """
    Expands a batch into next-word training samples.

    For every report, each prefix of its token sequence becomes one sample:
    the image feature, the prefix (input) and the one-hot encoded next token (output).
    Words missing from the tokenizer vocabulary are encoded as 0. Uses the
    module-level `tokenizer`, `vocab_size` and `padding_size`.

    Returns:
        (image features, prefixes post-padded to `padding_size`, one-hot next tokens),
        all as NumPy arrays with one row per sample.
    """
    imgs, in_reports, out_reports = [], [], []
    word_to_id = tokenizer.word_index

    for i, image in enumerate(images):
        # Token ids of the report's whitespace-separated words
        sequence = [word_to_id.get(word, 0) for word in reports[i].split()]

        # One sample per split point: tokens before `cut` predict the token at `cut`
        for cut in range(1, len(sequence)):
            next_word = tf.keras.utils.to_categorical(sequence[cut], num_classes=vocab_size)
            imgs.append(image)
            in_reports.append(sequence[:cut])
            out_reports.append(next_word)

    # Pad input sequences to have uniform length
    in_reports_padded = tf.keras.preprocessing.sequence.pad_sequences(
        in_reports, maxlen=padding_size, padding='post'
    )

    return np.array(imgs), np.array(in_reports_padded), np.array(out_reports)


verification

In [None]:
# Take a sample batch from the train_generator
for img_batch, report_batch in train_generator.take(1):
    # Decode byte reports back to text
    decoded_reports = bytes_to_string(report_batch.numpy())

    # Convert the batch into input-output pairs for training
    imgs, in_reports, out_reports = convert(img_batch.numpy(), decoded_reports)

    # Verification Outputs
    print(f"Image Batch Shape: {imgs.shape}")
    print(f"Input Reports Shape: {in_reports.shape}")
    print(f"Output Reports Shape (One-Hot): {out_reports.shape}")

    # Show a sample report with input-output pairs
    print(f"\nSample Decoded Report:\n{decoded_reports[0]}")
    print(f"\nSample Input Sequence:\n{in_reports[0]}")
    print(f"\nSample Output Word (One-Hot):\n{out_reports[0]}")
    
    # Verify the One-Hot Vector
    print(f"\nSum of One-Hot Vector (should be 1): {np.sum(out_reports[0])}")
    print(f"Index of '1' in One-Hot Vector: {np.argmax(out_reports[0])}")


Now that we have verified everything we will start working on the model