Loading Annotations and Preprocessing Images

In [2]:
import os 
import json 
import cv2 
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split 
import matplotlib.pyplot as plt
import gc
import torch
import warnings
warnings.filterwarnings("ignore")

In [3]:
def clear_gpu_memory(): 
    torch.cuda.empty_cache() 
    gc.collect()

In [4]:
annotation_path = r'/kaggle/input/iuxray/iu_xray/annotation.json'
with open(annotation_path, 'r') as f: 
    annotations = json.load(f)

In [5]:
print("Annotation keys:", annotations.keys()) 
print("First record in train:", annotations['train'][0]) 
print("First record in test:", annotations['test'][0]) 
print("First record in val:", annotations['val'][0])

Annotation keys: dict_keys(['train', 'val', 'test'])
First record in train: {'id': 'CXR2384_IM-0942', 'report': 'The heart size and pulmonary vascularity appear within normal limits. A large hiatal hernia is noted. The lungs are free of focal airspace disease. No pneumothorax or pleural effusion is seen. Degenerative changes are present in the spine.', 'image_path': ['CXR2384_IM-0942/0.png', 'CXR2384_IM-0942/1.png'], 'split': 'train'}
First record in test: {'id': 'CXR3030_IM-1405', 'report': 'Normal cardiomediastinal silhouette. There is no focal consolidation. There are no XXXX of a large pleural effusion. There is no pneumothorax. There is no acute bony abnormality seen.', 'image_path': ['CXR3030_IM-1405/0.png', 'CXR3030_IM-1405/1.png'], 'split': 'test'}
First record in val: {'id': 'CXR2279_IM-0865', 'report': 'Heart size is enlarged. The aorta is unfolded. Otherwise the mediastinal contour is normal. There are streaky bibasilar opacities. There are no nodules or masses. No visible p

In [6]:
data = [] 
for split in ['train', 'test', 'val']: 
    for record in annotations[split]: 
        for image_path in record['image_path']: 
            full_image_path = os.path.join(r'/kaggle/input/iuxray/iu_xray/images', image_path) 
            data.append({ 
                'id': record['id'], 
                'report': record['report'], 
                'image_path': full_image_path, 
                'split': split 
            })

In [7]:
df = pd.DataFrame(data)

In [8]:
print("First few rows of the dataframe:") 
print(df.head())

First few rows of the dataframe:
                id                                             report  \
0  CXR2384_IM-0942  The heart size and pulmonary vascularity appea...   
1  CXR2384_IM-0942  The heart size and pulmonary vascularity appea...   
2  CXR2926_IM-1328  Cardiac and mediastinal contours are within no...   
3  CXR2926_IM-1328  Cardiac and mediastinal contours are within no...   
4  CXR1451_IM-0291  Left lower lobe calcified granuloma. Heart siz...   

                                          image_path  split  
0  /kaggle/input/iuxray/iu_xray/images/CXR2384_IM...  train  
1  /kaggle/input/iuxray/iu_xray/images/CXR2384_IM...  train  
2  /kaggle/input/iuxray/iu_xray/images/CXR2926_IM...  train  
3  /kaggle/input/iuxray/iu_xray/images/CXR2926_IM...  train  
4  /kaggle/input/iuxray/iu_xray/images/CXR1451_IM...  train  


In [9]:
missing_files = [] 
for index, row in df.iterrows(): 
    if not os.path.exists(row['image_path']): 
        missing_files.append(row['image_path'])

In [10]:
print(f"Number of missing files: {len(missing_files)}") 
print("Sample missing files:", missing_files[:5])

Number of missing files: 0
Sample missing files: []


In [11]:
def preprocess_image(image_path): 
    if not os.path.exists(image_path): 
        print(f"Image not found: {image_path}") 
        return None 
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE) 
    if image is None: 
        print(f"Failed to read image: {image_path}") 
        return None 
    image = cv2.resize(image, (224, 224)) # Resize to 224x224 (typical for CNNs) 
    image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB) # Convert to 3-channel (RGB) 
    image = image / 255.0 # Normalize 
    return image

In [12]:
df['image'] = df['image_path'].apply(preprocess_image)

In [13]:
df = df.dropna(subset=['image'])

In [14]:
train_df = df[df['split'] == 'train'] 
val_df = df[df['split'] == 'val'] 
test_df = df[df['split'] == 'test']

In [15]:
train_images = np.array(train_df['image'].tolist()) 
val_images = np.array(val_df['image'].tolist()) 
test_images = np.array(test_df['image'].tolist())

In [16]:
print(f"Train images shape: {train_images.shape}") 
print(f"Validation images shape: {val_images.shape}") 
print(f"Test images shape: {test_images.shape}")

Train images shape: (4138, 224, 224, 3)
Validation images shape: (592, 224, 224, 3)
Test images shape: (1180, 224, 224, 3)


In [17]:
train_df.to_pickle('train_data.pkl') 
val_df.to_pickle('val_data.pkl') 
test_df.to_pickle('test_data.pkl')

Tokenize and Preprocess Reports

In [18]:
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences 
from tensorflow.keras.utils import to_categorical

In [19]:
tokenizer = Tokenizer(num_words=10000)  
tokenizer.fit_on_texts(train_df['report'].tolist())

In [20]:
encoded_reports = tokenizer.texts_to_sequences(train_df['report'].tolist()) 
max_length = max(len(r) for r in encoded_reports) 
padded_reports = pad_sequences(encoded_reports, maxlen=max_length, padding='post')

In [21]:
vocab_size = len(tokenizer.word_index) + 1 
categorical_reports = np.array([to_categorical(report, num_classes=vocab_size) for report in padded_reports])

In [22]:
print(f"Vocabulary size: {vocab_size}") 
print(f"Max report length: {max_length}") 
print(f"Padded reports shape: {padded_reports.shape}") 
print(f"Categorical reports shape: {categorical_reports.shape}")

Vocabulary size: 1424
Max report length: 150
Padded reports shape: (4138, 150)
Categorical reports shape: (4138, 150, 1424)


Model Building

In [23]:
from tensorflow.keras.applications import DenseNet121 
from tensorflow.keras.models import Model 
from tensorflow.keras.layers import Input, Dense, Flatten, RepeatVector

In [24]:
densenet = DenseNet121(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

In [25]:
for layer in densenet.layers: 
    layer.trainable = False

In [26]:
encoder_input = Input(shape=(224, 224, 3)) 
densenet_output = densenet(encoder_input) 
flat = Flatten()(densenet_output) 
dense = Dense(256, activation='relu')(flat) 
repeat = RepeatVector(max_length)(dense) 
encoder = Model(inputs=encoder_input, outputs=repeat)

In [27]:
encoder.summary()

In [28]:
from tensorflow.keras.layers import Input, LSTM, TimeDistributed, Dense

In [29]:
decoder_input = Input(shape=(max_length, 256)) 
lstm1 = LSTM(128, return_sequences=True)(decoder_input) 
lstm2 = LSTM(128, return_sequences=True)(lstm1) 
output = TimeDistributed(Dense(vocab_size, activation='softmax'))(lstm2) 
decoder = Model(inputs=decoder_input, outputs=output)

In [30]:
decoder.summary()

In [31]:
combined_input = encoder.output 
combined_output = decoder(combined_input)

In [32]:
caption_model = Model(inputs=encoder.input, outputs=combined_output) 
caption_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [33]:
caption_model.summary()

Training the Model

In [34]:
from tensorflow.keras.utils import Sequence

In [35]:
class DataGenerator(Sequence): 
    def __init__(self, df, tokenizer, batch_size=8, shuffle=True): # Further reducing batch size 
        self.df = df 
        self.batch_size = batch_size 
        self.shuffle = shuffle 
        self.indexes = np.arange(len(self.df)) 
        self.tokenizer = tokenizer 
        self.on_epoch_end() 
        
    def __len__(self): 
        return int(np.floor(len(self.df) / self.batch_size)) 
        
    def __getitem__(self, index): 
        batch_indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size] 
        batch_df = self.df.iloc[batch_indexes] 
        images = np.array([preprocess_image(path) for path in batch_df['image_path']]) 
        reports = batch_df['report'].tolist() 
        encoded_reports = self.tokenizer.texts_to_sequences(reports) 
        padded_reports = pad_sequences(encoded_reports, maxlen=max_length, padding='post') 
        categorical_reports = np.array([to_categorical(report, num_classes=vocab_size) for report in padded_reports]) 
        clear_gpu_memory() # Clear GPU memory 

        return images, categorical_reports 
        
    def on_epoch_end(self): 
        if self.shuffle: 
            np.random.shuffle(self.indexes)

In [36]:
train_generator = DataGenerator(train_df, tokenizer, batch_size=8) 
val_generator = DataGenerator(val_df, tokenizer, batch_size=8)

In [37]:
import tensorflow as tf

In [38]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  2


In [39]:
history = caption_model.fit( 
    train_generator,
    epochs=10, 
    validation_data=val_generator 
)

Epoch 1/10
[1m517/517[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m300s[0m 526ms/step - accuracy: 0.7753 - loss: 2.0228 - val_accuracy: 0.7978 - val_loss: 1.1772
Epoch 2/10
[1m517/517[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m260s[0m 496ms/step - accuracy: 0.7938 - loss: 1.1965 - val_accuracy: 0.7991 - val_loss: 1.1518
Epoch 3/10
[1m517/517[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m256s[0m 488ms/step - accuracy: 0.7948 - loss: 1.1932 - val_accuracy: 0.8000 - val_loss: 1.1552
Epoch 4/10
[1m517/517[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m255s[0m 487ms/step - accuracy: 0.7965 - loss: 1.1725 - val_accuracy: 0.8000 - val_loss: 1.1402
Epoch 5/10
[1m517/517[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m255s[0m 487ms/step - accuracy: 0.7958 - loss: 1.1864 - val_accuracy: 0.7999 - val_loss: 1.1388
Epoch 6/10
[1m517/517[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m249s[0m 475ms/step - accuracy: 0.7970 - loss: 1.1678 - val_accuracy: 0.8002 - val_loss: 1.1429
Epoc

Evaluating the Model

In [40]:
from sklearn.metrics import precision_score, recall_score, f1_score

In [41]:
test_generator = DataGenerator(test_df, tokenizer, batch_size=16)

In [42]:
predicted_reports = [] 
true_reports = []

In [43]:
for i in range(len(test_generator)): 
    images, true_categorical_reports = test_generator[i] 
    true_reports.extend(np.argmax(true_categorical_reports, axis=-1)) 
    predictions = caption_model.predict(images) 
    predicted_reports.extend(np.argmax(predictions, axis=-1))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 109ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 118ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 112ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 104ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 103ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 121ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 121ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 105ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s

In [44]:
true_reports_flat = [item for sublist in true_reports for item in sublist] 
predicted_reports_flat = [item for sublist in predicted_reports for item in sublist]

In [45]:
precision = precision_score(true_reports_flat, predicted_reports_flat, average='weighted') 
recall = recall_score(true_reports_flat, predicted_reports_flat, average='weighted') 
f1 = f1_score(true_reports_flat, predicted_reports_flat, average='weighted')

In [46]:
print(f'Precision: {precision}') 
print(f'Recall: {recall}') 
print(f'F1 Score: {f1}')

Precision: 0.7305527756108249
Recall: 0.8209303652968036
F1 Score: 0.7719203121163393


In [47]:
import os 
import time 
import transformers 
import torch 
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score 
from transformers import pipeline, LlamaForCausalLM, LlamaTokenizer 
from nltk.translate.bleu_score import sentence_bleu

In [48]:
os.environ['HF_TOKEN'] = "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"
os.environ['HUGGINGFACEHUB_API_TOKEN'] = "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"

In [None]:
model_id = "meta-llama/Llama-3.2-3B-Instruct"
# Ensure the model uses the CPU
llama_pipeline = transformers.pipeline(
    "text-generation", 
    model=model_id, 
    model_kwargs={"torch_dtype": torch.float32, "device_map": "cpu"}, 
    max_new_tokens=20
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
def generate_report_with_llama(image_path, model, max_new_tokens=50):
    image = preprocess_image(image_path) 
    image = np.expand_dims(image, axis=0) 
    image_features = caption_model.predict(image) # Further truncate image features to manage input size

    truncated_features = image_features[0][:32] # Reduce size significantly 

    input_text = "Generate a medical report based on these features: " + str(truncated_features.tolist()) 
    messages = [ {"role": "system", "content": ""}, {"role": "user", "content": input_text} ] 
    clear_gpu_memory() # Clear GPU memory 
    
    start_time = time.time() 
    
    try: 
        outputs = model(messages, max_new_tokens=max_new_tokens, pad_token_id=model.tokenizer.eos_token_id) 
    except RuntimeError as e: 
        clear_gpu_memory() 
        print(f"RuntimeError: {e}") 
        return None 
        
    end_time = time.time() 
    
    report = outputs[0]["generated_text"] 
    clear_gpu_memory() # Clear GPU memory 
    print("Time taken: ", end_time - start_time, "\n") 
    return report

In [None]:
sample_images = test_df['image_path'].tolist()[:5]

In [None]:
clear_gpu_memory() # Clear GPU memory 

In [None]:
for image_path in sample_images:
    generated_report = generate_report_with_llama(image_path, llama_pipeline) 
    if generated_report: 
        print(f"Generated report for {image_path}: {generated_report}") # Display the image 
        image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE) 
        plt.imshow(image, cmap='gray') 
        plt.title('Generated Report: ' + generated_report) 
        plt.axis('off') 
        plt.show()