<a href="https://colab.research.google.com/github/Saianiruth/cxr_foundation/blob/main/notebooks/quick_start_with_hugging_face.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

~~~
Copyright 2024 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
~~~
<table><tbody><tr>
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/google-health/cxr-foundation/blob/master/notebooks/quick_start_with_hugging_face.ipynb">
      <img alt="Google Colab logo" src="https://www.tensorflow.org/images/colab_logo_32px.png" width="32px"><br> Run in Google Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/google-health/cxr-foundation/blob/master/notebooks/quick_start_with_hugging_face.ipynb">
      <img alt="GitHub logo" src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" width="32px"><br> View on GitHub
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://huggingface.co/google/cxr-foundation">
      <img alt="HuggingFace logo" src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg" width="32px"><br> View on HuggingFace
    </a>
  </td>
</tr></tbody></table>

# Quick start with Hugging Face
This Colab notebook provides a basic demo of using Chest X-ray (CXR) Foundation. CXR Foundation is an embeddings models that generates a machine learning representations known as embeddings, from chest X-ray images and/or chest X-ray related text. These embeddings can be used to develop custom models for CXR use-cases with less data and compute compared to traditional model development methods. Learn more about embeddings and their benefits at this [page](https://developers.google.com/health-ai-developer-foundations/cxr-foundation).

In [1]:
# @title Authenticate with HuggingFace, skip if you have a HF_TOKEN secret

# Authenticate user for HuggingFace if needed. Enter token below if requested.
from huggingface_hub.utils import HfFolder
from huggingface_hub import notebook_login

if HfFolder.get_token() is None:
    from huggingface_hub import notebook_login
    notebook_login()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [28]:
# Install dependencies
major_version = tf.__version__.rsplit(".", 1)[0]
!pip install tensorflow-text=={major_version} pypng && pip install --no-deps pydicom hcls_imaging_ml_toolkit retrying


Collecting tensorflow-text==2.18
  Downloading tensorflow_text-2.18.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.8 kB)
Downloading tensorflow_text-2.18.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tensorflow-text
  Attempting uninstall: tensorflow-text
    Found existing installation: tensorflow-text 2.18.1
    Uninstalling tensorflow-text-2.18.1:
      Successfully uninstalled tensorflow-text-2.18.1
Successfully installed tensorflow-text-2.18.0
Collecting pydicom
  Downloading pydicom-3.0.1-py3-none-any.whl.metadata (9.4 kB)
Collecting hcls_imaging_ml_toolkit
  Downloading hcls_imaging_ml_toolkit-0.1-py3-none-any.whl.metadata (815 bytes)
Collecting retrying
  Downloading retrying-1.3.4-py3-none-any.whl.metadata (6.9 kB)
Downloading pydicom-3.0.1-py3-none-any.whl (2.4 MB)
[2K   [9

In [2]:
import os
import sys
import numpy as np
import pandas as pd
import pydicom
from PIL import Image
import tensorflow as tf
# Repository path setup
REPO_PATH = os.path.abspath("cxr-foundation/python")
if REPO_PATH not in sys.path:
    sys.path.append(REPO_PATH)
from clientside.clients import make_hugging_face_client

# Enable resource variables to suppress TensorFlow warnings
tf.compat.v1.enable_resource_variables()


# Initialize CXR client
cxr_client = make_hugging_face_client('cxr_model')

# Load CSV files
df_train = pd.read_csv("data/train.csv")[["Path", "Pleural Effusion", "No Finding"]]
df_valid = pd.read_csv("data/valid.csv")[["Path", "Pleural Effusion", "No Finding"]]

# Ensure paths are formatted correctly
df_train["Path"] = df_train["Path"].apply(lambda x: os.path.join("data", x.split("data/")[-1]))
df_valid["Path"] = df_valid["Path"].apply(lambda x: os.path.join("data", x.split("data/")[-1]))

df_valid.head()

Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]



Unnamed: 0,Path,Pleural Effusion,No Finding
0,data/valid/patient64541/study1/view1_frontal.jpg,0,0
1,data/valid/patient64542/study1/view1_frontal.jpg,0,0
2,data/valid/patient64542/study1/view2_lateral.jpg,0,0
3,data/valid/patient64543/study1/view1_frontal.jpg,0,0
4,data/valid/patient64544/study1/view1_frontal.jpg,0,1


In [10]:
df_train.head(10)

Unnamed: 0,Path,Pleural Effusion,No Finding
0,data/train/patient00001/study1/view1_frontal.jpg,,1.0
1,data/train/patient00002/study2/view1_frontal.jpg,-1.0,
2,data/train/patient00002/study1/view1_frontal.jpg,,
3,data/train/patient00002/study1/view2_lateral.jpg,,
4,data/train/patient00003/study1/view1_frontal.jpg,,
5,data/train/patient00004/study1/view1_frontal.jpg,0.0,1.0
6,data/train/patient00004/study1/view2_lateral.jpg,0.0,1.0
7,data/train/patient00005/study1/view1_frontal.jpg,0.0,1.0
8,data/train/patient00005/study1/view2_lateral.jpg,0.0,1.0
9,data/train/patient00005/study2/view1_frontal.jpg,,


In [11]:
!ls data/train/patient00004/study1

view1_frontal.jpg  view2_lateral.jpg


In [13]:
# Function to clean "Pleural Effusion" column
def clean_effusion(df):
    df = df.copy()
    df["Pleural Effusion"] = df["Pleural Effusion"].replace("", None)
    df["Pleural Effusion"] = pd.to_numeric(df["Pleural Effusion"], errors='coerce')
    df = df[~((df["Pleural Effusion"].isna()) | (df["Pleural Effusion"] == -1)) | (df["No Finding"] == 1)]
    df.loc[df["No Finding"] == 1, "Pleural Effusion"] = 0
    df = df.drop(columns=["No Finding"])
    return df.reset_index(drop=True)  # Reinitialize index

# Clean train and valid datasets
df_train = clean_effusion(df_train)
df_valid = clean_effusion(df_valid)
df_valid.head(10)



KeyError: 'No Finding'

In [8]:
df_train.head(10)

Unnamed: 0,Path,Pleural Effusion
0,data/train/patient00001/study1/view1_frontal.jpg,0.0
5,data/train/patient00004/study1/view1_frontal.jpg,0.0
6,data/train/patient00004/study1/view2_lateral.jpg,0.0
7,data/train/patient00005/study1/view1_frontal.jpg,0.0
8,data/train/patient00005/study1/view2_lateral.jpg,0.0
11,data/train/patient00006/study1/view1_frontal.jpg,0.0
12,data/train/patient00007/study1/view1_frontal.jpg,0.0
13,data/train/patient00007/study2/view1_frontal.jpg,0.0
14,data/train/patient00008/study1/view1_frontal.jpg,1.0
15,data/train/patient00008/study2/view1_frontal.jpg,1.0


In [None]:
# Output CSV for embeddings
EMBEDDING_CSV_PATH = "pleural_effusion_embeddings.csv"

# Function to process a single image
def process_image(image_path):
    try:
        if image_path.lower().endswith('.dcm'):
            dicom_data = pydicom.dcmread(image_path)
            img = dicom_data.pixel_array
            img = Image.fromarray(img).convert('L')  # Convert to grayscale
        else:
            img = Image.open(image_path).convert('L')  # Grayscale

        img = img.resize((224, 224))  # Resize
        img_array = np.array(img) / 255.0  # Normalize
        return img_array, os.path.basename(image_path)
    except Exception as e:
        print(f"Error processing {image_path}: {e}")
        return None

# Function to process and save embeddings
def process_and_save_embeddings(df):
    with open(EMBEDDING_CSV_PATH, "w") as f:
        f.write("embedding,label,image_name\n")  # CSV header

    for idx, row in df.iterrows():
        result = process_image(row["Path"])
        if result:
            img_array, image_name = result
            pil_image = Image.fromarray((img_array * 255).astype(np.uint8))

            embedding = cxr_client.get_image_embeddings_from_images([pil_image])

            if hasattr(embedding[0], "general_img_emb"):
                embedding_str = str(embedding[0].general_img_emb[0])
                label = 1 if row["Pleural Effusion"] > 0 else 0

                df_out = pd.DataFrame([[embedding_str, label, image_name]],
                                      columns=["embedding", "label", "image_name"])
                df_out.to_csv(EMBEDDING_CSV_PATH, mode="a", header=False, index=False)

        if idx % 100 == 0:
            print(f"Processed {idx} images")

# Process train and validation datasets
process_and_save_embeddings(df_train)
process_and_save_embeddings(df_valid)

print(f"✅ Saved embeddings to '{EMBEDDING_CSV_PATH}'")
