<a href="https://colab.research.google.com/github/Saianiruth/cxr_foundation/blob/main/notebooks/quick_start_with_hugging_face.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# @title Authenticate with HuggingFace, skip if you have a HF_TOKEN secret

# Authenticate user for HuggingFace if needed. Enter token below if requested.
from huggingface_hub.utils import HfFolder
from huggingface_hub import notebook_login

if HfFolder.get_token() is None:
    from huggingface_hub import notebook_login
    notebook_login()

In [None]:
!mkdir ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [7]:
!kaggle datasets download -d ashery/chexpert --unzip -p data

Dataset URL: https://www.kaggle.com/datasets/ashery/chexpert
License(s): CC0-1.0
Downloading chexpert.zip to data
100% 10.7G/10.7G [02:16<00:00, 71.1MB/s]
100% 10.7G/10.7G [02:16<00:00, 84.4MB/s]


In [8]:
!rm data/train.csv
!rm data/valid.csv
!mv train.csv data/
!mv valid.csv data/

In [None]:
!git clone https://github.com/Google-Health/cxr-foundation.git

In [10]:
# Install dependencies
import tensorflow as tf
major_version = tf.__version__.rsplit(".", 1)[0]
!pip install tensorflow-text=={major_version} pypng && pip install --no-deps pydicom hcls_imaging_ml_toolkit retrying


Collecting tensorflow-text==2.18
  Downloading tensorflow_text-2.18.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.8 kB)
Collecting pypng
  Downloading pypng-0.20220715.0-py3-none-any.whl.metadata (13 kB)
Downloading tensorflow_text-2.18.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pypng-0.20220715.0-py3-none-any.whl (58 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.1/58.1 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypng, tensorflow-text
  Attempting uninstall: tensorflow-text
    Found existing installation: tensorflow-text 2.18.1
    Uninstalling tensorflow-text-2.18.1:
      Successfully uninstalled tensorflow-text-2.18.1
Successfully installed pypng-0.20220715.0 tensorflow-text-2.18.0
Collecting pydicom
  Downloading pydicom-3.0.1-py3-no

In [11]:
import os
import sys
import numpy as np
import pandas as pd
import pydicom
from PIL import Image
import tensorflow as tf
import multiprocessing
# Repository path setup
REPO_PATH = os.path.abspath("cxr-foundation/python")
if REPO_PATH not in sys.path:
    sys.path.append(REPO_PATH)
from clientside.clients import make_hugging_face_client

# Enable resource variables to suppress TensorFlow warnings
tf.compat.v1.enable_resource_variables()


# Initialize CXR client
cxr_client = make_hugging_face_client('cxr_model')

# Load CSV files
df_train = pd.read_csv("data/train.csv")[["Path", "Pleural Effusion", "No Finding"]]
df_valid = pd.read_csv("data/valid.csv")[["Path", "Pleural Effusion", "No Finding"]]

# Ensure paths are formatted correctly
df_train["Path"] = df_train["Path"].apply(lambda x: os.path.join("data", x.split("data/")[-1]))
df_valid["Path"] = df_valid["Path"].apply(lambda x: os.path.join("data", x.split("data/")[-1]))

df_valid.head()

Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

(…)xr-c-v2-pooled/variables/variables.index:   0%|          | 0.00/28.8k [00:00<?, ?B/s]

variables.data-00000-of-00001:   0%|          | 0.00/724M [00:00<?, ?B/s]

fingerprint.pb:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

saved_model.pb:   0%|          | 0.00/9.17M [00:00<?, ?B/s]

saved_model.pb:   0%|          | 0.00/6.89M [00:00<?, ?B/s]

(…)x-elixr-b-text/variables/variables.index:   0%|          | 0.00/5.91k [00:00<?, ?B/s]

fingerprint.pb:   0%|          | 0.00/46.0 [00:00<?, ?B/s]

variables.data-00000-of-00001:   0%|          | 0.00/2.02G [00:00<?, ?B/s]



Unnamed: 0,Path,Pleural Effusion,No Finding
0,data/valid/patient64541/study1/view1_frontal.jpg,0,0
1,data/valid/patient64542/study1/view1_frontal.jpg,0,0
2,data/valid/patient64542/study1/view2_lateral.jpg,0,0
3,data/valid/patient64543/study1/view1_frontal.jpg,0,0
4,data/valid/patient64544/study1/view1_frontal.jpg,0,1


In [12]:
df_train.head(10)

Unnamed: 0,Path,Pleural Effusion,No Finding
0,data/train/patient00001/study1/view1_frontal.jpg,,1.0
1,data/train/patient00002/study2/view1_frontal.jpg,-1.0,
2,data/train/patient00002/study1/view1_frontal.jpg,,
3,data/train/patient00002/study1/view2_lateral.jpg,,
4,data/train/patient00003/study1/view1_frontal.jpg,,
5,data/train/patient00004/study1/view1_frontal.jpg,0.0,1.0
6,data/train/patient00004/study1/view2_lateral.jpg,0.0,1.0
7,data/train/patient00005/study1/view1_frontal.jpg,0.0,1.0
8,data/train/patient00005/study1/view2_lateral.jpg,0.0,1.0
9,data/train/patient00005/study2/view1_frontal.jpg,,


In [13]:
!ls data/train/patient00004/study1

view1_frontal.jpg  view2_lateral.jpg


In [14]:
# Function to clean "Pleural Effusion" column
def clean_effusion(df):
    df = df.copy()
    df["Pleural Effusion"] = df["Pleural Effusion"].replace("", None)
    df["Pleural Effusion"] = pd.to_numeric(df["Pleural Effusion"], errors='coerce')
    df = df[~((df["Pleural Effusion"].isna()) | (df["Pleural Effusion"] == -1)) | (df["No Finding"] == 1)]
    df.loc[df["No Finding"] == 1, "Pleural Effusion"] = 0
    df = df.drop(columns=["No Finding"])
    return df.reset_index(drop=True)  # Reinitialize index

# Clean train and valid datasets
df_train = clean_effusion(df_train)
df_valid = clean_effusion(df_valid)
df_valid.head(10)



Unnamed: 0,Path,Pleural Effusion
0,data/valid/patient64541/study1/view1_frontal.jpg,0
1,data/valid/patient64542/study1/view1_frontal.jpg,0
2,data/valid/patient64542/study1/view2_lateral.jpg,0
3,data/valid/patient64543/study1/view1_frontal.jpg,0
4,data/valid/patient64544/study1/view1_frontal.jpg,0
5,data/valid/patient64545/study1/view1_frontal.jpg,1
6,data/valid/patient64546/study1/view1_frontal.jpg,0
7,data/valid/patient64547/study1/view1_frontal.jpg,0
8,data/valid/patient64547/study1/view2_frontal.jpg,0
9,data/valid/patient64547/study1/view3_lateral.jpg,0


In [15]:
df_train.head(10)

Unnamed: 0,Path,Pleural Effusion
0,data/train/patient00001/study1/view1_frontal.jpg,0.0
1,data/train/patient00004/study1/view1_frontal.jpg,0.0
2,data/train/patient00004/study1/view2_lateral.jpg,0.0
3,data/train/patient00005/study1/view1_frontal.jpg,0.0
4,data/train/patient00005/study1/view2_lateral.jpg,0.0
5,data/train/patient00006/study1/view1_frontal.jpg,0.0
6,data/train/patient00007/study1/view1_frontal.jpg,0.0
7,data/train/patient00007/study2/view1_frontal.jpg,0.0
8,data/train/patient00008/study1/view1_frontal.jpg,1.0
9,data/train/patient00008/study2/view1_frontal.jpg,1.0


In [None]:
from tqdm import tqdm

from concurrent.futures import ProcessPoolExecutor
# Output CSV for embeddings
EMBEDDING_CSV_PATH = "pleural_effusion_embeddings.csv"

# Function to process a single image (using GPU)
def process_image(image_path):
    try:
        if image_path.lower().endswith('.dcm'):
            dicom_data = pydicom.dcmread(image_path)
            img = dicom_data.pixel_array
            img = Image.fromarray(img).convert('L')  # Convert to grayscale
        else:
            img = Image.open(image_path).convert('L')  # Grayscale

        img = img.resize((224, 224))  # Resize
        img_array = np.array(img) / 255.0  # Normalize
        return img_array, os.path.basename(image_path)
    except Exception as e:
        print(f"Error processing {image_path}: {e}")
        return None

# Function to process and save embeddings (use GPU for inference)
def process_and_save_embeddings(df):
    with open(EMBEDDING_CSV_PATH, "w") as f:
        f.write("embedding,label,image_name\n")  # CSV header

    # Ensure GPU is used for inference
    with tf.device('/GPU:0'):
        # Use tqdm to show progress
        for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing images", ncols=100):
            result = process_image(row["Path"])
            if result:
                img_array, image_name = result
                pil_image = Image.fromarray((img_array * 255).astype(np.uint8))

                # Run embedding generation on GPU
                embedding = cxr_client.get_image_embeddings_from_images([pil_image])

                if hasattr(embedding[0], "general_img_emb"):
                    embedding_str = str(embedding[0].general_img_emb[0])
                    label = 1 if row["Pleural Effusion"] > 0 else 0

                    df_out = pd.DataFrame([[embedding_str, label, image_name]],
                                          columns=["embedding", "label", "image_name"])
                    df_out.to_csv(EMBEDDING_CSV_PATH, mode="a", header=False, index=False)

            if idx % 100 == 0:
                print(f"Processed {idx} images")

# Process train and validation datasets
process_and_save_embeddings(df_train)
process_and_save_embeddings(df_valid)

print(f"✅ Saved embeddings to '{EMBEDDING_CSV_PATH}'")

Processing images:   0%|                                     | 1/132514 [00:22<820:55:56, 22.30s/it]

Processed 0 images


Processing images:   0%|                                    | 101/132514 [03:24<67:48:09,  1.84s/it]

Processed 100 images


Processing images:   0%|                                    | 201/132514 [06:26<66:47:05,  1.82s/it]

Processed 200 images


Processing images:   0%|                                    | 301/132514 [09:28<67:33:53,  1.84s/it]

Processed 300 images


Processing images:   0%|                                    | 401/132514 [12:29<66:16:16,  1.81s/it]

Processed 400 images


Processing images:   0%|▏                                   | 501/132514 [15:31<66:49:02,  1.82s/it]

Processed 500 images


Processing images:   0%|▏                                   | 601/132514 [18:34<66:36:40,  1.82s/it]

Processed 600 images


Processing images:   1%|▏                                   | 701/132514 [21:35<66:18:17,  1.81s/it]

Processed 700 images


Processing images:   1%|▏                                   | 801/132514 [24:36<66:11:49,  1.81s/it]

Processed 800 images


Processing images:   1%|▏                                   | 901/132514 [27:38<65:57:57,  1.80s/it]

Processed 900 images


Processing images:   1%|▎                                  | 1001/132514 [30:38<65:46:35,  1.80s/it]

Processed 1000 images


Processing images:   1%|▎                                  | 1101/132514 [33:39<66:24:02,  1.82s/it]

Processed 1100 images


Processing images:   1%|▎                                  | 1201/132514 [36:39<65:49:17,  1.80s/it]

Processed 1200 images


Processing images:   1%|▎                                  | 1301/132514 [39:40<65:50:05,  1.81s/it]

Processed 1300 images


Processing images:   1%|▎                                  | 1401/132514 [42:41<66:05:49,  1.81s/it]

Processed 1400 images


Processing images:   1%|▍                                  | 1501/132514 [45:41<65:51:04,  1.81s/it]

Processed 1500 images


Processing images:   1%|▍                                  | 1601/132514 [48:42<66:21:04,  1.82s/it]

Processed 1600 images


Processing images:   1%|▍                                  | 1701/132514 [51:43<65:49:07,  1.81s/it]

Processed 1700 images


Processing images:   1%|▍                                  | 1801/132514 [54:44<65:30:20,  1.80s/it]

Processed 1800 images


Processing images:   1%|▌                                  | 1901/132514 [57:44<65:21:28,  1.80s/it]

Processed 1900 images


Processing images:   2%|▍                                | 2001/132514 [1:00:45<66:34:37,  1.84s/it]

Processed 2000 images


Processing images:   2%|▌                                | 2091/132514 [1:03:28<65:21:01,  1.80s/it]

In [12]:
# Function to process a single image (using GPU)
def process_image(image_path):
    try:
        if image_path.lower().endswith('.dcm'):
            dicom_data = pydicom.dcmread(image_path)
            img = dicom_data.pixel_array
            img = Image.fromarray(img).convert('L')  # Convert to grayscale
        else:
            img = Image.open(image_path).convert('L')  # Grayscale

        img = img.resize((224, 224))  # Resize
        img_array = np.array(img) / 255.0  # Normalize
        return img_array, os.path.basename(image_path)
    except Exception as e:
        print(f"Error processing {image_path}: {e}")
        return None

# Function to process and save embeddings (use GPU for inference)
def process_and_save_embeddings(df, output_path):
    with open(output_path, "w") as f:
        f.write("embedding,label,image_name\n")  # CSV header

    with tf.device('/GPU:0'):
        for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing images", ncols=100):
            result = process_image(row["Path"])
            if result:
                img_array, image_name = result
                pil_image = Image.fromarray((img_array * 255).astype(np.uint8))

                # Run embedding generation on GPU
                embedding = cxr_client.get_image_embeddings_from_images([pil_image])

                if hasattr(embedding[0], "general_img_emb"):
                    embedding_str = str(embedding[0].general_img_emb[0])
                    label = 1 if row["Pleural Effusion"] > 0 else 0

                    df_out = pd.DataFrame([[embedding_str, label, image_name]],
                                          columns=["embedding", "label", "image_name"])
                    df_out.to_csv(output_path, mode="a", header=False, index=False)

            if idx % 100 == 0:
                print(f"Processed {idx} images")

# Splitting the dataframes
df_train_part1 = df_train.iloc[:len(df_train)//2]
df_train_part2 = df_train.iloc[len(df_train)//2:]

# Running both splits simultaneously
with ProcessPoolExecutor(max_workers=2) as executor:
    executor.submit(process_and_save_embeddings, df_train_part1, "part1_embeddings.csv")
    executor.submit(process_and_save_embeddings, df_train_part2, "part2_embeddings.csv")

# Merging the outputs
final_df = pd.concat([
    pd.read_csv("part1_embeddings.csv"),
    pd.read_csv("part2_embeddings.csv")
])
final_df.to_csv(EMBEDDING_CSV_PATH, index=False)

print(f"✅ Saved embeddings to '{EMBEDDING_CSV_PATH}'")
