### importing libraries

In [1]:
import pandas as pd

### reading only required data

In [2]:
# Load only the two metadata columns this notebook works with.
df = pd.read_csv(
    "https://raw.githubusercontent.com/ieee8023/covid-chestxray-dataset/refs/heads/master/metadata.csv",
    usecols=["view", "filename"],
)

### previewing the loaded columns

In [3]:
# Display the loaded metadata; only the `view` and `filename` columns were read.
df

Unnamed: 0,view,filename
0,PA,auntminnie-a-2020_01_28_23_51_6665_2020_01_28_...
1,PA,auntminnie-b-2020_01_28_23_51_6665_2020_01_28_...
2,PA,auntminnie-c-2020_01_28_23_51_6665_2020_01_28_...
3,PA,auntminnie-d-2020_01_28_23_51_6665_2020_01_28_...
4,PA,nejmc2001573_f1a.jpeg
...,...,...
945,AP,072ecaf8c60a81980abb57150a8016_jumbo-9.jpeg
946,AP,ff33c406392b968d483174c97eb857_jumbo-9.jpeg
947,PA,000001-266.jpg
948,AP,000001-272.jpg


### metadata info

In [4]:
# Show column dtypes, non-null counts and memory usage of the loaded metadata.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 950 entries, 0 to 949
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   view      950 non-null    object
 1   filename  950 non-null    object
dtypes: object(2)
memory usage: 15.0+ KB


### metadata description

In [5]:
# Summarise both text columns: count, number of unique values, most frequent value and its frequency.
df.describe()

Unnamed: 0,view,filename
count,950,950
unique,7,950
top,PA,auntminnie-a-2020_01_28_23_51_6665_2020_01_28_...
freq,344,1


### keeping only the rows whose view is `AP` or `PA`

In [6]:
# Boolean mask of the rows whose `view` is one of the two views kept for this notebook.
wanted_view_mask = df["view"].isin(["AP", "PA"])
final_df = df.loc[wanted_view_mask]
final_df

Unnamed: 0,view,filename
0,PA,auntminnie-a-2020_01_28_23_51_6665_2020_01_28_...
1,PA,auntminnie-b-2020_01_28_23_51_6665_2020_01_28_...
2,PA,auntminnie-c-2020_01_28_23_51_6665_2020_01_28_...
3,PA,auntminnie-d-2020_01_28_23_51_6665_2020_01_28_...
4,PA,nejmc2001573_f1a.jpeg
...,...,...
943,AP,02b973e10caa192fd4e6825ad4aeaf_jumbo-10.jpeg
945,AP,072ecaf8c60a81980abb57150a8016_jumbo-9.jpeg
946,AP,ff33c406392b968d483174c97eb857_jumbo-9.jpeg
947,PA,000001-266.jpg


In [7]:
# Structure of the filtered frame (printed by .info()).
final_df.info()
# Summary of the filtered frame; as the last expression it is rendered as the cell output.
final_df.describe()

<class 'pandas.core.frame.DataFrame'>
Index: 547 entries, 0 to 948
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   view      547 non-null    object
 1   filename  547 non-null    object
dtypes: object(2)
memory usage: 12.8+ KB


Unnamed: 0,view,filename
count,547,547
unique,2,547
top,PA,auntminnie-a-2020_01_28_23_51_6665_2020_01_28_...
freq,344,1


In [8]:
# Split the metadata by view and keep the filename column of each split.
view_column = df["view"]
ap, pa = df.loc[view_column.eq("AP")], df.loc[view_column.eq("PA")]
ap_filename, pa_filename = ap["filename"], pa["filename"]

In [9]:
# Print the structure of each per-view frame, separated by blank lines.
for position, (title, frame) in enumerate((("AP dataframe", ap), ("PA dataframe", pa))):
    if position > 0:
        print("\n")
    print(title)
    frame.info()

AP dataframe
<class 'pandas.core.frame.DataFrame'>
Index: 203 entries, 9 to 948
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   view      203 non-null    object
 1   filename  203 non-null    object
dtypes: object(2)
memory usage: 4.8+ KB


PA dataframe
<class 'pandas.core.frame.DataFrame'>
Index: 344 entries, 0 to 947
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   view      344 non-null    object
 1   filename  344 non-null    object
dtypes: object(2)
memory usage: 8.1+ KB


### downloading images

In [10]:
import urllib
import os
from concurrent.futures import ThreadPoolExecutor
from functools import partial

In [11]:
url_base = "https://raw.githubusercontent.com/ieee8023/covid-chestxray-dataset/refs/heads/master/images/"


def downloadImg(imgName:str,imgType:str):
    """Download one image from ``url_base`` into ``./images/<imgType>/``.

    Args:
        imgName: File name of the image; it is appended to ``url_base`` to
            build the download URL and used as the local file name.
        imgType: Sub-directory of ``./images`` the file is stored in.

    Returns:
        None when the file already exists locally or was downloaded
        successfully, otherwise a string describing the error.
    """
    # Imported here because a bare ``import urllib`` does not load the
    # ``request`` submodule; without this the call only works when some other
    # library happens to have imported it already.
    import urllib.parse
    import urllib.request

    # Percent-encode the file name so characters such as spaces still give a valid URL.
    fullUrl = f"{url_base}{urllib.parse.quote(imgName)}"
    fileDest = f"./images/{imgType}/{imgName}"
    if os.path.exists(fileDest):
        return
    # Download under a temporary name and rename on success. Writing straight
    # to fileDest would leave a truncated file behind after a failed transfer,
    # and the existence check above would then skip it on every later run.
    partDest = f"{fileDest}.part"
    try:
        os.makedirs(os.path.dirname(fileDest), exist_ok=True)
        urllib.request.urlretrieve(fullUrl, partDest)
        os.replace(partDest, fileDest)
    except Exception as e:
        # Best-effort cleanup of the partial download.
        try:
            os.remove(partDest)
        except OSError:
            pass
        return f"Error downloading {imgName}:{e}"

### making directories to save images

In [12]:
# Root image folder plus one sub-folder per view.
saveDirs = [
    "./images",
    "./images/AP",
    "./images/PA",
]
# exist_ok=True makes re-running this cell harmless.
for target_dir in saveDirs:
    os.makedirs(target_dir, exist_ok=True)


### downloading images in parallel

In [13]:
# Download the first 10 images of each view in parallel.
# downloadImg returns an error message on failure and None otherwise. The
# iterator returned by executor.map has to be consumed, otherwise those
# messages are silently discarded.
for view_name, view_filenames in (("AP", ap_filename), ("PA", pa_filename)):
    func_with_args = partial(downloadImg, imgType=view_name)
    with ThreadPoolExecutor(max_workers=10) as executor:
        for error_message in executor.map(func_with_args, view_filenames.tolist()[:10]):
            if error_message:
                print(error_message)

### Summary

In [14]:
# Counts before and after filtering, plus the per-view breakdown.
total_images = df["filename"].count()
kept_images = final_df["filename"].count()
view_distribution = final_df["view"].value_counts()

print("Total images in original dataset :", total_images)
print("No of images after preprocessing :", kept_images)
print("\nDistribution of views\n", view_distribution)

Total images in original dataset : 950
No of images after preprocessing : 547

Distribution of views
 view
PA    344
AP    203
Name: count, dtype: int64


### applying CLAHE to the images

In [15]:
import cv2
import numpy as np

In [18]:
def showCLAHEImage(image_path):
    """Show a thresholded and a CLAHE-enhanced copy of an image side by side.

    The image is resized to 500x600 (width x height), converted to grayscale,
    and displayed in an OpenCV window with the binary threshold on the left
    and the brightened CLAHE result on the right. The call blocks until a key
    is pressed, then closes all OpenCV windows.

    Args:
        image_path: Path of the image file to read.

    Raises:
        FileNotFoundError: If OpenCV cannot read an image from ``image_path``.
    """
    image = cv2.imread(image_path)
    # cv2.imread reports a missing/unreadable file by returning None rather
    # than raising; fail here with the path instead of deep inside cv2.resize.
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    image_resized = cv2.resize(image, (500, 600))
    image_bw = cv2.cvtColor(image_resized, cv2.COLOR_BGR2GRAY)
    clahe = cv2.createCLAHE(clipLimit=10)
    # Widen before adding the brightness offset: uint8 arithmetic wraps around
    # (e.g. 250 + 30 -> 24), which would turn the brightest pixels dark and
    # leave nothing for np.clip to saturate.
    brightened = clahe.apply(image_bw).astype(np.int16) + 30
    clahe_img = np.clip(brightened, 0, 255).astype(np.uint8)
    _, threshold_img = cv2.threshold(image_bw, 180, 255, cv2.THRESH_BINARY)
    combined_view = np.hstack((threshold_img, clahe_img))
    cv2.imshow("Left: Threshold | Right: CLAHE", combined_view)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

In [21]:
# Preview the threshold/CLAHE comparison for the first 10 AP images.
for ap_name in ap_filename.head(10):
    showCLAHEImage("./images/AP/" + ap_name)