In [3]:
!pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.19.0-cp39-cp39-macosx_12_0_arm64.whl.metadata (4.0 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Downloading absl_py-2.2.2-py3-none-any.whl.metadata (2.6 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-1-py2.py3-none-macosx_11_0_arm64.whl.metadata (5.2 kB)
Collecting opt-einsum>=2.3.2 (from tensorflow)
  Downloading opt_einsum-3.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=

In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import random

from sklearn.metrics import confusion_matrix, classification_report
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from typing import Literal
from pathlib import Path
from PIL import Image
from tensorflow.keras.preprocessing.image import load_img, img_to_array

In [10]:
df_from_csv = pd.read_csv("/Users/theodoreutomo/cough-audio-predictions/CNNTrainTest/metadata_compiled_dummies.csv")
df_original = pd.read_csv("/Users/theodoreutomo/cough-audio-predictions/tabular_form/coughvid_v3.csv")

In [6]:
import os
import numpy as np
from tensorflow.keras.preprocessing.image import load_img, img_to_array

def load_images_with_filenames(folder_path: str,
                               target_size: tuple[int,int] = (128,128),
                               img_extensions: tuple[str,...] = ('.png','.jpg','.jpeg','.bmp','.gif')) \
                               -> np.ndarray:
    """
    Scans `folder_path` for image files, loads & resizes them, and returns
    a structured NumPy array with:
      - 'filename': the image file name (string)
      - 'image':    the image tensor (H x W x 3, dtype float32)
    """
    # 1) Gather valid file names
    filenames = [f for f in os.listdir(folder_path)
                 if f.lower().endswith(img_extensions)]
    N = len(filenames)
    H, W = target_size

    # Define a structured dtype: unicode filename + image array
    dtype = np.dtype([
        ('filename', f'U{max(len(f) for f in filenames)}'),
        ('image',     np.float32,      (H, W, 3))
    ])

    # Allocate the array and fill it
    data = np.empty(N, dtype=dtype)
    for i, fname in enumerate(filenames):
        img_path = os.path.join(folder_path, fname)
        img = load_img(img_path, target_size=target_size)   # PIL Image
        arr = img_to_array(img) / 255.0                     # normalize 0–1
        data[i] = (fname, arr)

    return data
                                   
folder = '/Users/theodoreutomo/cough-audio-predictions/YuanDataProcessing/all_images_combined'
dataset = load_images_with_filenames(folder, target_size=(64,64))

print(dataset[9]['filename'])         
print(dataset[9]['image'].shape)      


6ddcac11-f933-440a-ba0e-c6c379cc82b7.png
(64, 64, 3)


In [8]:
def save_image_dataset(data: np.ndarray, file_path: str):
    """
    Saves a structured NumPy array (with 'filename' & 'image' fields) to disk.
    - data: the structured array you got from load_images_with_filenames()
    - file_path: where to write, e.g. 'dataset.npy' or full/path/to/dataset.npy
    """
    # Ensures directory exists
    os.makedirs(os.path.dirname(file_path) or '.', exist_ok=True)
    # Save in NumPy's .npy format
    np.save(file_path, data)
    print(f"Saved {data.shape[0]} entries to {file_path!r}")

save_image_dataset(dataset, "/Users/theodoreutomo/cough-audio-predictions/YuanDataProcessing/images_dataset.npy")

Saved 22430 entries to '/Users/theodoreutomo/cough-audio-predictions/YuanDataProcessing/images_dataset.npy'


In [11]:
df_original.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34434 entries, 0 to 34433
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   datetime               34434 non-null  object 
 1   cough_detected         34434 non-null  float64
 2   latitude               19431 non-null  float64
 3   longitude              19431 non-null  float64
 4   age                    19396 non-null  float64
 5   gender                 20664 non-null  object 
 6   respiratory_condition  20664 non-null  object 
 7   fever_muscle_pain      20664 non-null  object 
 8   status                 20664 non-null  object 
 9   file_name              34434 non-null  object 
 10  audio_name             34434 non-null  object 
dtypes: float64(4), object(7)
memory usage: 2.9+ MB


In [12]:
df_original.head()

Unnamed: 0,datetime,cough_detected,latitude,longitude,age,gender,respiratory_condition,fever_muscle_pain,status,file_name,audio_name
0,2020-04-13T16:30:16.716367+00:00,0.9761,41.8,61.1,17.0,female,True,False,healthy,0f0762c8-4b9a-4c1e-a6c3-230f9d53de47.json,0f0762c8-4b9a-4c1e-a6c3-230f9d53de47.webm
1,2020-11-02T07:42:22.878931+00:00,0.0217,,,62.0,male,True,False,healthy,1ed36fa5-19bb-434b-b41f-7fa2b93763f1.json,1ed36fa5-19bb-434b-b41f-7fa2b93763f1.webm
2,2020-11-18T20:30:54.833009+00:00,0.9886,42.4,3.2,39.0,male,False,False,healthy,48b283df-8bda-4766-b02e-0efee9f7e59c.json,48b283df-8bda-4766-b02e-0efee9f7e59c.webm
3,2020-11-27T11:10:09.815282+00:00,0.8499,,,73.0,male,False,False,healthy,afd8d2b2-b4f1-49e0-ace8-29af9eb8f5e6.json,afd8d2b2-b4f1-49e0-ace8-29af9eb8f5e6.wav
4,2020-04-11T16:12:17.509769+00:00,0.7665,,,37.0,male,False,False,healthy,9e88a91b-9cc6-4376-b7c4-6fa5b3289592.json,9e88a91b-9cc6-4376-b7c4-6fa5b3289592.webm


In [18]:
def merge_image_array_with_df(
    df: pd.DataFrame,
    image_data: np.ndarray,
    df_filename_col: str = 'file_name',
    image_filename_field: str = 'filename',
    image_array_field: str = 'image',
    how: Literal['left','inner','right','outer'] = 'left'
) -> pd.DataFrame:
    """
    Merge a pandas DataFrame with a structured NumPy array of images,
    matching on filename stems (no extension) on both sides.
    """
    # 1) Copy df and create a 'stem' column by stripping extensions
    df2 = df.copy()
    df2['__stem'] = df2[df_filename_col].apply(lambda fn: Path(fn).stem)

    # 2) Build a DataFrame from your structured array and strip its extensions
    df_img = pd.DataFrame({
        image_filename_field: image_data[image_filename_field],
        image_array_field:     list(image_data[image_array_field])
    })
    df_img['__stem'] = df_img[image_filename_field].apply(lambda fn: Path(fn).stem)

    # 3) Merge on the stems
    merged = pd.merge(
        df2,
        df_img[['__stem', image_array_field]],
        on='__stem',
        how=how
    )

    # 4) Drop the helper column
    merged = merged.drop(columns=['__stem'])
    return merged

# --- usage ---
merged_df = merge_image_array_with_df(df_original, dataset)
print(merged_df.info())
print("Images attached:", merged_df['image'].notna().sum(), "out of", len(merged_df))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34434 entries, 0 to 34433
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   datetime               34434 non-null  object 
 1   cough_detected         34434 non-null  float64
 2   latitude               19431 non-null  float64
 3   longitude              19431 non-null  float64
 4   age                    19396 non-null  float64
 5   gender                 20664 non-null  object 
 6   respiratory_condition  20664 non-null  object 
 7   fever_muscle_pain      20664 non-null  object 
 8   status                 20664 non-null  object 
 9   file_name              34434 non-null  object 
 10  audio_name             34434 non-null  object 
 11  image                  22430 non-null  object 
dtypes: float64(4), object(8)
memory usage: 3.2+ MB
None
Images attached: 22430 out of 34434


In [20]:
clean_df = merged_df.dropna(subset=['image', 'status']).reset_index(drop=True)

print(f"Kept {len(clean_df)} rows with both an image and a status.")
print(clean_df.info())

Kept 20664 rows with both an image and a status.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20664 entries, 0 to 20663
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   datetime               20664 non-null  object 
 1   cough_detected         20664 non-null  float64
 2   latitude               11965 non-null  float64
 3   longitude              11965 non-null  float64
 4   age                    19396 non-null  float64
 5   gender                 20664 non-null  object 
 6   respiratory_condition  20664 non-null  object 
 7   fever_muscle_pain      20664 non-null  object 
 8   status                 20664 non-null  object 
 9   file_name              20664 non-null  object 
 10  audio_name             20664 non-null  object 
 11  image                  20664 non-null  object 
dtypes: float64(4), object(8)
memory usage: 1.9+ MB
None


In [21]:
clean_df.drop(columns=['latitude', 'longitude', 'datetime', 'age'], inplace=True)
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20664 entries, 0 to 20663
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   cough_detected         20664 non-null  float64
 1   gender                 20664 non-null  object 
 2   respiratory_condition  20664 non-null  object 
 3   fever_muscle_pain      20664 non-null  object 
 4   status                 20664 non-null  object 
 5   file_name              20664 non-null  object 
 6   audio_name             20664 non-null  object 
 7   image                  20664 non-null  object 
dtypes: float64(1), object(7)
memory usage: 1.3+ MB


In [22]:
clean_df.drop(columns=['file_name', 'audio_name'], inplace=True)
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20664 entries, 0 to 20663
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   cough_detected         20664 non-null  float64
 1   gender                 20664 non-null  object 
 2   respiratory_condition  20664 non-null  object 
 3   fever_muscle_pain      20664 non-null  object 
 4   status                 20664 non-null  object 
 5   image                  20664 non-null  object 
dtypes: float64(1), object(5)
memory usage: 968.8+ KB


In [26]:
clean_df_ohe = pd.get_dummies(
    clean_df,
    columns=['gender', 'respiratory_condition', 'fever_muscle_pain'],
    prefix=['gender', 'resp', 'fever'],
    drop_first=False  # set True if you want N-1 encoding
)

print(clean_df_ohe.info())
print(clean_df_ohe['status'].unique())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20664 entries, 0 to 20663
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   cough_detected  20664 non-null  float64
 1   status          20664 non-null  object 
 2   image           20664 non-null  object 
 3   gender_female   20664 non-null  bool   
 4   gender_male     20664 non-null  bool   
 5   gender_other    20664 non-null  bool   
 6   resp_False      20664 non-null  bool   
 7   resp_True       20664 non-null  bool   
 8   fever_False     20664 non-null  bool   
 9   fever_True      20664 non-null  bool   
dtypes: bool(7), float64(1), object(2)
memory usage: 625.7+ KB
None
['healthy' 'symptomatic' 'COVID-19']


In [31]:
clean_df_ohe = pd.get_dummies(
    clean_df,
    columns=['gender', 'respiratory_condition', 'fever_muscle_pain', 'status'],
    prefix=['gender', 'resp', 'fever', 'status'],
    drop_first=False
)

print(clean_df_ohe.head())

   cough_detected                                              image  \
0          0.9761  [[[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0,...   
1          0.0217  [[[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0,...   
2          0.9886  [[[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0,...   
3          0.8499  [[[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0,...   
4          0.7665  [[[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0,...   

   gender_female  gender_male  gender_other  resp_False  resp_True  \
0           True        False         False       False       True   
1          False         True         False       False       True   
2          False         True         False        True      False   
3          False         True         False        True      False   
4          False         True         False        True      False   

   fever_False  fever_True  status_COVID-19  status_healthy  \
0         True       False            False            True   
1         True      

In [32]:
clean_df_ohe.to_pickle('/Users/theodoreutomo/cough-audio-predictions/tabular_form/clean_df_ohe.pkl')

In [33]:
# Compute class sizes via the one-hot columns
n_healthy     = clean_df_ohe['status_healthy'].sum()
n_symptomatic = clean_df_ohe['status_symptomatic'].sum()
n_covid       = clean_df_ohe['status_COVID-19'].sum()

min_n = int(min(n_healthy, n_symptomatic, n_covid))
print(f"Sampling {min_n} rows from each status class…")

# Sample from each group
df_healthy     = clean_df_ohe[clean_df_ohe['status_healthy']     == 1].sample(n=min_n, random_state=42)
df_symptomatic = clean_df_ohe[clean_df_ohe['status_symptomatic'] == 1].sample(n=min_n, random_state=42)
df_covid       = clean_df_ohe[clean_df_ohe['status_COVID-19']    == 1].sample(n=min_n, random_state=42)

# Concatenate, shuffle, reset index
balanced_df_ohe = pd.concat([df_healthy, df_symptomatic, df_covid]) \
                     .sample(frac=1, random_state=42) \
                     .reset_index(drop=True)

# Sanity check
counts = {
    'healthy':     balanced_df_ohe['status_healthy'].sum(),
    'symptomatic': balanced_df_ohe['status_symptomatic'].sum(),
    'COVID-19':    balanced_df_ohe['status_COVID-19'].sum(),
}
print("Balanced class counts:", counts)
print("Balanced DF shape:", balanced_df_ohe.shape)

Sampling 1315 rows from each status class…
Balanced class counts: {'healthy': 1315, 'symptomatic': 1315, 'COVID-19': 1315}
Balanced DF shape: (3945, 12)


In [34]:
balanced_df_ohe.to_pickle('/Users/theodoreutomo/cough-audio-predictions/tabular_form/balanced_df_ohe.pkl')