In [1]:
############# Code to Train Models Combining GoPro and IMAGINE V2 Data #############

# This was done to try to improve the generalizability of the IMAGINE models

In [47]:
# Check that path to cudnn is correct
import os
print(os.environ['LD_LIBRARY_PATH'])
!echo $LD_LIBRARY_PATH

/usr/lib/R/lib:/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/default-java/lib/server
/usr/lib/R/lib:/lib:/usr/lib/x86_64-linux-gnu:/usr/lib/jvm/default-java/lib/server


In [23]:
# Import modules
import tensorflow as tf
from tensorflow import keras as K
from keras.models import load_model
import pandas as pd
import numpy as np
import PIL
import logging
import os

import zipfile
import shutil

# for figures
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
#import seaborn as sns

# for the Grad-CAMs
from IPython.display import Image, display
import matplotlib.cm as cm

# Working directory on the A4 computer (GoPro and V2 data are on the A4 computer)

# Listing of the directory the kernel started in.
# NOTE(review): `files` is not read anywhere in the visible part of the notebook - confirm it is still needed.
files = os.listdir()

os.chdir("/home/scottweichenthal/Dropbox/IMAGINE Project/MSSI_Project/")

# Allow GPU memory allocation to grow as needed.
# Memory growth can only be configured before the GPUs have been initialised, so
# re-running this cell in a live kernel raises RuntimeError; report it instead of
# aborting the cell.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as err:
    print(err)
    

In [24]:
# Report the TensorFlow and Keras versions, then how many GPUs TensorFlow can see
for module_version in (tf.__version__, K.__version__):
    print(module_version)
visible_gpus = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(visible_gpus))

2.7.0
2.7.0
Num GPUs Available:  4


In [25]:
# Report which Python interpreter the kernel is running
from platform import python_version

interpreter_version = python_version()
print(interpreter_version)

3.9.7


In [7]:
#To clear GPU memory if needed (in RStudio Terminal)
#nvidia-smi
#sudo kill -9 PID # PID is the id of the process holding the GPU memory (listed by nvidia-smi); look at the terminal in RStudio


In [26]:
# Load the metadata for the compiled dataset used for CNN model development.

# Earlier version, with GoPro data in both the trn and val sets:
#metadata = pd.read_csv("compiled data/metadata_goproUFP_V2_V3_15012024.csv", low_memory=False)

# Version loaded here: GoPro data appear only in the trn set
# (as opposed to a random split between trn, val, tst).
#   Train: 100% GoPro + 80% Imagine V2
#   Val: 10% Imagine V2
#   Tst: 10% Imagine V2
#   External Test: 100% Imagine V3
metadata_csv = "compiled data/metadata_goproUFP_V2_V3_04042024.csv"
metadata = pd.read_csv(metadata_csv, low_memory=False)


# The compiled file below incorporates all existing data, with GoPro in the training set only.
# Rishabh, please use this file!

#metadata = pd.read_csv("compiled data/metadata_goproUFP_V2_V3_ .csv", low_memory=False)
#   Train: 100% GoPro + 80% Imagine V2 and V3
#   Val: 10% Imagine V2 and V3
#   Tst: 10% Imagine V2 and V3


# There is another compiled file called "metadata_goproNOISE_V2_V3_04252024.csv".
# It contains additional noise measurements, but UFP measurements are missing for those additional rows.
# The noise models can be retrained with this additional data if necessary, but they already work well.

In [7]:
# Preview the first ten rows of the metadata
metadata.head(n=10)

Unnamed: 0,datetime,image_path,audio_path,ln_ufp_num_10s_ma_image_label_raw,ln_ufp_num_10s_ma_spec_label_raw,ln_ufp_size_10s_ma_image_label_raw,ln_ufp_size_10s_ma_spec_label_raw,ln_noise_10s_ma_image_label_raw,ln_noise_10s_ma_spec_label_raw,ufp_num_10s_ma_image_label_raw,...,ufp_num_10s_ma_spec_label_quartile,ufp_num_10s_ma_image_label_quartile,ufp_size_10s_ma_spec_label_quartile,ufp_size_10s_ma_image_label_quartile,image_extension,audio_extension,image_name,audio_name,pair_pm25,set
0,2019-04-23T10:20:27Z,archived files no longer used/GoPro Model/data...,archived files no longer used/GoPro Model/data...,9.744668,9.744668,3.09874,3.09874,4.319752,4.319752,17063.0,...,,,,,,,,,,trn
1,2019-04-23T10:20:28Z,archived files no longer used/GoPro Model/data...,archived files no longer used/GoPro Model/data...,9.710145,9.710145,3.119276,3.119276,4.309322,4.309322,16484.0,...,,,,,,,,,,trn
2,2019-04-23T10:20:29Z,archived files no longer used/GoPro Model/data...,archived files no longer used/GoPro Model/data...,9.683153,9.683153,3.144583,3.144583,4.327702,4.327702,16045.0,...,,,,,,,,,,trn
3,2019-04-23T10:20:30Z,archived files no longer used/GoPro Model/data...,archived files no longer used/GoPro Model/data...,9.583558,9.583558,3.167583,3.167583,4.320018,4.320018,14524.0,...,,,,,,,,,,trn
4,2019-04-23T10:20:31Z,archived files no longer used/GoPro Model/data...,archived files no longer used/GoPro Model/data...,9.509259,9.509259,3.184698,3.184698,4.303119,4.303119,13484.0,...,,,,,,,,,,trn
5,2019-04-23T10:20:32Z,archived files no longer used/GoPro Model/data...,archived files no longer used/GoPro Model/data...,9.479069,9.479069,3.197448,3.197448,4.27736,4.27736,13083.0,...,,,,,,,,,,trn
6,2019-04-23T10:20:33Z,archived files no longer used/GoPro Model/data...,archived files no longer used/GoPro Model/data...,9.422544,9.422544,3.21165,3.21165,4.273606,4.273606,12364.0,...,,,,,,,,,,trn
7,2019-04-23T10:20:34Z,archived files no longer used/GoPro Model/data...,archived files no longer used/GoPro Model/data...,9.399638,9.399638,3.223664,3.223664,4.26,4.26,12084.0,...,,,,,,,,,,trn
8,2019-04-23T10:20:35Z,archived files no longer used/GoPro Model/data...,archived files no longer used/GoPro Model/data...,9.403107,9.403107,3.235536,3.235536,4.227709,4.227709,12126.0,...,,,,,,,,,,trn
9,2019-04-23T10:20:36Z,archived files no longer used/GoPro Model/data...,archived files no longer used/GoPro Model/data...,9.406976,9.406976,3.241029,3.241029,4.200804,4.200804,12173.0,...,,,,,,,,,,trn


In [27]:
# List every metadata column name, one per line
for column_name in metadata.columns.tolist():
    print(column_name)

datetime
image_path
audio_path
ln_ufp_num_10s_ma_image_label_raw
ln_ufp_num_10s_ma_spec_label_raw
ln_ufp_size_10s_ma_image_label_raw
ln_ufp_size_10s_ma_spec_label_raw
ln_noise_10s_ma_image_label_raw
ln_noise_10s_ma_spec_label_raw
ufp_num_10s_ma_image_label_raw
ufp_num_10s_ma_spec_label_raw
ufp_size_10s_ma_image_label_raw
ufp_size_10s_ma_spec_label_raw
noise_10s_ma_image_label_raw
noise_10s_ma_spec_label_raw
temp_airp
wspd_airp
year
city
pm25
file_exists
hardware
site_id
device_id
file
noise_10s_ma_spec_label_quartile
noise_10s_ma_image_label_quartile
ufp_num_10s_ma_spec_label_quartile
ufp_num_10s_ma_image_label_quartile
ufp_size_10s_ma_spec_label_quartile
ufp_size_10s_ma_image_label_quartile
image_extension
audio_extension
image_name
audio_name
pair_pm25
set


In [28]:
# Look at data types (the pandas dtype of every metadata column)
metadata.dtypes

datetime                                 object
image_path                               object
audio_path                               object
ln_ufp_num_10s_ma_image_label_raw       float64
ln_ufp_num_10s_ma_spec_label_raw        float64
ln_ufp_size_10s_ma_image_label_raw      float64
ln_ufp_size_10s_ma_spec_label_raw       float64
ln_noise_10s_ma_image_label_raw         float64
ln_noise_10s_ma_spec_label_raw          float64
ufp_num_10s_ma_image_label_raw          float64
ufp_num_10s_ma_spec_label_raw           float64
ufp_size_10s_ma_image_label_raw         float64
ufp_size_10s_ma_spec_label_raw          float64
noise_10s_ma_image_label_raw            float64
noise_10s_ma_spec_label_raw             float64
temp_airp                               float64
wspd_airp                                 int64
year                                      int64
city                                     object
pm25                                    float64
file_exists                             

In [29]:
# Number of metadata rows assigned to each value of the `set` column (trn, val, tst, ...)

metadata["set"].value_counts()

trn        327094
ext_tst     23233
val         20073
tst         19910
Name: set, dtype: int64

In [30]:
# Select input file type (this tells python which metadata column holds the file paths)

# Accepted answers and the metadata column each one maps to
_INPUT_COLUMNS = {'images': 'image_path', 'spectrograms': 'audio_path'}

file = input("What input are you using: images, or spectrograms? ").strip()

# An unrecognised answer must not be left in `file`: later cells use `file` as a
# metadata column name and as part of the log/checkpoint file names, so stop here
# with the typo message instead of carrying the bad value forward.
if file not in _INPUT_COLUMNS:
    raise ValueError('!!!TYPO in input_data name: ' + repr(file))

file = _INPUT_COLUMNS[file]


What input are you using: images, or spectrograms? images


In [31]:
# Define Initial Learning Rate
# (read by get_compiled_model() when it builds the Nadam optimizer)
initial_learning_rate = 1e-4
# Label for the metadata variant in use; embedded in the training-log and checkpoint file names
meta_data_name = "GoPro_Training_Only"

In [32]:
# Select Model Architecture/Optimizer you want to use - STARTS FROM IMAGENET WEIGHTS-

# Supported choices: name -> (backbone constructor, matching input-preprocessing function)
_ARCHITECTURES = {
    'Xception_linear_Nadam': (K.applications.Xception,
                              K.applications.xception.preprocess_input),
    'ResNet50_linear_Nadam': (K.applications.ResNet50,
                              K.applications.resnet50.preprocess_input),
}

architecture = input("What architecture do you want to use: Xception_linear_Nadam, ResNet50_linear_Nadam ?").strip()

# Stop on an unrecognised name. Otherwise get_compiled_model / architecture_preprocessing
# would be left undefined (or still bound to a previous run's choice in a live kernel).
if architecture not in _ARCHITECTURES:
    raise ValueError('!!!TYPO in architecture name: ' + repr(architecture))

_backbone, architecture_preprocessing = _ARCHITECTURES[architecture]


def get_compiled_model(backbone=_backbone):
    """Build and compile a single-output regression CNN on an ImageNet-initialised backbone.

    The network is: 256x256x3 float32 input -> `backbone` without its top
    (weights="imagenet") -> global average pooling -> Dense(1, linear).
    It is compiled with the Nadam optimizer, MSE loss and MAE as a metric.

    Parameters
    ----------
    backbone : callable, optional
        Keras application constructor accepting `include_top`, `weights` and
        `input_tensor`. Defaults to the backbone selected by `architecture`
        when this cell was run.

    Returns
    -------
    The compiled Keras model.

    Notes
    -----
    The learning rate is read from the module-level `initial_learning_rate`
    at call time.
    """
    model_input = K.layers.Input(shape=(256, 256, 3), dtype='float32', name='input')
    conv_base = backbone(include_top=False, weights="imagenet", input_tensor=model_input)
    model_output = K.layers.GlobalAveragePooling2D()(conv_base.output)
    model_output = K.layers.Dense(units=1, activation='linear')(model_output)
    model = K.models.Model(inputs=model_input, outputs=model_output)
    model.compile(
        optimizer=K.optimizers.Nadam(learning_rate=initial_learning_rate),
        loss='mse',
        metrics=['mae'],
    )
    return model



What architecture do you want to use: Xception_linear_Nadam, ResNet50_linear_Nadam ?ResNet50_linear_Nadam


In [33]:
# Select Exposure to be modelled

# Label columns that may be used as the regression target
_TARGET_COLUMNS = (
    'ln_noise_10s_ma_image_label_raw',
    'ln_noise_10s_ma_spec_label_raw',
    'ln_ufp_num_10s_ma_image_label_raw',
    'ln_ufp_num_10s_ma_spec_label_raw',
    'ln_ufp_size_10s_ma_image_label_raw',
    'ln_ufp_size_10s_ma_spec_label_raw',
)

target = input("What do you want to model: ln_noise_10s_ma_spec_label_raw, ln_noise_10s_ma_image_label_raw, ln_ufp_num_10s_ma_spec_label_raw, ln_ufp_num_10s_ma_image_label_raw, ln_ufp_size_10s_ma_image_label_raw, ln_ufp_size_10s_ma_spec_label_raw ?").strip()

# Stop on an unrecognised name. Otherwise the generators and callbacks below would be
# left undefined (or still bound to a previous run's target in a live kernel).
if target not in _TARGET_COLUMNS:
    raise ValueError('!!!TYPO in exposure name: ' + repr(target))


def _flow_for_split(image_generator, split, shuffle):
    """Return a dataframe iterator over the metadata rows whose `set` equals `split`.

    Paths are read from the `file` column and labels from the `target` column
    (class_mode='raw'); images are resized to 256 x 256 RGB and served in
    batches of 64.
    """
    split_frame = metadata.loc[metadata['set'] == split, [target, file]].reset_index(drop=True)
    return image_generator.flow_from_dataframe(dataframe=split_frame,
                                               x_col=file,
                                               y_col=target,
                                               class_mode='raw',
                                               target_size=(256, 256),  # all of our images will be resized to 256 x 256
                                               color_mode='rgb',
                                               batch_size=64,
                                               shuffle=shuffle)


# Training images: architecture-specific preprocessing plus random horizontal flips
generator = K.preprocessing.image.ImageDataGenerator(preprocessing_function=architecture_preprocessing,
                                                     horizontal_flip=True,
                                                     vertical_flip=False)

# Validation images: same preprocessing but NO random flips, so val_loss (which drives
# ReduceLROnPlateau and the best-model checkpoint) is computed on unaugmented images
# and does not vary with the random augmentation.
validation_image_generator = K.preprocessing.image.ImageDataGenerator(preprocessing_function=architecture_preprocessing,
                                                                      horizontal_flip=False,
                                                                      vertical_flip=False)

train_generator = _flow_for_split(generator, 'trn', shuffle=True)
validate_generator = _flow_for_split(validation_image_generator, 'val', shuffle=False)

# Define callbacks
# Log and checkpoint share one run name built from target, architecture, input column and metadata label
_run_name = target+','+architecture+','+file+','+meta_data_name+'_Combined_IMAGINE_v2_GoPro'
csv_logger = K.callbacks.CSVLogger('./model_development/logs/' + _run_name + '.csv')
model_checkpoint = K.callbacks.ModelCheckpoint('./model_development/models/' + _run_name + '.hdf5',
                                               monitor='val_loss',
                                               mode='auto',
                                               save_weights_only=False,
                                               save_best_only=True)
reduce_lr_on_plateau = K.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, mode='auto', verbose=1)

What do you want to model: ln_noise_10s_ma_spec_label_raw, ln_noise_10s_ma_image_label_raw, ln_ufp_num_10s_ma_spec_label_raw, ln_ufp_num_10s_ma_image_label_raw, ln_ufp_size_10s_ma_image_label_raw, ln_ufp_size_10s_ma_spec_label_raw ?ln_ufp_size_10s_ma_image_label_raw
Found 327094 validated image filenames.
Found 20073 validated image filenames.


In [34]:
# Train the model

# Create a MirroredStrategy and build/compile the model inside its scope
strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
    model = get_compiled_model()

# Number of batches needed for one full pass over each generator (last partial batch included)
train_steps = int(np.ceil(train_generator.samples / train_generator.batch_size))
val_steps = int(np.ceil(validate_generator.samples / validate_generator.batch_size))
training_callbacks = [csv_logger, reduce_lr_on_plateau, model_checkpoint]

# Fit model
model.fit(
    train_generator,
    validation_data=validate_generator,
    epochs=50,
    steps_per_epoch=train_steps,
    validation_steps=val_steps,
    callbacks=training_callbacks,
)

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3')


2024-05-08 16:26:56.580103: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:766] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Did not find a shardable source, walked to a node which is not a dataset: name: "FlatMapDataset/_2"
op: "FlatMapDataset"
input: "TensorDataset/_1"
attr {
  key: "Targuments"
  value {
    list {
    }
  }
}
attr {
  key: "_cardinality"
  value {
    i: -2
  }
}
attr {
  key: "f"
  value {
    func {
      name: "__inference_Dataset_flat_map_flat_map_fn_1738953"
    }
  }
}
attr {
  key: "metadata"
  value {
    s: "\n\023FlatMapDataset:2257"
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
        dim {
          size: -1
        }
        dim {
          size: -1
        }
        dim {
          size: -1
        }
        dim {
          size: -1
        }
      }
      shape {
        dim {
          size: -1
        }
      }
    }
  }
}
attr {


Epoch 1/50
INFO:tensorflow:batch_all_reduce: 214 all-reduces with algorithm = nccl, num_packs = 1
INFO:tensorflow:batch_all_reduce: 214 all-reduces with algorithm = nccl, num_packs = 1
 278/5111 [>.............................] - ETA: 1:13:03 - loss: 0.3227 - mae: 0.3633

KeyboardInterrupt: 

In [45]:
# Identify the best epoch: the row of the training log with the lowest validation loss

training_log_path = './model_development/logs/'+target+','+architecture+','+file+','+meta_data_name+'_Combined_IMAGINE_v2_GoPro.csv'
res = pd.read_csv(filepath_or_buffer=training_log_path)
# Sort so the lowest val_loss ends up in row 0 of the re-indexed frame
res = res.sort_values(by='val_loss', ascending=True).reset_index(drop=True)
print(res)
# NOTE(review): the `epoch` column is presumably 0-based as written by CSVLogger - confirm
# before comparing it with the "Epoch k/50" numbering of the training output.
best_epoch = res.loc[0, 'epoch']

    epoch      loss       mae  val_loss   val_mae
0      37  0.002740  0.035734  0.028700  0.110716
1      42  0.002756  0.035744  0.028810  0.110747
2      36  0.002749  0.035794  0.028827  0.111200
3      48  0.002743  0.035744  0.028847  0.110824
4      25  0.002737  0.035706  0.028853  0.111212
5      28  0.002741  0.035734  0.028924  0.110994
6      44  0.002741  0.035731  0.028941  0.111206
7      34  0.002748  0.035719  0.028953  0.111552
8      21  0.002775  0.035915  0.028957  0.111451
9      27  0.002744  0.035764  0.028974  0.111251
10     38  0.002754  0.035753  0.028978  0.111134
11     35  0.002758  0.035721  0.029039  0.111433
12     49  0.002752  0.035770  0.029044  0.111044
13     29  0.002769  0.035769  0.029049  0.111070
14     47  0.002738  0.035661  0.029055  0.111442
15     41  0.002749  0.035784  0.029056  0.111262
16     43  0.002738  0.035730  0.029084  0.111223
17     24  0.002754  0.035784  0.029104  0.111609
18     45  0.002760  0.035812  0.029108  0.111370


In [None]:
##### Generate Predictions for compiled data, add them to dataframe, save new dataframe #####

In [9]:
# Load metadata
#metadata = pd.read_csv("compiled data/metadata_goproUFP_V2_V3_15012024.csv", low_memory=False)

# This version has GoPro data only in trn set (as opposed to random split between trn,val,tst)
#metadata = pd.read_csv("compiled data/metadata_goproUFP_V2_V3_04042024.csv", low_memory=False)
metadata = pd.read_csv("compiled data/metadata_goproUFP_V2_V3_04042024.csv", low_memory=False)

# Flag the rows whose image file actually exists on disk.
# Series.map calls os.path.isfile once per path without materialising a row object
# for every record, which DataFrame.apply(axis=1) does.
metadata['file_exists'] = metadata['image_path'].map(os.path.isfile).astype(bool)

# Remove missing files from metadata to avoid errors in adding predictions to dataframe.
# .copy() gives an independent frame, so columns added later are not written into a
# slice of the full table.
# NOTE(review): only image_path is checked here; rows with a missing audio file are kept - confirm this is intended.
metadata = metadata.loc[metadata['file_exists']].copy()
metadata.head(10)



Unnamed: 0,datetime,image_path,audio_path,ln_ufp_num_10s_ma_image_label_raw,ln_ufp_num_10s_ma_spec_label_raw,ln_ufp_size_10s_ma_image_label_raw,ln_ufp_size_10s_ma_spec_label_raw,ln_noise_10s_ma_image_label_raw,ln_noise_10s_ma_spec_label_raw,ufp_num_10s_ma_image_label_raw,...,ufp_num_10s_ma_spec_label_quartile,ufp_num_10s_ma_image_label_quartile,ufp_size_10s_ma_spec_label_quartile,ufp_size_10s_ma_image_label_quartile,image_extension,audio_extension,image_name,audio_name,pair_pm25,set
0,2019-04-23T10:20:27Z,archived files no longer used/GoPro Model/data...,archived files no longer used/GoPro Model/data...,9.744668,9.744668,3.09874,3.09874,4.319752,4.319752,17063.0,...,,,,,,,,,,trn
1,2019-04-23T10:20:28Z,archived files no longer used/GoPro Model/data...,archived files no longer used/GoPro Model/data...,9.710145,9.710145,3.119276,3.119276,4.309322,4.309322,16484.0,...,,,,,,,,,,trn
2,2019-04-23T10:20:29Z,archived files no longer used/GoPro Model/data...,archived files no longer used/GoPro Model/data...,9.683153,9.683153,3.144583,3.144583,4.327702,4.327702,16045.0,...,,,,,,,,,,trn
3,2019-04-23T10:20:30Z,archived files no longer used/GoPro Model/data...,archived files no longer used/GoPro Model/data...,9.583558,9.583558,3.167583,3.167583,4.320018,4.320018,14524.0,...,,,,,,,,,,trn
4,2019-04-23T10:20:31Z,archived files no longer used/GoPro Model/data...,archived files no longer used/GoPro Model/data...,9.509259,9.509259,3.184698,3.184698,4.303119,4.303119,13484.0,...,,,,,,,,,,trn
5,2019-04-23T10:20:32Z,archived files no longer used/GoPro Model/data...,archived files no longer used/GoPro Model/data...,9.479069,9.479069,3.197448,3.197448,4.27736,4.27736,13083.0,...,,,,,,,,,,trn
6,2019-04-23T10:20:33Z,archived files no longer used/GoPro Model/data...,archived files no longer used/GoPro Model/data...,9.422544,9.422544,3.21165,3.21165,4.273606,4.273606,12364.0,...,,,,,,,,,,trn
7,2019-04-23T10:20:34Z,archived files no longer used/GoPro Model/data...,archived files no longer used/GoPro Model/data...,9.399638,9.399638,3.223664,3.223664,4.26,4.26,12084.0,...,,,,,,,,,,trn
8,2019-04-23T10:20:35Z,archived files no longer used/GoPro Model/data...,archived files no longer used/GoPro Model/data...,9.403107,9.403107,3.235536,3.235536,4.227709,4.227709,12126.0,...,,,,,,,,,,trn
9,2019-04-23T10:20:36Z,archived files no longer used/GoPro Model/data...,archived files no longer used/GoPro Model/data...,9.406976,9.406976,3.241029,3.241029,4.200804,4.200804,12173.0,...,,,,,,,,,,trn


In [10]:
# Number of remaining metadata rows for each value of the `set` column
metadata["set"].value_counts()

trn        327094
ext_tst     23233
val         20073
tst         19910
Name: set, dtype: int64

In [11]:
#Generate Predictions and Add to Database

# Input preprocessing functions bundled with the ResNet50 and Xception Keras applications
architecture_preprocessing_resnet50 = K.applications.resnet50.preprocess_input
architecture_preprocessing_xception = K.applications.xception.preprocess_input

# Test-time generators: only the architecture-specific preprocessing is applied,
# both flips are explicitly switched off.
_no_flip = dict(horizontal_flip=False, vertical_flip=False)

generator_tst_resnet50 = K.preprocessing.image.ImageDataGenerator(
    preprocessing_function=architecture_preprocessing_resnet50,
    **_no_flip)

generator_tst_xception = K.preprocessing.image.ImageDataGenerator(
    preprocessing_function=architecture_preprocessing_xception,
    **_no_flip)

# Settings shared by all four iterators below: no labels are produced
# (class_mode=None), files are loaded as 256x256 RGB in batches of 64, and
# shuffling is off so batches follow the row order of the dataframe passed in.
_flow_options = dict(class_mode=None,
                     target_size=(256, 256),
                     color_mode='rgb',
                     batch_size=64,
                     shuffle=False)

# ResNet50 preprocessing, files listed in 'audio_path'
test_generator_resnet50_audio = generator_tst_resnet50.flow_from_dataframe(
    dataframe=metadata[['audio_path']].reset_index(drop=True),
    x_col='audio_path',
    **_flow_options)

# ResNet50 preprocessing, files listed in 'image_path'
test_generator_resnet50_image = generator_tst_resnet50.flow_from_dataframe(
    dataframe=metadata[['image_path']].reset_index(drop=True),
    x_col='image_path',
    **_flow_options)

# Xception preprocessing, files listed in 'audio_path'
test_generator_xception_audio = generator_tst_xception.flow_from_dataframe(
    dataframe=metadata[['audio_path']].reset_index(drop=True),
    x_col='audio_path',
    **_flow_options)

# Xception preprocessing, files listed in 'image_path'
test_generator_xception_image = generator_tst_xception.flow_from_dataframe(
    dataframe=metadata[['image_path']].reset_index(drop=True),
    x_col='image_path',
    **_flow_options)

                                                                       
######### - Generate Predictions for Best CNN Models - ##############


def _predict_all_rows(model, generator, n_rows):
    """Run ``model`` over every batch of ``generator`` and return one value per row.

    Parameters
    ----------
    model
        Loaded Keras model; only its ``predict`` method is used.
    generator
        Iterator returned by ``flow_from_dataframe``. Its ``samples`` and
        ``batch_size`` attributes are read to size the prediction pass.
    n_rows : int
        Number of rows of the dataframe the predictions will be attached to.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``n_rows``.

    Raises
    ------
    ValueError
        If the generator does not hold exactly ``n_rows`` samples, or if the
        model does not return exactly one value per row.
    """
    # flow_from_dataframe validates filenames and can drop rows it rejects, so the
    # iterator may be shorter than the dataframe. Check before the (long)
    # prediction pass instead of failing on the column assignment afterwards.
    if generator.samples != n_rows:
        raise ValueError(
            "Generator holds {} samples but the dataframe has {} rows; "
            "predictions could not be aligned row by row.".format(generator.samples, n_rows))

    # Enough steps to cover every sample, including a final partial batch
    n_steps = int(np.ceil(generator.samples / generator.batch_size))
    predictions = np.asarray(model.predict(x=generator, steps=n_steps))

    # Presumably predict() returns shape (n_rows, 1) for these single-output
    # models -- TODO confirm. Flatten so a plain 1-D column is stored.
    if predictions.size != n_rows:
        raise ValueError(
            "Expected one prediction per row ({} values), got array of shape {}.".format(
                n_rows, predictions.shape))
    return predictions.reshape(n_rows)


#Best model for log(ufp_num) based on images:  ln_ufp_num, image, ResNet50, raw
ln_ufp_num_ResNet50_image_raw_model = load_model('model_development/models/ln_ufp_num_10s_ma_image_label_raw,ResNet50_linear_Nadam,image_path,GoPro_Training_Only_Combined_IMAGINE_v2_GoPro.hdf5')
metadata['ln_ufp_num_ResNet50_image_raw_prediction'] = _predict_all_rows(
    ln_ufp_num_ResNet50_image_raw_model, test_generator_resnet50_image, len(metadata))

#Best model for log(ufp_num) based on audio: ln_ufp_num, audio, ResNet50, raw
ln_ufp_num_ResNet50_audio_raw_model = load_model('model_development/models/ln_ufp_num_10s_ma_spec_label_raw,ResNet50_linear_Nadam,audio_path,GoPro_Training_Only_Combined_IMAGINE_v2_GoPro.hdf5')
metadata['ln_ufp_num_ResNet50_audio_raw_prediction'] = _predict_all_rows(
    ln_ufp_num_ResNet50_audio_raw_model, test_generator_resnet50_audio, len(metadata))

#Best model for log(ufp_size) based on images: ln_ufp_size, image, ResNet50, raw
ln_ufp_size_ResNet50_image_raw_model = load_model('model_development/models/ln_ufp_size_10s_ma_image_label_raw,ResNet50_linear_Nadam,image_path,GoPro_Training_Only_Combined_IMAGINE_v2_GoPro.hdf5')
metadata['ln_ufp_size_ResNet50_image_raw_prediction'] = _predict_all_rows(
    ln_ufp_size_ResNet50_image_raw_model, test_generator_resnet50_image, len(metadata))

#Best model for log(ufp_size) based on audio: ln_ufp_size, audio, Xception, raw
ln_ufp_size_Xception_audio_raw_model = load_model('model_development/models/ln_ufp_size_10s_ma_spec_label_raw,Xception_linear_Nadam,audio_path,GoPro_Training_Only_Combined_IMAGINE_v2_GoPro.hdf5')
metadata['ln_ufp_size_Xception_audio_raw_prediction'] = _predict_all_rows(
    ln_ufp_size_Xception_audio_raw_model, test_generator_xception_audio, len(metadata))

#Best model for log(noise) based on images: ln_noise, image, ResNet50, raw
ln_noise_ResNet50_image_raw_model = load_model('model_development/models/ln_noise_10s_ma_image_label_raw,ResNet50_linear_Nadam,image_path,GoPro_Training_Only_Combined_IMAGINE_v2_GoPro.hdf5')
metadata['ln_noise_ResNet50_image_raw_prediction'] = _predict_all_rows(
    ln_noise_ResNet50_image_raw_model, test_generator_resnet50_image, len(metadata))

#Best model for log(noise) based on audio: ln_noise, audio, Xception, raw
ln_noise_Xception_audio_raw_model = load_model('model_development/models/ln_noise_10s_ma_spec_label_raw,Xception_linear_Nadam,audio_path,GoPro_Training_Only_Combined_IMAGINE_v2_GoPro.hdf5')
metadata['ln_noise_Xception_audio_raw_prediction'] = _predict_all_rows(
    ln_noise_Xception_audio_raw_model, test_generator_xception_audio, len(metadata))

# Write the dataframe, now carrying every prediction column, to CSV (row index omitted)
predictions_csv_path = 'compiled data/metadata_gopro_training_only_UFP_V2_V3_04042024_predictions_added.csv'
metadata.to_csv(predictions_csv_path, index=False)





Found 390310 validated image filenames.
Found 390310 validated image filenames.
Found 390310 validated image filenames.
Found 390310 validated image filenames.
