In [1]:
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd
import os
import PIL
import re
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, models, activations, metrics, optimizers, losses, utils, preprocessing
from tensorflow.keras.preprocessing.image import load_img, img_to_array

2021-12-22 03:57:48.198807: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-22 03:57:48.198831: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


The first big question: which library should we use to read in the images? Some options are:

1. Use matplotlib's `imread`, which loads an image directly as a NumPy array and is installed almost everywhere
2. Use PIL (Pillow), Python's de-facto standard imaging library
3. Use tf.keras.preprocessing.image.ImageDataGenerator
4. Use skimage 

Each lib has its pros and cons. In the end the main focus here will be a CNN, so TF functionality will be important. 
Even for the other ML algorithms it may be wise to use the Convolutional Layers of the CNN to reduce the dimensionality of the data. (Note: TF internally uses Pillow.)

First, let's get familiar with the dataset: look at the dimensions of the images (without reading them in).

In [2]:
SOURCE_DIR = "photoz_images/"

# Collect the (width, height) of every file in SOURCE_DIR.
# The `with` block closes each file handle as soon as its size has been read,
# instead of leaving thousands of handles for the garbage collector to clean up.
imgs = []
for filename in os.listdir(SOURCE_DIR):
    with PIL.Image.open(os.path.join(SOURCE_DIR, filename)) as img:
        imgs.append(img.size)

# see how many images of which size we have
sizes, counts = np.unique(imgs, axis=0, return_counts=True)
sizes, counts

(array([[72, 72]]), array([57720]))

As all images are of the same size, we won't need heavy preprocessing: we can use the original size, since 72x72 is quite manageable. (Let's hope I don't regret saying that later.)

In [3]:
# File names have the form "<ID>_z_<redshift>.jpg". The redshift is a decimal
# number with an optional fractional part and an optional exponent, so
# "0.323337", "1.5e-05" and "1e-05" are all accepted. The dot before "jpg" is
# escaped: left unescaped it matches any character, so a name such as
# "7_z_0.51jpg" would match and be parsed as redshift 0.5.
RE = re.compile(r"^(?P<ID>\d+)_z_(?P<redshift>\d+(?:\.\d*)?(?:[eE][-+]?\d+)?)\.jpg$")


def load_image(source_dir, filename):
    """Load one image and the redshift label encoded in its file name.

    Parameters
    ----------
    source_dir : str or path-like
        Directory containing the image; a trailing separator is optional.
    filename : str
        Bare file name of the form ``<ID>_z_<redshift>.jpg``.

    Returns
    -------
    (numpy.ndarray, float)
        The image as a ``float16`` array divided by 255, and the redshift
        parsed from ``filename``.

    Raises
    ------
    ValueError
        If ``filename`` does not match the expected naming scheme.
    """
    # Validate the name first so a stray file fails fast, with a clear message,
    # before any time is spent decoding the image.
    matches = RE.match(filename)
    if matches is None:
        raise ValueError(
            f"unexpected file name {filename!r}: expected '<ID>_z_<redshift>.jpg'"
        )
    redshift = float(matches.group("redshift"))

    img = img_to_array(load_img(os.path.join(source_dir, filename)), dtype=np.float16) / 255
    return img, redshift


In [4]:
# os.listdir returns entries in arbitrary order; sorting makes row i of
# `images` / `redshifts` refer to the same file on every run.
dirlist = sorted(os.listdir(SOURCE_DIR))

# Preallocate the full dataset: one 72x72 RGB image per file (the size survey
# above found every image to be 72x72) and one redshift label per file.
images = np.empty((len(dirlist), 72, 72, 3), np.float16)
redshifts = np.empty((len(dirlist),), np.float32)
for idx, filename in enumerate(tqdm(dirlist)):
    image, redshift = load_image(SOURCE_DIR, filename)
    images[idx] = image
    redshifts[idx] = redshift

  0%|          | 0/57720 [00:00<?, ?it/s]

In [5]:
# Hold out part of the data for evaluation. The shuffle seed is fixed so that
# the same split is produced after every kernel restart.
X_train, X_test, y_train, y_test = train_test_split(images, redshifts, random_state=42)

In [6]:
# test CNN: three convolution/pooling stages that shrink the 72x72x3 input,
# followed by a small fully connected head with a single linear output
# (the predicted redshift).
model = models.Sequential([
    # convolutional feature extractor
    layers.Conv2D(32, (3, 3), activation=activations.swish, input_shape=(72, 72, 3)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(32, (3, 3), activation=activations.swish),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(16, (3, 3), activation=activations.swish),
    layers.MaxPooling2D((3, 3)),
    # regression head
    layers.Flatten(),
    layers.Dense(60, activation=activations.swish),
    layers.Dropout(0.8),
    layers.Dense(60, activation=activations.swish),
    layers.Dense(1, activation=activations.linear),
])

2021-12-22 03:58:18.966823: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-12-22 03:58:18.966862: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2021-12-22 03:58:18.966895: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (mobile): /proc/driver/nvidia/version does not exist
2021-12-22 03:58:18.968719: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
# Regression setup: minimise the mean squared error with Adam and report the
# mean absolute error as an additional metric.
model.compile(
    loss=losses.MeanSquaredError(),
    optimizer=optimizers.Adam(learning_rate=1e-3),
    metrics=[metrics.MeanAbsoluteError()],
)

In [8]:
# Print the layer-by-layer output shapes and parameter counts of the network.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 70, 70, 32)        896       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 35, 35, 32)       0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 33, 33, 32)        9248      
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 16, 16, 32)       0         
 2D)                                                             
                                                                 
 conv2d_2 (Conv2D)           (None, 14, 14, 16)        4624      
                                                                 
 max_pooling2d_2 (MaxPooling  (None, 4, 4, 16)         0

In [9]:
# Data generator built with default arguments only (no augmentation options
# are passed here); it is used below to feed the training arrays to the model.
generator = preprocessing.image.ImageDataGenerator()

<keras.preprocessing.image.NumpyArrayIterator at 0x7efcf64bad30>

In [11]:
# Train on batches produced by the generator from the training arrays.
# NOTE(review): no `epochs`, batch size or validation data are passed, so the
# Keras defaults apply — confirm that is intended. The returned History object
# is only displayed, not stored.
model.fit(generator.flow(X_train, y_train))



<keras.callbacks.History at 0x7efc241f8e50>

In [16]:
# Spot check: predictions for the first five test images, shown next to their
# true redshifts.
model.predict(X_test[:5]), y_test[:5]

(array([[0.36915752],
        [0.3828466 ],
        [0.05629794],
        [0.12514882],
        [0.18127477]], dtype=float32),
 array([0.323337 , 0.619709 , 0.0361306, 0.076164 , 0.0831686],
       dtype=float32))