In [1]:
import os
from sys import getsizeof
import gc

import numpy as np
import pandas as pd
import cv2 as cv
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error

In [2]:
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

In [3]:
train_path = "new_train.csv"
image_path = "new_train"
test_path = "test.csv"

In [4]:
class BaseModel:
    def __init__(self, name, model):
        self.name = name
        self.model = model

    def train(self, train_data, val_data, y_train, y_val, batch_size=1000):
        for i in range(0, len(train_data), batch_size):
            start = i
            end = min(i + batch_size, len(train_data))
            print(f'Training {self.name} model on images {start} to {end}...')
    
            train_subset = train_data[start:end]
            val_subset = val_data[start:end]

            checkpoint_path = f"{self.name}_checkpoint.h5"
            checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
            
            self.model.fit(train_subset, y_train, epochs=50, batch_size=16, 
                           validation_data=(val_subset, y_val), 
                           callbacks=[checkpoint], verbose=0)

In [15]:
class Modelling:
    def __init__(self, train_path, test_path, image_path):
        self.train_data = pd.read_csv(train_path)
        self.test_data = pd.read_csv(test_path) 
        self.image_path = image_path
        self.image_amount = len(os.listdir(image_path))
        
        self.X_train_csv, self.X_val_csv, self.X_train_img, self.X_val_img, self.y_train, self.y_val = None, None, None, None, None, None
        self.X_test = None

        self.checkpoint_path = None
        self.checkpoint = None

        self.models = {'NN': 
                           {'function': self.create_NN, 
                            'model': None,
                            'data': None,
                            'history': pd.DataFrame({'loss': [], 'val_loss': []})},  
                       'CNN': 
                           {'function': self.create_CNN, 
                            'model': None,
                            'data': None,
                            'history': pd.DataFrame({'loss': [], 'val_loss': []})}, 
                       'Transfer': 
                           {'function': self.create_Transfer, 
                            'model': None,
                            'data': None,
                            'history': pd.DataFrame({'loss': [], 'val_loss': []})}, 
                       'MultiModal': 
                           {'function': self.create_MultiModal, 
                            'model': None,
                            'data': None,
                            'history': pd.DataFrame({'loss': [], 'val_loss': []})}}

        self.optimizer = keras.optimizers.Adam(learning_rate=0.001)
        self.early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    
    def prepare_data(self, start, end):
        amount_of_images = end - start
        scaler = StandardScaler() 
        
        X = self.train_data.drop(['Price'], axis=1)
        y = self.train_data['Price']
        
        X_scaled = scaler.fit_transform(X)

        images = np.empty((amount_of_images, 512, 512, 3), dtype=np.uint8)
        for i, image in enumerate(os.listdir(self.image_path)[start:end]):
            image = cv.imread(f'{self.image_path}/{image}')
            images[i, ...] = image

        self.X_train_csv, self.X_val_csv, self.X_train_img, self.X_val_img, self.y_train, self.y_val = \
        train_test_split(X_scaled[start:end], images, y[start:end], test_size=0.2, random_state=42)

        self.models['NN']['data'] = (self.X_train_csv, self.X_val_csv)
        self.models['CNN']['data'] = (self.X_train_img, self.X_val_img)
        self.models['Transfer']['data'] = ([self.X_train_img,  self.X_train_csv], [self.X_val_img, self.X_val_csv])
        self.models['MultiModal']['data'] = ([self.X_train_img,  self.X_train_csv], [self.X_val_img, self.X_val_csv])
        
        #Standariseren 
        self.X_test = scaler.transform(self.test_data)

    def create_NN(self):
        self.models['NN']['model'] = keras.Sequential()

        self.models['NN']['model'].add(layers.Input(shape=(6)))
        self.models['NN']['model'].add(layers.Dense(256, activation='relu'))
        self.models['NN']['model'].add(layers.Dropout(0.5))
        self.models['NN']['model'].add(layers.Dense(128, activation='relu'))
        self.models['NN']['model'].add(layers.Dropout(0.3))
        self.models['NN']['model'].add(layers.Dense(64, activation='relu'))
        self.models['NN']['model'].add(layers.Dense(1))
        
        self.models['NN']['model'].compile(optimizer=self.optimizer, loss='mean_absolute_percentage_error')

    def create_CNN(self):
        pass

    def create_Transfer(self):
        pass
        
    def create_MultiModal(self):
        img_input = layers.Input(shape=(512, 512, 3))
        csv_input = layers.Input(shape=(6))
        
        # define layers for image data 
        x_img = layers.experimental.preprocessing.Rescaling(1./255)(img_input)
        x_img = layers.Conv2D(16, 3, padding='same', activation='relu')(x_img)
        x_img = layers.MaxPooling2D()(x_img)
        x_img = layers.Conv2D(32, 3, padding='same', activation='relu')(x_img)
        x_img = layers.MaxPooling2D()(x_img)
        x_img = layers.Conv2D(64, 3, padding='same', activation='relu')(x_img)
        x_img = layers.MaxPooling2D()(x_img)
        x_img = layers.Flatten()(x_img)
        
        # define layers for csv data
        x_csv = layers.Flatten()(csv_input)
        x_csv = layers.Dense(16, activation='relu')(x_csv)
        x_csv = layers.Dense(32, activation='relu')(x_csv)
        x_csv = layers.Dense(64, activation='relu')(x_csv)
        
        # merge layers
        x = layers.concatenate([x_img, x_csv])
        x = layers.Dense(128, activation='relu')(x)
        output = layers.Dense(1)(x)
        
        # make model with 2 inputs and 1 output
        self.models['MultiModal']['model'] = tf.keras.models.Model(inputs=[img_input, csv_input], outputs=output)

        self.models['MultiModal']['model'].compile(optimizer=self.optimizer, loss='mean_absolute_percentage_error')
        
    def train_model(self, model, batch_size=1000):  
        try:
            self.models[model]['function']()
            print('model')
        except:
            print('This model does not exist!')
            
        for i in range(0, 200, batch_size):
            self.models[model]['function']()
            start = i
            end = min(i + batch_size, self.image_amount)
            print(f'Training {model} on images {start} to {end}...')
    
            tf.keras.backend.clear_session()
            
            self.prepare_data(start, end)
            
            self.checkpoint_path = f"{model}_checkpoint.h5"
            self.checkpoint = ModelCheckpoint(self.checkpoint_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

            print(self.models[model]['model'])
            
            self.models[model]['model'].fit(self.models[model]['data'][0], self.y_train, epochs=1, batch_size=16, \
                                            validation_data=(self.models[model]['data'][1], self.y_val), \
                                            callbacks=[self.early_stopping, self.checkpoint], verbose=0)
            gc.collect()

            history = pd.DataFrame(self.models[model]['model'].history.history)
            self.models[model]['history'] = pd.concat([self.models[model]['history'], history], ignore_index=True)
    def evaluate_models(self):
        fig, ax = plt.subplots(1, 2)
        for model in self.models.keys():
            plt.tight_layout()
            ax[0].plot(self.models[model]['history']['loss'], label=f'{model} Train Loss')
            ax[0].set_xlabel('Epochs')
            ax[0].set_ylabel('Mean Absolute Percentage Error (mape)')
            ax[0].legend()
            
            ax[1].plot(self.models[model]['history']['val_loss'], label=f'{model} Val Loss')
            ax[1].set_xlabel('Epochs')
            ax[1].set_ylabel('Mean Absolute Percentage Error (mape)')
            ax[1].legend()
        else:
            print('No models trained...')
            

In [16]:
modelling = Modelling(train_path, test_path, image_path)

In [17]:
modelling.train_model('NN', batch_size=100)
modelling.train_model('MultiModal', batch_size=100)

model
Training NN on images 0 to 100...
<keras.engine.sequential.Sequential object at 0x0000021330FFAA10>

Epoch 1: val_loss improved from inf to 99.99998, saving model to NN_checkpoint.h5
Training NN on images 100 to 200...
<keras.engine.sequential.Sequential object at 0x000002138F0B27D0>

Epoch 1: val_loss improved from inf to 99.99968, saving model to NN_checkpoint.h5
model
Training MultiModal on images 0 to 100...
<keras.engine.functional.Functional object at 0x0000021328819210>

Epoch 1: val_loss improved from inf to 99.85835, saving model to MultiModal_checkpoint.h5


ValueError: Unable to synchronously create dataset (name already exists)

In [None]:
modelling.evaluate_models()