In [1]:
import os

import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="3"

import pandas as pd
from tqdm.notebook import tqdm
from PIL import Image
import pandas as pd
import os
import numpy as np

xy_train_df = pd.read_csv('train_xy.csv')
x_test_df = pd.read_csv('test_x.csv')

In [2]:
from sklearn.model_selection import train_test_split

# loading summary: (force convert some of the non-string cell to string)
x_text = xy_train_df.summary.astype('str')

# labels:
y_price = xy_train_df.price

len_price = len(y_price.unique())
print('unique values for price category', len_price, y_price.unique())

# splitting:

x_tr_text, x_vl_text, y_tr_price, y_vl_price = train_test_split(
    x_text,
    y_price,
    test_size=0.2)

print(np.shape(x_tr_text))
print(np.shape(x_vl_text))
print(np.shape(y_tr_price))
print(np.shape(y_vl_price))

unique values for price category 3 [1 0 2]
(6101,)
(1526,)
(6101,)
(1526,)


In [3]:
# preprocess text data

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pprint import pprint

vocab_size = 40000
max_len = 100


# build vocabulary from training set
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(x_tr_text)


def _preprocess(list_of_text):
    return pad_sequences(
        tokenizer.texts_to_sequences(list_of_text),
        maxlen=max_len,
        padding='post',
    )
    

# padding is done inside: 
x_tr_text_id = _preprocess(x_tr_text)
x_vl_text_id = _preprocess(x_vl_text)

print(x_tr_text_id.shape)
print(x_vl_text_id.shape)

(6101, 100)
(1526, 100)


In [4]:
pprint(tokenizer.sequences_to_texts(x_tr_text_id[:5]))

['we are two latin friends pablo and daniel good vibes living together animal '
 "and gay friendly it's probably you see 2 cats and 1 dog in the case you're "
 'alergic to them but anyway they are at home each 2 weeks there is a small '
 'kitchen bathroom and the others 2 rooms occupied for us we like to share '
 'talk and know about the experiences of the travelers we receive in english '
 'french spanish',
 'magnificent condo located on the 23rd floor of tour des canadiens this '
 'luxurious condo offers a stunning east side view of montreal there is direct '
 'acces to lucien allier subway station and the bell center condo is located '
 'within walking distance to the bonaventure subway station windsor train '
 'station ste catherine street old montreal notre dame basilica and queen '
 'marie of the world cathedral mcgill and concordia universities and a variety '
 'of shops movie theaters fine dining',
 'this apartment bright and modernly decorated is located near several '
 'servi

In [5]:
print('total words in the dictionary:', tokenizer.num_words)

total words in the dictionary: 40000


In [6]:
from __future__ import absolute_import, division, print_function, unicode_literals
import collections
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Conv2D, Flatten, Dense, MaxPool2D, GRU
from tensorflow.keras.optimizers import Adam

in_text = keras.Input(batch_shape=(None, max_len))

# text part
embedded = keras.layers.Embedding(tokenizer.num_words, 100)(in_text)
z = GRU(32)(embedded)


# multi-objectives (each is a multi-class classification)
p_price = Dense(len_price, activation='softmax')(z)


model = keras.Model(
    inputs=in_text,
    outputs=p_price,
)


model.compile(
    optimizer=Adam(),
    loss='sparse_categorical_crossentropy',
    metrics='SparseCategoricalAccuracy',
)


model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 100, 100)          4000000   
_________________________________________________________________
gru (GRU)                    (None, 32)                12864     
_________________________________________________________________
dense (Dense)                (None, 3)                 99        
Total params: 4,012,963
Trainable params: 4,012,963
Non-trainable params: 0
_________________________________________________________________


2022-11-04 21:11:45.752816: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-04 21:11:46.441592: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 21015 MB memory:  -> device: 0, name: Quadro RTX 6000, pci bus id: 0000:3e:00.0, compute capability: 7.5


In [7]:
history = model.fit(
    x=x_tr_text_id,
    y=y_tr_price,
    epochs=20,
    batch_size=16,
    validation_data=(x_vl_text_id, y_vl_price),
    callbacks=[
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, )
    ],
    verbose=1
)

Epoch 1/20


2022-11-04 21:11:47.058899: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2022-11-04 21:11:49.131672: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8101


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
