In [42]:
from pandas import read_csv
from keras.models import Sequential, Model
from keras import regularizers
from keras import optimizers, initializers
from keras import callbacks
import sklearn.metrics
from keras.layers import Activation,Dense, Input, GlobalMaxPooling1D, BatchNormalization, Dropout,Conv1D, MaxPooling1D, Embedding, concatenate,Reshape
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import math
from keras.initializers import Constant
import datetime

In [2]:
# Read the listings table and shuffle its rows with a fixed seed (random_state=0).
df = shuffle(pd.read_csv('./pairs/listings.csv'), random_state=0)
print(df.shape)

# Names of the listing columns grouped together as text columns.
text_cols = [
    'name', 'summary', 'space', 'description', 'neighborhood_overview',
    'notes', 'transit', 'access', 'interaction', 'house_rules',
    'host_name', 'host_about',
]

  interactivity=interactivity, compiler=compiler, result=result)


(59881, 96)


In [3]:
# Build a token -> vector lookup from the GloVe text file. Each line is parsed
# as a token followed by its space-separated coefficients.
embeddings_index = {}
with open( 'glove.6B.50d.txt', encoding='utf-8') as f:
    for line in f:
        # Split only once: the first field is the token, the rest is the vector text.
        word, coefs = line.split(maxsplit=1)
        # Parse the remaining text as single-precision floats ('f').
        coefs = np.fromstring(coefs, 'f', sep=' ')
        embeddings_index[word] = coefs

# Report how many tokens were loaded.
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [4]:
def clean_date(cols):
    """Convert a column of dates into whole hours elapsed since the earliest date.

    Parameters
    ----------
    cols : pandas.Series (or any 1-D input accepted by ``pd.to_datetime``)
        Raw date values; missing entries are allowed.

    Returns
    -------
    numpy.ndarray of shape ``(len(cols), 1)``
        Whole hours (floored) between each date and the reference date. The
        reference is the earliest date in ``cols``, or today if that is earlier.
        Missing dates receive the offset of the mean date.
    """
    dates = pd.to_datetime(cols)

    # The notebook does `import datetime` (the module), so the class must be
    # reached as `datetime.date`; the bare `date.today()` raised NameError.
    today = pd.Timestamp(datetime.date.today())
    earliest = dates.min()  # skips missing values; NaT when every value is missing
    min_date = today if pd.isna(earliest) else min(today, earliest)
    mean_date = dates.mean()

    # Dividing by a one-hour Timedelta yields float hours (NaN for missing dates).
    # Flooring keeps the whole-hour resolution of the former
    # `.astype('timedelta64[h]')` conversion, which current pandas rejects.
    one_hour = pd.Timedelta(hours=1)
    hours = np.floor(np.asarray((dates - min_date) / one_hour, dtype=float))

    if pd.isna(mean_date):
        mean_hours = np.nan
    else:
        mean_hours = np.floor((mean_date - min_date) / one_hour)

    # Impute missing dates with the mean offset and return a column vector.
    hours = np.where(np.isnan(hours), mean_hours, hours)
    return hours.reshape(-1, 1)

def clean_host_response_rate(host_response_rate, num_data):
    """Parse percentage strings such as ``'95%'`` into floats, imputing the mean.

    Parameters
    ----------
    host_response_rate : pandas.Series
        Entries are either percentage strings or float NaN for missing values.
        The series' ``name`` is used in the printed summary line.
    num_data : int
        Length of the output array.

    Returns
    -------
    numpy.ndarray of shape ``(num_data,)``
        Parsed percentages, with missing entries replaced by the mean of the
        parsed ones.
    """
    def _percent(raw):
        # '95%' -> 95.0
        return float(raw.strip('%'))

    # First pass: accumulate the mean over the non-missing entries.
    running_sum = 0
    n_present = 0
    for entry in host_response_rate:
        if isinstance(entry, float):
            continue
        running_sum += _percent(entry)
        n_present += 1

    rates = np.zeros(num_data)
    mean = running_sum / n_present
    print (host_response_rate.name, 'mean is ', mean)

    # Second pass: parse present values, fill missing ones with the mean.
    for idx, entry in enumerate(host_response_rate):
        if isinstance(entry, float):
            # A float entry is expected to be the NaN marker for a missing value.
            assert(math.isnan(entry))
            rates[idx] = mean
        else:
            rates[idx] += _percent(entry)
    return rates

def clean_price(price, num_data):
    """Parse currency strings such as ``'$1,200.00'`` into floats.

    Parameters
    ----------
    price : iterable
        Entries are either currency strings or float NaN for missing values.
    num_data : int
        Length of the output array.

    Returns
    -------
    numpy.ndarray of shape ``(num_data,)``
        Parsed amounts; missing entries are left at 0.
    """
    amounts = np.zeros(num_data)
    for idx, raw in enumerate(price):
        if isinstance(raw, float):
            # A float entry is expected to be the NaN marker for a missing value.
            assert(math.isnan(raw))
            amounts[idx] = 0
            continue
        # Drop the currency sign and thousands separators before converting.
        amounts[idx] += float(raw.strip('$').replace(',', ''))
    return amounts

def check_nan(cols):
    """Return True if any element of ``cols`` is NaN, otherwise False.

    Elements are examined in order and the scan stops at the first NaN.
    """
    return any(np.isnan(value) for value in cols)

def to_np_array_fill_NA_mean(cols):
    """Return ``cols`` as a numpy array with missing values replaced by the mean.

    Parameters
    ----------
    cols : pandas.Series
        Numeric column; its ``name`` is used in the printed summary line.

    Returns
    -------
    numpy.ndarray
        Values of ``cols`` with NaNs filled by the NaN-ignoring mean.
    """
    column_mean = np.nanmean(np.array(cols))
    print (cols.name, 'mean is ', column_mean)
    filled = cols.fillna(column_mean)
    return np.array(filled)

In [5]:
num_data = df.shape[0]

# Numeric columns copied as-is; NaNs are filled with the column mean.
features = [
    'host_listings_count', 'host_total_listings_count', 'latitude', 'longitude',
    'accommodates', 'bathrooms', 'bedrooms', 'beds', 'square_feet',
    'guests_included', 'minimum_nights', 'maximum_nights',
    'availability_30', 'availability_60', 'availability_90', 'availability_365',
    'number_of_reviews', 'review_scores_rating', 'review_scores_accuracy',
    'review_scores_cleanliness', 'review_scores_checkin',
    'review_scores_communication', 'review_scores_location',
    'review_scores_value', 'calculated_host_listings_count', 'reviews_per_month',
]

# Currency-formatted columns; 'price' is last so it ends up in the final row.
price_features = ['security_deposit', 'cleaning_fee', 'extra_people', 'price']

# One row per column: row 0 is host_response_rate, then `features`, then `price_features`.
arr = np.zeros((1 + len(features) + len(price_features), num_data))

host_response_rate = clean_host_response_rate(df['host_response_rate'], num_data)
arr[0] = host_response_rate
print("num_data", num_data)

for i, feature in enumerate(features, start=1):
    column = df[feature]
    if check_nan(column):
        arr[i] = to_np_array_fill_NA_mean(column)
    else:
        arr[i] = np.array(column)

for i, feature in enumerate(price_features, start=len(features) + 1):
    arr[i] = clean_price(df[feature], num_data)

# The last row ('price') becomes the label; the remaining rows are transposed
# into a (num_data, n_features) matrix.
label = arr[-1]
arr = arr[:-1].T
print(arr.shape)

host_response_rate mean is  91.81647347518647
num_data 59881
host_listings_count mean is  8.456983949359477
host_total_listings_count mean is  8.456983949359477
bathrooms mean is  1.108602959618761
bedrooms mean is  1.087790279705081
beds mean is  1.671634284947949
square_feet mean is  383.66568483063327
review_scores_rating mean is  92.82449394024498
review_scores_accuracy mean is  9.574485115957078
review_scores_cleanliness mean is  9.203351713698778
review_scores_checkin mean is  9.688236058191894
review_scores_communication mean is  9.719251568245728
review_scores_location mean is  9.59474379234949
review_scores_value mean is  9.273250779355733
reviews_per_month mean is  1.163733144822638
(59881, 30)


In [6]:

# Load the pre-split non-text feature matrices (X) and targets (y) from CSV.
# Paths are passed straight to np.loadtxt so numpy opens and closes each file
# itself; the former open(...) handles were never closed.
trainX_nontext = np.loadtxt('./model/trainX_nontext.csv', delimiter=',')
trainy_nontext = np.loadtxt('./model/trainy_nontext.csv', delimiter=',')

devX_nontext = np.loadtxt('./model/devX_nontext.csv', delimiter=',')
devy_nontext = np.loadtxt('./model/devy_nontext.csv', delimiter=',')

testX_nontext = np.loadtxt('./model/testX_nontext.csv', delimiter=',')
testy_nontext = np.loadtxt('./model/testy_nontext.csv', delimiter=',')

In [35]:
# Shapes of the training features and targets as they are when this cell starts.
print(trainX_nontext.shape)
print(trainy_nontext.shape)
# Force each target array into a single-column 2-D shape (n, 1).
trainy_nontext = trainy_nontext.reshape((-1,1))
devy_nontext = devy_nontext.reshape((-1,1))
testy_nontext = testy_nontext.reshape((-1,1))
# Shapes after reshaping.
print(trainy_nontext.shape)
print(devy_nontext.shape)

(41909, 381)
(41909, 1)
(41909, 1)
(11974, 1)


In [36]:
# Total and mean character length of the descriptions of rows whose label is > 0.
mask = label > 0
texts = df['description'][mask].values.astype('U')

# Total number of characters across all selected descriptions.
num = sum(len(text) for text in texts)
# Mean length per description. The divisor is derived from the data instead of
# the hard-coded 59870, and the mean is computed once rather than on every
# loop iteration.
n = num / len(texts) if len(texts) else 0
print(num)
print(n)

43228441
722.0384332720895


In [48]:
# Select the descriptions of rows whose label is > 0, as unicode strings.
mask = label > 0
texts = df['description'][mask].values.astype('U')
# Fit a Keras tokenizer capped at 20000 words, then turn each description into
# a sequence of integer word ids.
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
# word_index maps every token seen during fitting to its integer id.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
# Pad/truncate every sequence to exactly 300 ids.
data = pad_sequences(sequences, maxlen=300)

Found 65633 unique tokens.


In [49]:
# Sequential 70% / 20% / 10% train / dev / test split of the padded sequences.
num_data = data.shape[0]
train_num = int(num_data * 0.7)  # end index (exclusive) of the training slice
dev_num = int(num_data * 0.9)    # end index (exclusive) of the dev slice
# Number of rows in the test slice data[dev_num:]. It was previously computed
# as num_data - train_num, which also counted the dev rows.
test_num = num_data - dev_num

trainX_text = data[:train_num]
devX_text = data[train_num:dev_num]
testX_text = data[dev_num:]

# Prepare the embedding matrix: row i holds the GloVe vector of the word with
# tokenizer id i.
num_words = min(20000, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, 50))
for word, i in word_index.items():
    if i >= 20000:
        # Ids beyond the vocabulary cap have no row in the matrix.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in the embedding index stay all-zeros.
        embedding_matrix[i] = embedding_vector

In [50]:
# Shape of the padded sequence matrix produced by pad_sequences.
print(data.shape)

(59870, 300)


In [51]:
# Sanity check: shapes of the three text splits, the full matrix they were
# sliced from, and the row counts they should add up to.
print(trainX_text.shape)
print(devX_text.shape)
print(testX_text.shape)
print(data.shape)
print(len(texts))
print(num_data)

(41909, 300)
(11974, 300)
(5987, 300)
(59870, 300)
59870
59870


In [52]:
# GloVe
# Two-branch regression model: a dense branch over the 381 non-text features
# and a Conv1D branch over frozen GloVe embeddings of the description tokens.
# NOTE(review): the *_nontext arrays are loaded from CSV while the *_text arrays
# are sliced from `data`; this assumes both use the same row order — confirm.
init = initializers.glorot_uniform(seed=0)

# non-text
nontext_input = Input(shape=(381,))
d1 = Dense(300, activation='relu', kernel_initializer=init, kernel_regularizer=regularizers.l2(0.01))(nontext_input)
d2 = Dense(200, activation='relu')(d1)
d3 = Dense(100, activation='relu')(d2)
d4 = Dense(1)(d3)

# text
text_input = Input(shape=(300,), dtype='int32')
embedding_layer = Embedding(num_words,
                            50,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=300,
                            trainable=False)(text_input)
# train a 1D convnet with global maxpooling
x = Conv1D(128, 5)(embedding_layer)
bn = BatchNormalization()(x)
act = Activation('relu')(bn)
dropout = Dropout(0.2)(act)
x = MaxPooling1D(5)(dropout)
x = GlobalMaxPooling1D()(x)
dense1 = Dense(128, kernel_initializer=init, kernel_regularizer=regularizers.l2(0.01))(x)
dense2 = Dense(64, activation='relu')(dense1)
dense3 = Dense(1)(dense2)

# Each branch emits one scalar; a final linear unit combines the two.
merger = concatenate([d4, dense3])

out = Dense(1)(merger)

model = Model(inputs=[nontext_input, text_input], outputs=out)

# "accuracy" was removed from the metrics: it is a classification metric and
# carries no meaning for a continuous target trained with MSE.
model.compile(loss="mean_squared_error", optimizer="adam", metrics=["mse"])
earlystopping = callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=0, verbose=0, mode='auto', baseline=None, restore_best_weights=True)

starttime = datetime.datetime.now()
model.fit(
    [trainX_nontext, trainX_text], trainy_nontext,
    epochs=100, batch_size=200,
    # Keras documents validation_data as a tuple (x_val, y_val).
    validation_data=([devX_nontext, devX_text], devy_nontext),
    callbacks=[earlystopping],
)
endtime = datetime.datetime.now()

train_predict = model.predict([trainX_nontext, trainX_text])

test_predict = model.predict([testX_nontext, testX_text])

mse_train = sklearn.metrics.mean_squared_error(trainy_nontext, train_predict)
mse_test = sklearn.metrics.mean_squared_error(testy_nontext, test_predict)

r2_train = sklearn.metrics.r2_score(trainy_nontext, train_predict)
r2_test = sklearn.metrics.r2_score(testy_nontext, test_predict)

print(mse_train,mse_test,r2_train,r2_test)

print('train score : ',r2_train)
print('test score : ',r2_test)
print('train mse : ', mse_train)
print('test mse : ', mse_test)
# timedelta.seconds is only the seconds component (whole days are dropped);
# total_seconds() reports the full elapsed time.
print("Run time : ", int((endtime - starttime).total_seconds()))

Train on 41909 samples, validate on 11974 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
0.10950135381969013 0.13355662797024417 0.7245206974781864 0.6671167476665048
train score :  0.7245206974781864
test score :  0.6671167476665048
train mse :  0.10950135381969013
test mse :  0.13355662797024417
Run time :  2946


In [44]:
# Load the pre-computed text feature matrices (X) and targets (y) from CSV.
# This rebinds trainX_text / devX_text / testX_text to the loaded arrays.
# Paths are passed straight to np.loadtxt so numpy opens and closes each file
# itself; the former open(...) handles were never closed.
trainX_text = np.loadtxt('./model/trainX_text.csv', delimiter=',')
trainy_text = np.loadtxt('./model/trainy_text.csv', delimiter=',')

devX_text = np.loadtxt('./model/devX_text.csv', delimiter=',')
devy_text = np.loadtxt('./model/devy_text.csv', delimiter=',')

testX_text = np.loadtxt('./model/testX_text.csv', delimiter=',')
testy_text = np.loadtxt('./model/testy_text.csv', delimiter=',')

In [45]:
# Eyeball the text feature matrix loaded from ./model/trainX_text.csv.
print(trainX_text)

[[ 0.74886466 -0.9793375  -0.09167473 ...  1.03600904 -0.30108228
   0.18866085]
 [-0.78240987  1.18475344 -0.0916559  ...  0.75598606 -1.56942359
   0.12484921]
 [-1.08085512  0.80438886 -0.09164898 ... -0.4857522  -0.38721802
  -0.44720533]
 ...
 [-0.48964957 -0.64927438 -0.09167856 ...  1.03885743  1.1014024
  -2.2090919 ]
 [ 0.26539369 -0.63822341 -0.09166126 ... -1.43413554  1.17728637
   0.91319473]
 [-0.5965585  -0.64495683 -0.09163774 ...  0.64721921 -0.78980207
  -0.45136055]]


In [47]:
# Two-branch regression model where both branches are plain dense stacks: one
# over the 381 non-text features, one over the 300-column text feature matrix.
# NOTE(review): assumes the *_nontext and *_text arrays are row-aligned — confirm.
init = initializers.glorot_uniform(seed=0)

# non-text
nontext_input = Input(shape=(381,))
d1 = Dense(300, activation='relu', kernel_initializer=init, kernel_regularizer=regularizers.l2(0.01))(nontext_input)
d2 = Dense(200, activation='relu')(d1)
d3 = Dense(100, activation='relu')(d2)
d4 = Dense(1)(d3)

# text
text_input = Input(shape=(300,))
dense1 = Dense(128, activation='relu', kernel_initializer=init, kernel_regularizer=regularizers.l2(0.01))(text_input)
dense2 = Dense(64, activation='relu')(dense1)
dense3 = Dense(1)(dense2)

# Each branch emits one scalar; a final linear unit combines the two.
merger = concatenate([d4, dense3])

out = Dense(1)(merger)

model = Model(inputs=[nontext_input, text_input], outputs=out)

# "accuracy" was removed from the metrics: it is a classification metric and
# carries no meaning for a continuous target trained with MSE.
model.compile(loss="mean_squared_error", optimizer="adam", metrics=["mse"])
earlystopping = callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=0, verbose=0, mode='auto', baseline=None, restore_best_weights=True)

starttime = datetime.datetime.now()
model.fit(
    [trainX_nontext, trainX_text], trainy_nontext,
    epochs=100, batch_size=200,
    # Keras documents validation_data as a tuple (x_val, y_val).
    validation_data=([devX_nontext, devX_text], devy_nontext),
    callbacks=[earlystopping],
)
endtime = datetime.datetime.now()

# EarlyStopping only acts during training, so it is no longer passed to predict().
train_predict = model.predict([trainX_nontext, trainX_text], batch_size=200)

test_predict = model.predict([testX_nontext, testX_text], batch_size=200)

mse_train = sklearn.metrics.mean_squared_error(trainy_nontext, train_predict)
mse_test = sklearn.metrics.mean_squared_error(testy_nontext, test_predict)

r2_train = sklearn.metrics.r2_score(trainy_nontext, train_predict)
r2_test = sklearn.metrics.r2_score(testy_nontext, test_predict)

print(mse_train,mse_test,r2_train,r2_test)

print('train score : ',r2_train)
print('test score : ',r2_test)
print('train mse : ', mse_train)
print('test mse : ', mse_test)
# timedelta.seconds is only the seconds component (whole days are dropped);
# total_seconds() reports the full elapsed time.
print("Run time : ", int((endtime - starttime).total_seconds()))

Train on 41909 samples, validate on 11974 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
0.10700737021403643 0.11496213579814898 0.730794965696985 0.7134625945466448
train score :  0.730794965696985
test score :  0.7134625945466448
train mse :  0.10700737021403643
test mse :  0.11496213579814898
Run time :  32
