# Evaluate results on Test Set

In [2]:
import pandas as pd
import re
import numpy as np
import nltk
from gensim.models import Word2Vec
from nltk.corpus import stopwords
import gensim
from collections import Counter
# English stop-word list from NLTK (the NLTK 'stopwords' corpus must be available locally).
stop_words = stopwords.words('english')
# Counter is a dict subclass, so `word not in stopwords_dict` is a hash lookup, not a list scan.
stopwords_dict = Counter(stop_words)

In [3]:
# Load the cleaned test set; `df` is the source of every feature built in this notebook.
df = pd.read_csv('cleaned_test_data.csv')

## Apply Word2Vec on Test Set

### Description columns

In [3]:
# Load the description Word2Vec model saved as "TrainWord2vecDescription.model"
# (presumably fitted on the training descriptions -- confirm against the training notebook).
model_description = Word2Vec.load("TrainWord2vecDescription.model")

In [4]:
# Normalise description_id1: replace each run of non-word characters with one space, then lowercase.
description_id1 = [
    re.sub(r'\W+', ' ', raw_text).lower()
    for raw_text in df['description_id1']
]

In [1]:
# Split every cleaned description_id1 string into word tokens with NLTK.
description_id1 = list(map(nltk.word_tokenize, description_id1))

In [None]:
# Drop English stop words from each description_id1 token list (the outer list is updated in place).
description_id1[:] = [
    [token for token in tokens if token not in stopwords_dict]
    for tokens in description_id1
]

In [None]:
# Normalise description_id2: replace each run of non-word characters with one space, then lowercase.
description_id2 = [
    re.sub(r'\W+', ' ', raw_text).lower()
    for raw_text in df['description_id2']
]

In [None]:
# Split every cleaned description_id2 string into word tokens with NLTK.
description_id2 = list(map(nltk.word_tokenize, description_id2))

In [None]:
# Drop English stop words from each description_id2 token list (the outer list is updated in place).
description_id2[:] = [
    [token for token in tokens if token not in stopwords_dict]
    for tokens in description_id2
]

In [None]:
# Stack both token corpora (id1 rows first, then id2 rows) into one list of sentences.
description = [*description_id1, *description_id2]

In [None]:
# Extend the loaded model's vocabulary with words from the test descriptions
# (update=True adds to the existing vocabulary instead of rebuilding it).
model_description.build_vocab(description, update=True)

In [None]:
# Continue training the model for one epoch on the test descriptions.
# NOTE(review): total_examples relies on corpus_count having been refreshed by the preceding
# build_vocab(..., update=True) call -- confirm for the installed gensim version.
# NOTE(review): this fine-tunes the embeddings on test text -- verify the downstream classifier
# was trained on vectors produced the same way.
model_description.train(description, total_examples=model_description.corpus_count ,epochs=1)

In [None]:
# Save the updated model under a separate file name ("TestWord2vecDescription.model").
model_description.save("TestWord2vecDescription.model")

#### Description_id1 column 

In [None]:
# Build one vector per description_id1 row by averaging its word vectors, so that description
# length does not dominate and the comparison reflects word similarity.
# Rows with no usable tokens get a vector of ones, which avoids a division by zero here and a
# zero norm in the cosine-similarity step.
description_wv = model_description.wv
# Ask the model for its dimensionality instead of hard-coding 100.
description_vector_size = description_wv.vector_size
description_1_vector_sums = []
for tokens in description_id1:
    # Skip tokens the model has no vector for rather than raising KeyError.
    vec = [description_wv[word] for word in tokens if word in description_wv]
    if vec:
        description_1_vector_sums.append(sum(vec) / len(vec))
    else:
        description_1_vector_sums.append(np.ones(description_vector_size))

In [23]:
# Stack the per-row vectors into one array (one row per entry of description_1_vector_sums).
test_d1np = np.asarray(description_1_vector_sums)

In [25]:
# Sanity check: number of description_id1 rows that were vectorised.
len(test_d1np)

215438

In [27]:
# Persist the vectors; the cosine-similarity step reloads them from 'test_d1np.npy'.
np.save('test_d1np.npy' , test_d1np)

#### Description_id2 column 

In [22]:
# Build one vector per description_id2 row by averaging its word vectors, so that description
# length does not dominate and the comparison reflects word similarity.
# Rows with no usable tokens get a vector of ones, which avoids a division by zero here and a
# zero norm in the cosine-similarity step.
description_wv = model_description.wv
# Ask the model for its dimensionality instead of hard-coding 100.
description_vector_size = description_wv.vector_size
description_2_vector_sums = []
for tokens in description_id2:
    # Skip tokens the model has no vector for rather than raising KeyError.
    vec = [description_wv[word] for word in tokens if word in description_wv]
    if vec:
        description_2_vector_sums.append(sum(vec) / len(vec))
    else:
        description_2_vector_sums.append(np.ones(description_vector_size))

In [28]:
# Stack the per-row vectors into one array (one row per entry of description_2_vector_sums).
test_d2np = np.asarray(description_2_vector_sums)

In [29]:
# Sanity check: number of description_id2 rows that were vectorised.
len(test_d2np)

215438

In [30]:
# Persist the vectors; the cosine-similarity step reloads them from 'test_d2np.npy'.
np.save('test_d2np.npy' , test_d2np)

### Title columns

In [39]:
# Load the title Word2Vec model saved as "TrainWord2vecTitle.model"
# (presumably fitted on the training titles -- confirm against the training notebook).
model_title = Word2Vec.load("TrainWord2vecTitle.model")

In [31]:
# Normalise title_id1: lowercase first, then replace each run of non-word characters with one space.
title_1 = [
    re.sub(r'\W+', ' ', raw_title.lower())
    for raw_title in df['title_id1']
]

In [32]:
# Split every cleaned title_id1 string into word tokens with NLTK.
title_1 = list(map(nltk.word_tokenize, title_1))

In [33]:
# Drop English stop words from each title_1 token list (the outer list is updated in place).
title_1[:] = [
    [token for token in tokens if token not in stopwords_dict]
    for tokens in title_1
]

In [34]:
# Normalise title_id2: lowercase first, then replace each run of non-word characters with one space.
title_2 = [
    re.sub(r'\W+', ' ', raw_title.lower())
    for raw_title in df['title_id2']
]

In [35]:
# Split every cleaned title_id2 string into word tokens with NLTK.
title_2 = list(map(nltk.word_tokenize, title_2))

In [36]:
# Drop English stop words from each title_2 token list (the outer list is updated in place).
title_2[:] = [
    [token for token in tokens if token not in stopwords_dict]
    for tokens in title_2
]

In [37]:
# Stack both title token corpora (title_1 rows first, then title_2 rows) into one list of sentences.
title = [*title_1, *title_2]

In [38]:
# Sanity check: the combined corpus should hold len(title_1) + len(title_2) sentences.
len(title)

430876

In [42]:
# Extend the loaded model's vocabulary with words from the test titles
# (update=True adds to the existing vocabulary instead of rebuilding it).
model_title.build_vocab(title, update=True)

In [43]:
# Continue training the model for one epoch on the test titles.
# NOTE(review): total_examples relies on corpus_count having been refreshed by the preceding
# build_vocab(..., update=True) call -- confirm for the installed gensim version.
# NOTE(review): this fine-tunes the embeddings on test text -- verify the downstream classifier
# was trained on vectors produced the same way.
model_title.train(title, total_examples=model_title.corpus_count ,epochs=1)

(1936111, 2900671)

In [45]:
# Save the updated model under a separate file name ("TestWord2vecTitle.model").
model_title.save("TestWord2vecTitle.model")

#### Apply model on Title_id1 column 

In [46]:
# Build one vector per title_id1 row by averaging its word vectors, so that title length
# does not dominate and the comparison reflects word similarity.
# A title with no usable tokens gets an explicit zero vector of the model's dimensionality
# (dividing an empty sum by 1 would give the scalar 0.0 instead, leaving the list ragged and
# breaking the array conversion that follows).
# NOTE: a zero vector has zero norm, so its cosine similarity comes out as NaN downstream.
title_wv = model_title.wv
title_1_vector_sums = []
for tokens in title_1:
    # Skip tokens the model has no vector for rather than raising KeyError.
    vec = [title_wv[word] for word in tokens if word in title_wv]
    if vec:
        title_1_vector_sums.append(sum(vec) / len(vec))
    else:
        title_1_vector_sums.append(np.zeros(title_wv.vector_size))

In [47]:
# Stack the per-row vectors into one array (one row per entry of title_1_vector_sums).
test_t1np = np.asarray(title_1_vector_sums)

In [48]:
# Sanity check: number of title_id1 rows that were vectorised.
len(test_t1np)

215438

In [49]:
# Persist the vectors; the cosine-similarity step reloads them from 'test_t1np.npy'.
np.save('test_t1np.npy' , test_t1np)

#### Apply model on Title_id2 column 

In [50]:
# Build one vector per title_id2 row by averaging its word vectors, so that title length
# does not dominate and the comparison reflects word similarity.
# A title with no usable tokens gets an explicit zero vector of the model's dimensionality
# (dividing an empty sum by 1 would give the scalar 0.0 instead, leaving the list ragged and
# breaking the array conversion that follows).
# NOTE: a zero vector has zero norm, so its cosine similarity comes out as NaN downstream.
title_wv = model_title.wv
title_2_vector_sums = []
for tokens in title_2:
    # Skip tokens the model has no vector for rather than raising KeyError.
    vec = [title_wv[word] for word in tokens if word in title_wv]
    if vec:
        title_2_vector_sums.append(sum(vec) / len(vec))
    else:
        title_2_vector_sums.append(np.zeros(title_wv.vector_size))

In [51]:
# Stack the per-row vectors into one array (one row per entry of title_2_vector_sums).
test_t2np = np.asarray(title_2_vector_sums)

In [52]:
# Sanity check: number of title_id2 rows that were vectorised.
len(test_t2np)

215438

In [53]:
# Persist the vectors; the cosine-similarity step reloads them from 'test_t2np.npy'.
np.save('test_t2np.npy' , test_t2np)

## Prepare Features

In [4]:
# List the columns available in the test data.
df.keys()

Index(['price_id1', 'bedrooms_id1', 'bathrooms_id1', 'totalArea_id1',
       'price_id2', 'bedrooms_id2', 'bathrooms_id2', 'totalArea_id2',
       'apartment_dummy_1', 'house_dummy_1', 'plot_dummy_1',
       'investment_dummy_1', 'other_dummy_1', 'apartment_dummy_2',
       'house_dummy_2', 'plot_dummy_2', 'investment_dummy_2', 'other_dummy_2',
       'target', 'title_id1', 'title_id2', 'description_id1',
       'description_id2'],
      dtype='object')

### Numeric Features (differences)

In [5]:
def _abs_gap(first_col, second_col):
    """Return the row-wise absolute difference between two columns of ``df``."""
    return (df[first_col] - df[second_col]).abs()


# Listing attributes: how far apart the two listings of each pair are.
price_difference = _abs_gap('price_id1', 'price_id2')
bedroom_difference = _abs_gap('bedrooms_id1', 'bedrooms_id2')
bathroom_difference = _abs_gap('bathrooms_id1', 'bathrooms_id2')
area_difference = _abs_gap('totalArea_id1', 'totalArea_id2')

# Property-type dummies: the difference is 0 when both listings agree on the flag.
apartment_dummy_difference = _abs_gap('apartment_dummy_1', 'apartment_dummy_2')
house_dummy_difference = _abs_gap('house_dummy_1', 'house_dummy_2')
plot_dummy_difference = _abs_gap('plot_dummy_1', 'plot_dummy_2')
investment_dummy_difference = _abs_gap('investment_dummy_1', 'investment_dummy_2')
other_dummy_difference = _abs_gap('other_dummy_1', 'other_dummy_2')

### Cosine Similarity

In [6]:
from numpy import dot
from numpy.linalg import norm

In [7]:
# Description columns

# Reload the averaged description vectors saved earlier in this notebook.
test_d1np = np.load('test_d1np.npy')
test_d2np = np.load('test_d2np.npy')

# Row-wise cosine similarity: dot product divided by the product of the Euclidean norms.
# A row whose vector has zero norm would yield NaN here.
test_description_cos_similarity = [
    np.dot(first_vec, test_d2np[row]) / (norm(first_vec) * norm(test_d2np[row]))
    for row, first_vec in enumerate(test_d1np)
]

In [8]:
# Title columns

# Reload the averaged title vectors saved earlier in this notebook.
test_t1np = np.load('test_t1np.npy')
test_t2np = np.load('test_t2np.npy')

# Row-wise cosine similarity: dot product divided by the product of the Euclidean norms.
# A row whose vector has zero norm would yield NaN here.
test_title_cos_similarity = [
    np.dot(first_vec, test_t2np[row]) / (norm(first_vec) * norm(test_t2np[row]))
    for row, first_vec in enumerate(test_t1np)
]

In [9]:
# Start from an empty frame; feature columns are added to it one at a time.
features = pd.DataFrame()

In [10]:
# Add the numeric difference features, keeping this exact column order
# (the model input is later taken positionally from the frame).
for column_name, column_values in (
    ('price_difference', price_difference),
    ('bedroom_difference', bedroom_difference),
    ('bathroom_difference', bathroom_difference),
    ('area_difference', area_difference),
    ('apartment_dummy_difference', apartment_dummy_difference),
    ('house_dummy_difference', house_dummy_difference),
    ('plot_dummy_difference', plot_dummy_difference),
    ('investment_dummy_difference', investment_dummy_difference),
    ('other_dummy_difference', other_dummy_difference),
):
    features[column_name] = column_values

In [11]:
# Attach the text-similarity features. These are plain Python lists, so they are assigned by
# position and must be in the same row order as df.
features['description_cos_similarity'] = test_description_cos_similarity
features['title_cos_similarity'] = test_title_cos_similarity

In [12]:
# Add the label last, so that it is the final column of the frame.
features['target']  = df['target']

In [13]:
# Sanity check: expect 12 columns (eleven features plus target).
features.shape

(215438, 12)

In [14]:
# Inspect the dtype of every column.
features.dtypes

price_difference               float64
bedroom_difference             float64
bathroom_difference            float64
area_difference                float64
apartment_dummy_difference     float64
house_dummy_difference         float64
plot_dummy_difference          float64
investment_dummy_difference    float64
other_dummy_difference         float64
description_cos_similarity     float64
title_cos_similarity           float64
target                           int64
dtype: object

In [15]:
# Feature matrix: every column except the last one ('target' was added last).
X = features.iloc[:,:-1].values
# Ground-truth labels.
y = features['target'].values

In [16]:
# Preview the feature columns (everything except the trailing target column).
features.iloc[:,:-1]

Unnamed: 0,price_difference,bedroom_difference,bathroom_difference,area_difference,apartment_dummy_difference,house_dummy_difference,plot_dummy_difference,investment_dummy_difference,other_dummy_difference,description_cos_similarity,title_cos_similarity
0,26400.0,0.0,0.0,36.723658,0.0,0.0,0.0,0.0,0.0,0.469201,0.668713
1,36400.0,0.0,0.0,5.745617,0.0,0.0,0.0,0.0,0.0,0.706801,0.827872
2,10000.0,0.0,0.0,19.926298,0.0,0.0,0.0,0.0,0.0,0.401719,0.668713
3,10000.0,0.0,0.0,29.548720,0.0,0.0,0.0,0.0,0.0,0.550554,0.686827
4,10000.0,0.0,0.0,1.603020,0.0,0.0,0.0,0.0,0.0,0.548579,0.787015
...,...,...,...,...,...,...,...,...,...,...,...
215433,100000.0,0.0,0.0,107.000000,0.0,0.0,0.0,0.0,0.0,0.947415,0.413101
215434,82000.0,0.0,0.0,73.651286,0.0,0.0,0.0,0.0,0.0,0.097064,0.129680
215435,85000.0,0.0,0.0,0.754105,0.0,0.0,0.0,0.0,0.0,-0.106588,0.362659
215436,20000.0,0.0,0.0,10.000000,0.0,0.0,0.0,0.0,0.0,0.491197,0.716594


## Apply HistGradientBoosting Model

In [17]:
 # explicitly require this experimental feature
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
# now you can import normally from ensemble
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import confusion_matrix
import pickle

In [18]:
# Load the pickled classifier from disk. The context manager closes the file handle, which a
# bare open(...) call would leave open.
# NOTE: pickle.load can execute arbitrary code -- only unpickle files from a trusted source.
with open('hist_gradient_boosting_finalized_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [19]:
# Display the loaded estimator and its hyper-parameters.
loaded_model

HistGradientBoostingClassifier(l2_regularization=0.0, learning_rate=0.1,
                               loss='binary_crossentropy', max_bins=255,
                               max_depth=None, max_iter=2000, max_leaf_nodes=31,
                               min_samples_leaf=20, n_iter_no_change=None,
                               random_state=None, scoring=None, tol=1e-07,
                               validation_fraction=0.1, verbose=0,
                               warm_start=False)

In [20]:
# Predict a class label for every row of the test feature matrix with the loaded model.
y_pred = loaded_model.predict(X)

In [21]:
# Overall accuracy as a percentage. Correctly classified rows sit on the diagonal of the
# confusion matrix, so trace / total gives the same value as summing the four cells of a 2x2
# matrix while also working for any number of classes.
c = confusion_matrix(y,y_pred)
correct_cases = (np.trace(c) / c.sum()) * 100
print("% of correct cases predicted is " + str(correct_cases))

% of correct cases predicted is 81.15513512008096
