# Word2Vec for Title Columns

Goal: Follow the same outline as Word2Vec_Description.ipynb, except this time we create a new Word2Vec model that learns from the title columns.

Outline:
* As in Word2Vec_Description.ipynb, train on the title columns from the original dataset, then map the learned word vectors onto our cleaned dataset (EDA_and_Data_Cleaning.ipynb).

In [1]:
import pandas as pd
import re
import numpy as np
import nltk
from gensim.models import Word2Vec
from nltk.corpus import stopwords
import gensim
from collections import Counter
# English stopword list from NLTK's stopwords corpus.
stop_words = stopwords.words('english')
# Counter is a dict subclass, so `word in stopwords_dict` is a hash lookup
# instead of a linear scan over the stop_words list.
stopwords_dict = Counter(stop_words)

## Build Model

In [2]:
# Load only the two title columns from the cleaned dataset; nothing else is used here.
cols = ['title_id1', 'title_id2']
df = pd.read_csv(
    filepath_or_buffer='cleaned_data.csv',
    usecols=cols,
)

In [3]:
# Preview the first rows to confirm that both title columns were loaded.
df.head()

Unnamed: 0,title_id1,title_id2
0,Exquisite manor house amidst the Tramontana va...,Finca in Puigpunyent (Objektnummer KSV00142)
1,Magnificent Mallorquinian Mansion of XVII cent...,Finca in Puigpunyent (Objektnummer KSV00142)
2,Magnificent Mallorquinian Mansion of XVII cent...,Finca in Puigpunyent (Objektnummer KSV00142)
3,Mallorquinian Mansion of XVII century on the m...,Finca in Puigpunyent (Objektnummer KSV00142)
4,Unique Majorcan Rural Estate in Puigpunyent,Finca in Puigpunyent (Objektnummer KSV00142)


In [4]:
# Clean string data for title_id1 column: lower-case each title and collapse every
# run of non-word characters (punctuation, whitespace, symbols) into a single space.
# read_csv loads a missing title as float NaN, which has no .lower(); such values
# are treated as empty strings so they become empty token lists downstream
# instead of raising AttributeError here.
title_1 = [
    re.sub(r'\W+', ' ', ('' if pd.isna(raw_title) else str(raw_title)).lower())
    for raw_title in df['title_id1']
]

In [5]:
# Split every cleaned title_1 string into a list of word tokens using NLTK.
title_1 = list(map(nltk.word_tokenize, title_1))

In [6]:
# Drop English stopwords from each tokenized title, replacing the entries of title_1 in place.
for idx, tokens in enumerate(title_1):
    title_1[idx] = [token for token in tokens if token not in stopwords_dict]

In [7]:
# Clean string data for title_id2 column: lower-case each title and collapse every
# run of non-word characters (punctuation, whitespace, symbols) into a single space.
# read_csv loads a missing title as float NaN, which has no .lower(); such values
# are treated as empty strings so they become empty token lists downstream
# instead of raising AttributeError here.
title_2 = [
    re.sub(r'\W+', ' ', ('' if pd.isna(raw_title) else str(raw_title)).lower())
    for raw_title in df['title_id2']
]

In [8]:
# Split every cleaned title_2 string into a list of word tokens using NLTK.
title_2 = list(map(nltk.word_tokenize, title_2))

In [9]:
# Drop English stopwords from each tokenized title, replacing the entries of title_2 in place.
for idx, tokens in enumerate(title_2):
    title_2[idx] = [token for token in tokens if token not in stopwords_dict]

In [10]:
# Training corpus: every title_id1 token list followed by every title_id2 token list.
title = [*title_1, *title_2]

In [11]:
# Corpus size check: title concatenates title_1 and title_2, each built with one
# entry per row of df, so this should be twice the number of rows.
len(title)

1005378

In [12]:
# Train a Word2Vec model on the combined title corpus.
# min_count=1 keeps every token in the vocabulary (no frequency cut-off), so the
# model.wv[word] lookups done below for these same titles always find a vector.
model = Word2Vec(sentences=title, min_count=1)

In [13]:
# Persist the trained model to disk so the embeddings can be reused without retraining.
model.save("TrainWord2vecTitle.model")

## Apply the model to the title_id1 & title_id2 columns of the cleaned dataset (EDA_and_Data_Cleaning.ipynb)

### Title_id1 Vectors

In [17]:
# Represent each title_id1 title by the mean of its word vectors. Dividing by the
# number of words keeps title length from dominating, so the comparison focuses on
# word similarity.
# A title can have no tokens left (empty title, or every word was a stopword). It
# then has nothing to average and must contribute a zero *vector* of the embedding
# width: a scalar 0.0 here would make the rows ragged and prevent np.asarray from
# building a regular 2-D (n_titles, vector_size) array.
zero_vector = np.zeros(model.wv.vector_size, dtype=np.float32)  # float32 matches gensim's word vectors
title_1_vector_sums = []
for tokens in title_1:
    word_vectors = [model.wv[word] for word in tokens]
    if word_vectors:
        title_1_vector_sums.append(sum(word_vectors) / len(word_vectors))
    else:
        # Fresh copy per row so no two rows share the same underlying buffer.
        title_1_vector_sums.append(zero_vector.copy())

In [18]:
# Stack the per-title mean vectors (a Python list) into a single NumPy array.
t1np = np.array(title_1_vector_sums)

In [19]:
# Row count check: one vector per title_id1 entry, so this should equal len(title_1).
len(t1np)

502689

In [20]:
# Write the title_id1 vectors to disk in NumPy's .npy format.
np.save('t1np.npy' , t1np)

### Title_id2 Vectors

In [21]:
# Represent each title_id2 title by the mean of its word vectors. Dividing by the
# number of words keeps title length from dominating, so the comparison focuses on
# word similarity.
# A title can have no tokens left (empty title, or every word was a stopword). It
# then has nothing to average and must contribute a zero *vector* of the embedding
# width: a scalar 0.0 here would make the rows ragged and prevent np.asarray from
# building a regular 2-D (n_titles, vector_size) array.
zero_vector = np.zeros(model.wv.vector_size, dtype=np.float32)  # float32 matches gensim's word vectors
title_2_vector_sums = []
for tokens in title_2:
    word_vectors = [model.wv[word] for word in tokens]
    if word_vectors:
        title_2_vector_sums.append(sum(word_vectors) / len(word_vectors))
    else:
        # Fresh copy per row so no two rows share the same underlying buffer.
        title_2_vector_sums.append(zero_vector.copy())

In [22]:
# Stack the per-title mean vectors (a Python list) into a single NumPy array.
t2np = np.array(title_2_vector_sums)

In [23]:
# Row count check: one vector per title_id2 entry, so this should equal len(title_2).
len(t2np)

502689

In [24]:
# Write the title_id2 vectors to disk in NumPy's .npy format.
np.save('t2np.npy' , t2np)