# Notebook to build platform-specific dictionaries

In [None]:
import pandas as pd
import nltk
import itertools
import gensim

from scripts import preprocess_text

### Importing descriptions

In [None]:
# Load the listings CSV and keep only the free-text description and the ward
# label of each listing.
descriptions = pd.read_csv("../../data/airbnb_listings_description/london_listings_description_ward.csv")[['full_description','ward']]
# Trailing expression: displays the first rows as the cell output.
descriptions.head()

In [None]:
# Count listings per ward and keep the wards that have fewer than 5 listings.
# The "ward"/"count" column names are set explicitly from the value_counts()
# Series: the names produced by value_counts().reset_index() differ between
# pandas 1.x ("index"/"ward") and pandas 2.x ("ward"/"count"), so renaming
# them by hand only works on one of the two.
ward_sizes = descriptions['ward'].value_counts()
ward_count = (
    ward_sizes[ward_sizes < 5]
    .rename_axis("ward")
    .reset_index(name="count")
)
ethic_wards = ward_count['ward'].tolist()

# Drop every listing that belongs to one of those wards. A single vectorised
# membership test replaces the per-row Python loop. The index is then reset to
# 0..n-1 because later cells address rows by position.
rows = descriptions.index[descriptions['ward'].isin(ethic_wards)].tolist()
descriptions = descriptions.drop(rows).reset_index(drop=True)

### Helper functions

In [None]:
# Tokens that must never enter the dictionary: single punctuation marks and
# the English stop-word list provided by the NLTK corpus.
punctuation = list(".,)(!:;")
stopwords = nltk.corpus.stopwords.words('english')
def hasNumbers(inputString):
    """Tell whether ``inputString`` contains at least one digit character."""
    for character in inputString:
        if character.isdigit():
            return True
    return False
def verifyWord(word):
    """Return True when ``word`` is allowed in the dictionary.

    A word is rejected if it is listed in the module-level ``stopwords`` or
    ``punctuation`` collections, or if it contains a digit.
    """
    if word in stopwords:
        return False
    if word in punctuation:
        return False
    return not hasNumbers(word)
def dict_filter(word_freq, stopwords):
    """Return the entries of ``word_freq`` whose key is an acceptable word.

    Args:
        word_freq: mapping from token to its frequency.
        stopwords: iterable of tokens to exclude. Previously this argument
            was accepted but ignored; it is now applied in addition to the
            checks done by ``verifyWord``. ``None`` means "no extra words".

    Returns:
        A plain dict with the same iteration order as ``word_freq``,
        restricted to the accepted tokens.
    """
    # A set gives O(1) membership tests while scanning the vocabulary.
    excluded = set(() if stopwords is None else stopwords)
    return {
        word: word_freq[word]
        for word in word_freq
        if word not in excluded and verifyWord(word)
    }

### Get the top 150 tokens

In [None]:
# Concatenate every description into one space-separated string.
# str.join builds the result in a single pass, whereas growing a string with
# += inside a loop can degrade to quadratic time on a large corpus. The result
# is identical: no trailing space, and "" when there are no descriptions.
doc = " ".join(descriptions['full_description'])

In [None]:
# Normalise the concatenated text with the project's preprocess_text helper
# (imported from scripts; its exact transformations are not visible here),
# then split the result into word tokens.
doc = preprocess_text(doc)
tokens = nltk.word_tokenize(doc)

In [None]:
# Count how often each token occurs, then keep only the tokens accepted by
# dict_filter / verifyWord (no stop words, punctuation marks or tokens that
# contain digits).
word_freq = nltk.FreqDist(tokens)
filtered_word_freq = dict_filter(word_freq, stopwords)

In [None]:
# Rank the surviving tokens by descending frequency and keep the first 150.
# filtered_word_freq is a plain dict, so iterating it directly yields tokens
# in insertion order (order of first appearance in the corpus), not by
# frequency. An explicit sort is needed to get the most frequent ones.
# sorted() is stable, so equally frequent tokens keep their first-seen order.
top150 = sorted(filtered_word_freq, key=filtered_word_freq.get, reverse=True)[:150]
print(top150)

### Train Word2Vec on the data

In [None]:
# Cleaning and tokenizing the descriptions
def _tokenize_description(text):
    """Return ``text`` as a list of sentences, each a list of word tokens."""
    return [nltk.word_tokenize(sentence)
            for sentence in nltk.sent_tokenize(preprocess_text(text))]


descriptions_clean = descriptions.copy()
# Replace the whole column in one assignment. Writing cell by cell through
# descriptions_clean['full_description'][i] = ... is chained assignment:
# pandas warns about it and, with copy-on-write enabled, the write never
# reaches the DataFrame.
descriptions_clean['full_description'] = descriptions['full_description'].apply(_tokenize_description)
descriptions_clean.head()

In [None]:
# Getting the corpus of sentences
# Flatten the per-description sentence lists into one list of tokenised
# sentences, keeping the original order.
corpus = [
    sentence
    for description_sentences in descriptions_clean['full_description']
    for sentence in description_sentences
]

In [None]:
# Train a Word2Vec model on the tokenised sentences: skip-gram (sg=1), context
# window of 3, words seen fewer than 5 times are left out of the vocabulary
# (min_count=5).
# NOTE(review): `iter` is the gensim 3.x name for the number of training
# epochs; gensim 4 renamed it to `epochs` — confirm the pinned gensim version.
word2vec_model = gensim.models.Word2Vec(sentences=corpus, window=3, min_count=5, sg=1, iter=30)

### Expand the dictionary

In [None]:
# Expand the seed tokens with every vocabulary word whose similarity to at
# least one seed exceeds 0.75 and that passes verifyWord.
expansion = []
seen = set(top150)  # seeds plus words already added, for O(1) lookups
missing_keys = []
for key in top150:
    # A seed that is not in the model vocabulary (e.g. rarer than min_count)
    # would make most_similar() raise KeyError, so skip and report it.
    if key not in word2vec_model.wv:
        missing_keys.append(key)
        continue
    # topn=None returns the similarity of `key` to every vocabulary word,
    # in the same order as word2vec_model.wv.index2word.
    similarities = word2vec_model.wv.most_similar(key, topn=None)
    # Only visit the indices above the threshold instead of scanning the
    # whole vocabulary in Python.
    for i in (similarities > 0.75).nonzero()[0]:
        val = word2vec_model.wv.index2word[i]
        if val not in seen and verifyWord(val):
            seen.add(val)
            expansion.append(val)
if missing_keys:
    print("Skipped (not in Word2Vec vocabulary):", missing_keys)
print(expansion)

### Output the dictionary to .csv

In [None]:
# Final dictionary: the seed tokens followed by their Word2Vec expansion.
dict_expanded = top150 + expansion

In [None]:
# Store the dictionary as a one-column CSV ("tokens"), without the row index.
df_expanded = pd.DataFrame(data={"tokens": dict_expanded})
df_expanded.to_csv("../../data/dictionary/london.csv",index=False)