In [None]:
# Optional one-time setup: uncomment to download and unpack the GoogleNews Word2Vec vectors.
#!wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
#!gzip -d /content/GoogleNews-vectors-negative300.bin.gz

In [None]:
import os
import pandas as pd
from gensim.models import KeyedVectors
import utils
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import matplotlib.cm as cm

In [None]:
## Load the GoogleNews Word2Vec vectors from the binary file in the working directory
google_word2vec = KeyedVectors.load_word2vec_format(
    "GoogleNews-vectors-negative300.bin",
    binary=True,
)

## Define Topics

In [None]:
# Topic labels used by the rest of the notebook, kept in alphabetical order.
topics = [
    "condition",
    "lighting",
    "customer service",
    "trash",
    "cleanliness",
    "packages",
    "noise",
    "grounds",
    "landscaping",
    "maintenance",
    "financial",
    "move",
    "pests",
    "covid",
    "apartment",
    "community",
    "security",
]
topics.sort()

In [None]:
# Hand-picked seed keywords per topic. Only some topics have an entry here.
predefined_keywords = {
    "customer service": ["team", "staff", "crew", "communication", "attitude"],
    "security": ["emergency", "locked down", "locked", "parking"],
    "landscaping": ["landscapes", "place"],
    "maintenance": ["location", "maintain", "maintained"],
    "trash": ["recycling"],
    # Fixed spelling: the original "panademic" would never match the real word.
    "covid": ["pandemic", "mask"],
    "apartment": ["carpet", "appliances"],
    "community": ["neighbour", "neighbours", "society"],
    "financial": ["affordable", "money", "fees", "salary"],
}

In [None]:
# Seed every topic's keyword set with the topic name itself.
topics_vs_keywords = {}
for topic in topics:
    topics_vs_keywords[topic] = {topic}

# Merge the hand-picked keywords into the matching topic's set.
for topic, extra_keywords in predefined_keywords.items():
    topics_vs_keywords[topic].update(extra_keywords)


In [None]:
## Read data files

In [None]:
# Load every CSV file found in ./data and stack them into a single frame.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# make the row order reproducible from run to run.
df_frames = []
for file in sorted(os.listdir("./data")):
    if file.endswith(".csv"):
        df_frames.append(pd.read_csv(f"./data/{file}"))

# pd.concat() raises an opaque ValueError on an empty list; fail with a
# message that says what is actually missing.
if not df_frames:
    raise ValueError("No .csv files found in ./data")

# ignore_index=True gives the combined frame one unique 0..n-1 index instead
# of repeating each file's own row labels.
data = pd.concat(df_frames, ignore_index=True)
data.head()

In [None]:
## Keep only the rows whose "ACTUAL REVIEW" is neither null nor an empty string.
# .copy() makes filtered_data an independent frame rather than a slice of
# `data`; new columns are assigned to it afterwards, which would otherwise
# trigger pandas' SettingWithCopyWarning.
filtered_data = data[
    data["ACTUAL REVIEW"].notnull() & (data["ACTUAL REVIEW"] != "")
].copy()
print(f"Size of dataset: {len(filtered_data)}")

### Data preprocessing and preparation

In [None]:
# Lower-case each raw review and pass it through utils.detect_language_and_translate.
filtered_data["processed_review"] = filtered_data["ACTUAL REVIEW"].apply(
    lambda review: utils.detect_language_and_translate(review.lower())
)

# Two cleaned variants of the processed text; the calls differ only in the
# boolean flag handed to utils.preprocess_text (True here, False below).
filtered_data["clean_review"] = filtered_data["processed_review"].apply(
    lambda review: utils.preprocess_text(review, True)
)

filtered_data["clean_actual_review"] = filtered_data["processed_review"].apply(
    lambda review: utils.preprocess_text(review, False)
)

In [None]:
# Build a word cloud from all cleaned reviews joined into one string.
text = filtered_data["clean_review"].values
wc_toxic = WordCloud(
    stopwords=utils.stopwords,
    background_color="black",
)
wc_toxic.generate(" ".join(text))

# Draw the cloud on a large, axis-free figure, recolored with a fixed seed.
plt.figure(figsize=(20, 10))
plt.title("frequent words in reviews", fontsize=20)
plt.axis("off")
plt.imshow(
    wc_toxic.recolor(random_state=17, colormap="viridis"),
    alpha=0.98,
)
plt.show()

In [None]:
# Replace the seed mapping with whatever utils.extract_keywords returns for
# the reviews, the topic list, the Word2Vec model and the current seeds.
topics_vs_keywords = utils.extract_keywords(
    filtered_data,
    topics,
    google_word2vec,
    topics_vs_keywords,
)