In [51]:
import pandas as pd
import os
import numpy as np
import re
import sys
from matplotlib import pyplot as plt
import time

from gensim.models import Word2Vec

In [28]:
def load_data_df():
    """Load ``./data/master.csv`` and return it with cleaned review text.

    The CSV is read with the ``ISO8859`` encoding and its first column as
    the index; the ``file`` column is dropped.  Every entry of the
    ``review`` column is then normalised:

    * the literal ``<br />`` tag is removed,
    * every character other than ASCII letters, digits, space, ``.``,
      ``-``, a backslash and ``'`` is removed,
    * a space is inserted after each period,
    * runs of whitespace collapse to a single space,
    * the text is lower-cased.

    A percentage progress indicator is written to stdout every 100 rows.

    Returns
    -------
    pandas.DataFrame
        The loaded frame (original index preserved) whose ``review``
        column holds the cleaned text.
    """

    # Compiled once instead of on every row; raw strings avoid the
    # invalid-escape warnings that "\-", "\." and "\s" trigger in
    # ordinary string literals.
    br_tag = re.compile(r"<br />")
    disallowed = re.compile(r"[^A-Za-z0-9 .\-']")
    period = re.compile(r"\.")
    whitespace = re.compile(r"\s+")

    def preprocess(i, text):
        """Clean one review; ``i`` is its position, used for progress only."""
        text = br_tag.sub("", text)
        text = disallowed.sub("", text)
        text = period.sub(". ", text)
        text = whitespace.sub(" ", text)
        if i % 100 == 0:
            sys.stdout.write("\r% 5.2f%%" % (i / data_n * 100))
        return text.lower()

    print("[load_data_df] Loading and preprocessing data...")

    df = pd.read_csv("./data/master.csv",
                     encoding="ISO8859",
                     index_col=0)
    df = df.drop(["file"], axis=1)

    data_n = len(df)
    review_se = df["review"]

    # Assign a plain list so the values are placed positionally.  Wrapping
    # them in a fresh Series would give them a 0..n-1 index, and column
    # assignment would then align on labels, producing NaN (or misplaced
    # text) whenever the CSV index is not exactly 0..n-1.
    df["review"] = [preprocess(i, review)
                    for i, review in enumerate(review_se)]
    sys.stdout.write("\r% 5.2f%%" % (100))

    return df


def get_toy_df(df, div_n, seed):
    """Return a reproducible random subset of ``df``.

    Rows whose ``type`` is ``"train"`` and rows whose ``type`` is
    ``"test"`` are sampled separately, without replacement, keeping
    ``len(split) // div_n`` rows of each.  NumPy's global random state is
    seeded with ``seed`` first, and the train split is drawn before the
    test split.

    Returns a new DataFrame with the sampled test rows followed by the
    sampled train rows.
    """
    np.random.seed(seed)

    sampled = {}
    # Order matters: train must be drawn first to consume the RNG stream
    # in the same sequence for a given seed.
    for split in ("train", "test"):
        part = df[df["type"] == split]
        keep_n = len(part) // div_n
        chosen = np.random.choice(part.index, keep_n, replace=False)
        sampled[split] = part.loc[chosen]

    return pd.concat([sampled["test"], sampled["train"]])


def _plot_word22d(p, model):
    """Draw the vocabulary of ``model`` as labelled points on axes ``p``.

    Every word in ``model.wv.index2word`` is placed at the first two
    components of its vector and annotated with the word itself.  Three
    larger markers are then added: the origin, the mean of all word
    vectors, and - when ``"_"`` is in the vocabulary - the ``"_"`` vector
    in red.
    """
    vocab = model.wv.index2word
    vectors = np.array([model.wv[token] for token in vocab])
    by_axis = vectors.T

    # All words in a single scatter call, then one text label per word.
    p.scatter(by_axis[0], by_axis[1])
    for token, point in zip(vocab, vectors):
        p.annotate(token, point)

    def mark(label, point, **scatter_kw):
        # Large reference marker with a bigger caption.
        p.scatter(point[0], point[1], s=100, **scatter_kw)
        p.annotate(label, point, fontsize=14)

    mark("ORIGIN", [0, 0])
    mark("MEAN", vectors.mean(axis=0))

    if "_" in vocab:
        mark("UNDERSCORE", model.wv["_"], c="red")
        
        
def report_underscore(model1, model2):
    """Show the 2-D word plots of two models side by side.

    ``model1`` is drawn in the left panel and ``model2`` in the right
    panel (whose y-axis is hidden) of a 12x6 figure, which is then
    displayed with ``plt.show()``.
    """
    plt.figure(figsize=[12, 6])

    # Axes rectangles are [left, bottom, width, height] in figure fractions.
    left_ax, right_ax = [
        plt.axes(rect)
        for rect in ([0, 0, 0.48, 1], [0.54, 0, 0.48, 1])
    ]
    right_ax.get_yaxis().set_visible(False)

    for axes, model in ((left_ax, model1), (right_ax, model2)):
        _plot_word22d(axes, model)

    plt.show()

In [3]:
# Read and clean the review table once; `df` is the input to the sampling
# step in the configuration cell.
df = load_data_df()

[load_data_df] Loading and preprocessing data...
 100.00%

In [55]:
# Sub-sampling divisor: get_toy_df keeps len(split) // div_n rows of each
# train/test split, so 1 keeps every row.
div_n = 1

# Word2Vec hyper-parameters.
size = 2  # vector dimensionality; _plot_word22d plots components 0 and 1
window = 5
min_count = 1
workers = os.cpu_count()  # one worker per CPU reported by the OS

# Fixed seed so the same rows are drawn on every run.
toy_df = get_toy_df(df, div_n, 50554145)
sent_se = toy_df["review"]

In [56]:
# Baseline model: each review is tokenised on whitespace and used as one
# training sentence.
start = time.time()

sents = [sent.split() for sent in sent_se]
# `size` comes from the configuration cell rather than a repeated literal,
# so changing the configured dimensionality actually takes effect here.
model1 = Word2Vec(
    sents, size=size, window=window, min_count=min_count, workers=workers
)

# Last expression of the cell: elapsed wall-clock seconds.
time.time()-start

60.431156158447266

In [57]:
# Underscore model: same reviews, but a "_" token is placed before the
# first word, between every pair of words, and after the last word
# (w1 w2 -> _ w1 _ w2 _).
start = time.time()

sents = [("_ "+" _ ".join(sent.split())+" _").split()
         for sent in sent_se]
# `size` comes from the configuration cell rather than a repeated literal,
# so changing the configured dimensionality actually takes effect here.
model2 = Word2Vec(
    sents, size=size, window=window, min_count=min_count, workers=workers
)

# Last expression of the cell: elapsed wall-clock seconds.
time.time()-start

82.98703861236572

In [60]:
# The scatter plots are only drawn when the data was cut down by a factor
# of at least 1000.
if div_n >= 1000:
    report_underscore(model1, model2)

# Vocabulary size of each model, one per line.
print(
    len(model1.wv.index2word),
    len(model2.wv.index2word),
    sep="\n",
)

# Persist both trained models.
model1.save("data/normal_wv")
model2.save("data/underscore_wv")

331511
331512
