In [273]:
import pandas as pd
import os
import numpy as np
import re
import sys
from matplotlib import pyplot as plt
import time
from collections import defaultdict

from gensim.models import Word2Vec

In [274]:
def load_data_df(min_count):
    """Load ``./data/master.csv`` and normalise the text of its ``review`` column.

    The CSV is read with ``encoding="ISO8859"`` using its first column as the
    index, and the ``file`` column is dropped.  Every review is cleaned by
    ``preprocess``, word frequencies are counted over all cleaned reviews,
    and each word occurring fewer than ``min_count`` times is replaced by
    the literal token ``"UNKNOWN"``.  A percentage readout is written to
    stdout during each of the three passes.

    Args:
        min_count: Minimum corpus-wide frequency a word needs in order to be
            kept; rarer words become ``"UNKNOWN"``.

    Returns:
        The loaded DataFrame (without ``file``) whose ``review`` column holds
        the processed text, row for row, under the original index.
    """

    def show_progress(i):
        # Refresh the in-place percentage readout once every 100 rows.
        if i % 100 == 0:
            sys.stdout.write("\r% 5.2f%%" % (i / data_n * 100))

    def show_done():
        # Overwrite the readout with the final value and end the line.
        sys.stdout.write("\r% 5.2f%%\n" % (100))

    def preprocess(i, text):
        # Raw strings keep the regex escapes (\d, \s, \., \-) out of Python's
        # own string-escape handling.
        text = re.sub(r"<br />", " ", text)             # HTML line breaks
        text = re.sub(r"[^A-Za-z0-9 .\-']", "", text)   # drop other characters
        text = re.sub(r"\d+", "00", text)               # any number -> "00"
        text = re.sub(r"-", " ", text)                  # hyphens split words
        text = re.sub(r"\.", " ", text)                 # periods split words
        text = re.sub(r"\s+", " ", text)                # collapse whitespace
        show_progress(i)
        return text.lower()

    def check8convert(i, text):
        # Replace every word below the frequency threshold by "UNKNOWN".
        text_list = ["UNKNOWN" if freq[word] < min_count else word
                     for word in text.split()]
        show_progress(i)
        return " ".join(text_list)

    df = pd.read_csv("./data/master.csv",
                     encoding="ISO8859",
                     index_col=0)
    df = df.drop(["file"], axis=1)
    data_n = len(df)
    freq = defaultdict(int)

    print("[load_data_df] Preprocessing data...")
    reviews = [preprocess(i, review)
               for i, review in enumerate(df["review"])]
    show_done()

    print("[load_data_df] Calculating word frequency...")
    for i, sent in enumerate(reviews):
        for word in sent.split():
            freq[word] += 1
        # Throttled like the other passes instead of writing on every row.
        show_progress(i)
    show_done()

    print("[load_data_df] Converting small-freq-word to 'UNKNOWN'...")
    reviews = [check8convert(i, sent) for i, sent in enumerate(reviews)]
    show_done()

    # Assign positionally (plain list).  Assigning a freshly built Series
    # would align on its 0..n-1 index and produce NaN wherever the CSV index
    # is not exactly that range.
    df["review"] = reviews

    return df


def get_toy_df(df, div_n, seed):
    """Return a random ``1/div_n`` subsample of the train and test rows.

    NumPy's global random state is seeded with ``seed``.  Rows whose
    ``type`` is ``"train"`` are sampled first, then rows whose ``type`` is
    ``"test"``, each without replacement and each keeping
    ``len(rows) // div_n`` rows.

    Args:
        df: DataFrame with a ``type`` column.
        div_n: Integer divisor applied to the size of each split.
        seed: Seed passed to ``np.random.seed``.

    Returns:
        A DataFrame holding the sampled test rows followed by the sampled
        train rows.
    """
    np.random.seed(seed)

    sampled = {}
    # "train" must be drawn before "test" so the random stream is consumed
    # in a fixed order.
    for split in ("train", "test"):
        part = df[df["type"] == split]
        keep_n = len(part) // div_n
        chosen = np.random.choice(part.index, keep_n, replace=False)
        sampled[split] = part.loc[chosen]

    return pd.concat([sampled["test"], sampled["train"]])


def _plot_word22d(p, model):
    """Scatter-plot the first two embedding components of every vocabulary word.

    Each word in ``model.wv.index2word`` is drawn and labelled at the point
    given by components 0 and 1 of its vector.  Larger markers are added for
    the origin, for the mean of all plotted points and, when present in the
    vocabulary, for the ``"_"`` token (red) and the ``"UNKNOWN"`` token
    (pink).

    Args:
        p: Axes-like object providing ``scatter`` and ``annotate``.
        model: Object exposing ``wv.index2word`` and ``wv[word]`` lookup.
    """
    words = model.wv.index2word
    # Keep only the two components that are drawn.  ``annotate`` expects an
    # (x, y) pair, so handing it a full higher-dimensional vector fails for
    # embeddings with more than two components.
    pts = np.array([model.wv[word] for word in words])[:, :2]

    p.scatter(pts[:, 0], pts[:, 1])
    for word, pt in zip(words, pts):
        p.annotate(word, pt)

    origin = [0, 0]
    p.scatter(origin[0], origin[1], s=100)
    p.annotate("ORIGIN", origin, fontsize=14)

    mean_pt = pts.mean(axis=0)
    p.scatter(mean_pt[0], mean_pt[1], s=100)
    p.annotate("MEAN", mean_pt, fontsize=14)

    # Highlight the special tokens if the model learned them.
    vocab = set(words)
    for token, label, colour in (("_", "UNDERSCORE", "red"),
                                 ("UNKNOWN", "UNKNOWN", "pink")):
        if token in vocab:
            xy = model.wv[token][:2]
            p.scatter(xy[0], xy[1], c=colour, s=100)
            p.annotate(label, xy, fontsize=14)
        
        
def report_underscore(model1, model2, word_n):
    """Show two word-vector scatter plots side by side under a shared title.

    Args:
        model1: Model drawn in the left panel via ``_plot_word22d``.
        model2: Model drawn in the right panel via ``_plot_word22d``.
        word_n: Number shown in the "Number of words" title.
    """
    plt.figure(figsize=[12, 6])

    # Left and right panels, each given as an [x, y, width, height] rectangle.
    panels = (
        ([0, 0, 0.48, 0.96], model1),
        ([0.54, 0, 0.48, 0.96], model2),
    )
    for rect, model in panels:
        _plot_word22d(plt.axes(rect), model)

    # A thin axes along the top serves purely as a title banner.
    banner = plt.axes([0, 0.98, 1, 0.04])
    banner.text(0.4, 0, "Number of words: %d" % word_n, fontsize=20)
    banner.set_axis_off()

    plt.show()

In [275]:
# Words seen fewer than ``min_count`` times are mapped to "UNKNOWN" by
# load_data_df; the same threshold is passed to Word2Vec when training.
min_count = 5
df = load_data_df(min_count)

# Word2Vec hyper-parameters: embedding dimensionality and context window.
size, window = 100, 10
# os.cpu_count() returns None when the CPU count cannot be determined;
# fall back to a single worker so ``workers`` is always a usable integer.
workers = os.cpu_count() or 1

# div_n divides the size of each train/test split; 1 keeps every row
# (in shuffled order).  The third argument is the sampling seed.
div_n = 1
toy_df = get_toy_df(df, div_n, 1050554145)
sent_se = toy_df["review"]

[load_data_df] Preprocessing data...
 100.00%
[load_data_df] Calculating word frequency...
 100.00%
[load_data_df] Converting small-freq-word to 'UNKNOWN'...
 100.00%


In [277]:
# Baseline model: plain whitespace-tokenised reviews.
sents = [review.split() for review in sent_se]
start = time.time()
model1 = Word2Vec(
    sents,
    size=size,
    window=window,
    min_count=min_count,
    workers=workers,
)
print("Normal    :", time.time()-start)

# Underscore model: an "_" token is placed before, between and after the
# words of every review before tokenising.
sents = [("_ %s _" % " _ ".join(review.split())).split()
         for review in sent_se]
start = time.time()
model2 = Word2Vec(
    sents,
    size=size,
    window=window,
    min_count=min_count,
    workers=workers,
)
print("Underscore:", time.time()-start)

# The comparison plot is only drawn for heavily subsampled runs.
if div_n >= 1000:
    report_underscore(model1, model2, len(model2.wv.index2word))

Normal    : 52.60772681236267
Underscore: 71.93054366111755


In [284]:
# Persist both trained models next to the input data.
model1.save("./data/normal_wv")
model2.save("./data/underscore_wv")

# Store the preprocessed, sampled reviews as a pickle as well.
toy_df.to_pickle("./data/preprocessed_df")