In [2]:
import inspect
import os
import re
import sys
import time
from collections import defaultdict

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from gensim.models import Word2Vec

In [3]:
def _plot_word22d(p, model):
    
    words = model.wv.index2word
    pts = np.array([model.wv[word] for word in words])
    T_pts = pts.T

    p.scatter(T_pts[0], T_pts[1])
    for i, word in enumerate(words):
        p.annotate(word, pts[i])
        
    origin = [0, 0]
    p.scatter(origin[0], origin[1], s=100)
    p.annotate("ORIGIN", origin, fontsize=14)
    
    mean_pt = pts.mean(axis=0)
    p.scatter(mean_pt[0], mean_pt[1], s=100)
    p.annotate("MEAN", mean_pt, fontsize=14)
    
    if "_" in words:
        kernel = model.wv["_"]
        p.scatter(kernel[0], kernel[1], c="red", s=100)
        p.annotate("UNDERSCORE", kernel, fontsize=14)
        
    if "UNKNOWN" in words:
        unknown = model.wv["UNKNOWN"]
        p.scatter(unknown[0], unknown[1], c="pink", s=100)
        p.annotate("UNKNOWN", unknown, fontsize=14)
        
        
def report_underscore(model1, model2):
    """Show the 2-D embeddings of two models side by side in one figure.

    ``model1`` is drawn on the left axes and ``model2`` on the right one; a
    thin axes strip above them carries a title with the vocabulary size of
    ``model2``.
    """
    plt.figure(figsize=[12, 6])

    left = plt.axes([0, 0, 0.48, 0.96])
    _plot_word22d(left, model1)

    right = plt.axes([0.54, 0, 0.48, 0.96])
    _plot_word22d(right, model2)

    # Axes used only as a text banner; its frame is hidden below.
    banner = plt.axes([0, 0.98, 1, 0.04])

    # gensim 4 renamed ``index2word`` to ``index_to_key``; accept either.
    vocab = getattr(model2.wv, "index_to_key", None)
    if vocab is None:
        vocab = model2.wv.index2word

    word_n = len(vocab)
    title = "Number of words: %d"%word_n
    banner.text(0.4, 0, title, fontsize=20)
    banner.set_axis_off()

    plt.show()

In [4]:
def load_data_df(path):
    """Read the review CSV at ``path`` and keep only the labelled rows.

    The file is decoded as ISO8859 and its first column becomes the index.
    Rows whose "label" equals "unsup" are dropped; the remaining rows keep
    their original index labels.
    """
    raw = pd.read_csv(path, index_col=0, encoding="ISO8859")
    is_labelled = raw["label"] != "unsup"
    return raw[is_labelled]


def preprocess(df):
    """Return a copy of ``df`` whose "review" column holds normalised text.

    Each review is cleaned in this order: "<br />" becomes a space, every
    character other than letters, digits, space, ".", "-" and "'" is
    removed, each run of digits becomes "00", "-" and "." become spaces,
    whitespace runs collapse to one space, and the result is lower-cased.
    Progress is written to stdout every 100 rows.

    The cleaned texts are written back by position, so the frame's index
    (which need not be 0..n-1, e.g. after filtering rows) is preserved and
    every row receives its own review. The input frame is left untouched.
    """
    data_n = len(df)

    def process(i, text):
        # Raw strings keep the regex escapes (\d, \s, \-) from being read
        # as (invalid) Python string escapes.
        text = re.sub(r"<br />", " ", text)
        text = re.sub(r"[^A-Za-z0-9 .\-']", "", text)
        text = re.sub(r"\d+", "00", text)
        text = re.sub(r"-", " ", text)
        text = re.sub(r"\.", " ", text)
        text = re.sub(r"\s+", " ", text)
        if i%100 == 0:
            percent = i/data_n*100
            sys.stdout.write("\r% 5.2f%%"%(percent))
        return text.lower()

    print("[load_data_df] Preprocessing data...")
    cleaned = [process(i, review)
               for i, review in enumerate(df["review"])]
    sys.stdout.write("\r% 5.2f%%\n"%(100))

    df = df.copy()
    # Reuse df's own index: a Series with a default RangeIndex would be
    # aligned on labels and produce NaN / shifted rows on a filtered frame.
    df["review"] = pd.Series(cleaned, index=df.index)
    return df


def get_freq(df):
    """Count how often each whitespace-separated token occurs in df["review"].

    Returns a ``defaultdict(int)`` mapping token -> number of occurrences
    over all reviews. Progress is written to stdout every 100 reviews.
    """
    total = len(df)
    counts = defaultdict(int)

    print("[load_data_df] Calculating word frequency...")
    for row_no, review in enumerate(df["review"]):
        for token in review.split():
            counts[token] += 1
        if row_no % 100 == 0:
            sys.stdout.write("\r% 5.2f%%" % (row_no / total * 100))
    sys.stdout.write("\r% 5.2f%%\n" % (100))

    return counts

def get_unknown_df(df, freq, min_count):
    """Tokenise each review and replace rare tokens with "UNKNOWN".

    Parameters
    ----------
    df : DataFrame with a "review" column of strings
    freq : mapping token -> occurrence count; tokens missing from it are
        treated as having count 0
    min_count : tokens whose count is below this value become "UNKNOWN"

    Returns a copy of ``df`` whose "review" column holds one list of tokens
    per row. The index of ``df`` is preserved and ``df`` itself is not
    modified.
    """
    def check8convert(sent):
        # .get() avoids inserting unseen tokens into a defaultdict and
        # also lets a plain dict be used as the frequency table.
        return [
            "UNKNOWN" if freq.get(word, 0) < min_count else word
            for word in sent.split()
        ]

    tokenised = [check8convert(sent) for sent in df["review"]]

    df = df.copy()
    # Reuse df's own index: a Series with a default RangeIndex would be
    # aligned on labels and misplace rows when the index is not 0..n-1.
    df["review"] = pd.Series(tokenised, index=df.index)

    return df

def get_normal_df(df):
    """Return a copy of ``df`` with every "UNKNOWN" token removed.

    ``df["review"]`` is expected to hold one list of tokens per row. The
    index of ``df`` is preserved and ``df`` itself is not modified.
    """
    kept = [
        [word for word in words if word != "UNKNOWN"]
        for words in df["review"]
    ]

    df = df.copy()
    # Reuse df's own index: a Series with a default RangeIndex would be
    # aligned on labels and misplace rows when the index is not 0..n-1.
    df["review"] = pd.Series(kept, index=df.index)
    return df

def get_underscore_df(df):
    """Return a copy of ``df`` with "_" placed around every token.

    ``df["review"]`` is expected to hold one list of tokens per row; a row
    ``[w1, w2]`` becomes ``["_", w1, "_", w2, "_"]``. The index of ``df``
    is preserved and ``df`` itself is not modified.
    """
    padded = [
        ("_ "+" _ ".join(words)+" _").split()
        for words in df["review"]
    ]

    df = df.copy()
    # Reuse df's own index: a Series with a default RangeIndex would be
    # aligned on labels and misplace rows when the index is not 0..n-1.
    df["review"] = pd.Series(padded, index=df.index)
    return df

In [5]:
# Load the labelled reviews, normalise their text and count token frequencies.
df = load_data_df("./data/master.csv")
df = preprocess(df)
freq = get_freq(df)

# Tokens occurring fewer than `min_count` times are replaced by "UNKNOWN";
# the same value is later passed to Word2Vec as its `min_count`.
min_count = 10

# Tokenised corpus with rare tokens replaced by "UNKNOWN", and the same
# corpus with "_" inserted around every token.
unknown_df = get_unknown_df(df, freq, min_count)
underscore_unknown_df = get_underscore_df(unknown_df)

# Variants with the "UNKNOWN" tokens removed; both are pickled under ./data.
normal_df = get_normal_df(unknown_df)
normal_df.to_pickle("./data/normal_df")
underscore_df = get_underscore_df(normal_df)
underscore_df.to_pickle("./data/underscore_df")

[load_data_df] Preprocessing data...
 100.00%
[load_data_df] Calculating word frequency...
 100.00%


In [88]:
# Select the experiment: "report" trains tiny 2-D models on growing corpus
# prefixes and plots them; "embedding" trains and saves full-size models.
case_index = 1
case = ["report", "embedding"][case_index]

normal_window = 5
# The underscore corpora carry a "_" between tokens, so their window is
# twice as wide.
underscore_window = normal_window*2
# os.cpu_count() returns None when the CPU count cannot be determined.
workers = os.cpu_count() or 1

if case == "report":
    size = 2
    data_ns = [5, 10, 50, 100, 500, 1000]
    experimental_df = unknown_df
    control_df = underscore_unknown_df
elif case == "embedding":
    size = 128
    data_ns = [len(normal_df)]
    experimental_df = normal_df
    control_df = underscore_df

# gensim 4 renamed the embedding-dimension keyword from `size` to
# `vector_size`; pick whichever the installed Word2Vec accepts.
size_kwarg = (
    "vector_size"
    if "vector_size" in inspect.signature(Word2Vec.__init__).parameters
    else "size"
)
# Settings shared by both models; only corpus and window differ.
w2v_kwargs = {size_kwarg: size, "min_count": min_count, "workers": workers}


for data_n in data_ns:

    model1 = Word2Vec(experimental_df["review"][:data_n],
                      window=normal_window,
                      **w2v_kwargs)

    model2 = Word2Vec(control_df["review"][:data_n],
                      window=underscore_window,
                      **w2v_kwargs)

    if case == "report":
        report_underscore(model1, model2)
    elif case == "embedding":
        model1.save("./data/normal_model")
        model2.save("./data/underscore_model")