# Preprocessing

In [3]:
# Keep only the rows whose "label" value is not "unsup".
df = df.loc[df["label"] != "unsup"]



def preprocess(df):
    """Normalise the text in ``df["review"]`` and return ``df``.

    Every review goes through these steps, in order:

    1. ``<br />`` is replaced by a space.
    2. Every character other than ASCII letters, digits, space, ``.``,
       ``-`` and ``'`` is removed.
    3. Every run of digits is replaced by ``00``.
    4. ``-`` and ``.`` are replaced by spaces.
    5. Runs of whitespace are collapsed to a single space.
    6. The text is lower-cased.

    Progress is written to stdout as a percentage that is refreshed every
    100 rows.

    Args:
        df: DataFrame with a ``"review"`` column of strings.

    Returns:
        The same ``df`` object; its ``"review"`` column is overwritten with
        the cleaned text.
    """
    # (pattern, replacement) pairs, applied in this order.  Raw strings keep
    # the regex escapes (\-, \d, \., \s) from being read as string escapes,
    # and compiling once avoids repeating the pattern lookup for every row.
    substitutions = [
        (re.compile(r"<br />"), " "),
        (re.compile(r"[^A-Za-z0-9 .\-']"), ""),
        (re.compile(r"\d+"), "00"),
        (re.compile(r"-"), " "),
        (re.compile(r"\."), " "),
        (re.compile(r"\s+"), " "),
    ]

    def process(i, text):
        for pattern, replacement in substitutions:
            text = pattern.sub(replacement, text)
        if i % 100 == 0:
            percent = i / data_n * 100
            sys.stdout.write("\r% 5.2f%%" % (percent))
        return text.lower()

    data_n = len(df)

    print("[load_data_df] Preprocessing data...")
    cleaned = [process(i, review) for i, review in enumerate(df["review"])]
    sys.stdout.write("\r% 5.2f%%\n" % (100))

    # Column assignment aligns a Series on its index.  Reusing df's own index
    # keeps each cleaned review on the row it came from even when df does not
    # have a default 0..n-1 index (e.g. after rows were filtered out).
    df["review"] = pd.Series(cleaned, index=df.index)
    return df


def get_freq(df):
    """Count how often each whitespace-separated token occurs in the reviews.

    Progress is written to stdout as a percentage that is refreshed every
    100 rows.

    Args:
        df: DataFrame with a ``"review"`` column of strings.

    Returns:
        A ``defaultdict(int)`` mapping each token to its total number of
        occurrences over all reviews.
    """
    reviews = df["review"]
    total = len(df)
    counts = defaultdict(int)

    print("[load_data_df] Calculating word frequency...")
    for row_no, review in enumerate(reviews):
        for token in review.split():
            counts[token] += 1
        if row_no % 100:
            # Only refresh the progress display on every 100th row.
            continue
        sys.stdout.write("\r% 5.2f%%" % (row_no / total * 100))
    sys.stdout.write("\r% 5.2f%%\n" % 100)

    return counts

def get_unknown_df(df, freq, min_count):
    """Tokenise the reviews and mask rare words with ``"UNKNOWN"``.

    Args:
        df: DataFrame with a ``"review"`` column of strings.
        freq: Mapping from word to its occurrence count.  Words missing from
            the mapping are treated as having a count of 0.
        min_count: Words whose count is below this value are replaced by the
            token ``"UNKNOWN"``.

    Returns:
        A copy of ``df`` whose ``"review"`` column holds, for each row, the
        list of whitespace-separated words with rare words replaced.  The
        input frame is left untouched.
    """

    def mask_rare(sent):
        # .get() looks the word up without inserting it, so a defaultdict is
        # not grown as a side effect and a plain dict does not raise.
        return [
            "UNKNOWN" if freq.get(word, 0) < min_count else word
            for word in sent.split()
        ]

    df = df.copy()
    # Column assignment aligns a Series on its index; reusing df's own index
    # keeps every token list on its source row even when df does not have a
    # default 0..n-1 index.
    df["review"] = pd.Series(
        [mask_rare(sent) for sent in df["review"]],
        index=df.index,
    )

    return df

def get_normal_df(df):
    """Return a copy of ``df`` with every ``"UNKNOWN"`` token removed.

    Args:
        df: DataFrame whose ``"review"`` column holds one list of tokens per
            row.

    Returns:
        A copy of ``df`` in which each token list no longer contains
        ``"UNKNOWN"``.  The input frame is left untouched.
    """
    df = df.copy()
    # Column assignment aligns a Series on its index; reusing df's own index
    # keeps every token list on its source row even when df does not have a
    # default 0..n-1 index.
    df["review"] = pd.Series(
        [
            [word for word in words if word != "UNKNOWN"]
            for words in df["review"]
        ],
        index=df.index,
    )
    return df

def get_underscore_df(df):
    """Return a copy of ``df`` with ``"_"`` tokens woven into each review.

    Each token list is rewritten so that it starts with ``"_"``, ends with
    ``"_"`` and has a ``"_"`` between every pair of consecutive words, e.g.
    ``["a", "b"]`` becomes ``["_", "a", "_", "b", "_"]``.  An empty list
    becomes ``["_", "_"]``.

    Args:
        df: DataFrame whose ``"review"`` column holds one list of string
            tokens per row.

    Returns:
        A copy of ``df`` with the rewritten token lists.  The input frame is
        left untouched.
    """
    df = df.copy()
    # Column assignment aligns a Series on its index; reusing df's own index
    # keeps every token list on its source row even when df does not have a
    # default 0..n-1 index.
    df["review"] = pd.Series(
        [
            ("_ " + " _ ".join(words) + " _").split()
            for words in df["review"]
        ],
        index=df.index,
    )
    return df

In [4]:
# Words seen fewer than this many times are masked as "UNKNOWN".
min_count = 10

# Load the raw data, clean the review text, then count word occurrences.
df = preprocess(load_data_df("./data/master.csv"))
freq = get_freq(df)

# Token lists with rare words masked, plus the "_"-interleaved variant.
unknown_df = get_unknown_df(df, freq, min_count)
underscore_unknown_df = get_underscore_df(unknown_df)

# Token lists with the "UNKNOWN" tokens dropped; pickled for later use.
normal_df = get_normal_df(unknown_df)
normal_df.to_pickle("./data/normal_df")

# "_"-interleaved variant of the above; pickled as well.
underscore_df = get_underscore_df(normal_df)
underscore_df.to_pickle("./data/underscore_df")

[load_data_df] Preprocessing data...
 100.00%
[load_data_df] Calculating word frequency...
 100.00%


In [6]:
# Select the run mode: 0 -> "report", 1 -> "embedding".
case_index = 1
case = ["report", "embedding"][case_index]

normal_window = 5
# The underscore variants get a window twice as wide as the plain one.
underscore_window = normal_window*2
workers = os.cpu_count()

if case == "report":
    # 2-dimensional vectors trained on increasing numbers of reviews.
    size = 2
    data_ns = [5, 10, 50, 100, 500, 1000]
    experimental_df = unknown_df
    control_df = underscore_unknown_df
elif case == "embedding":
    # A single run over every review with 128-dimensional vectors.
    size = 128
    data_ns = [len(normal_df)]
    experimental_df = normal_df
    control_df = underscore_df


# NOTE(review): `size=` and `iter=` are the keyword names used by gensim
# releases before 4.0 (later renamed `vector_size` / `epochs`) — confirm
# against the installed gensim version.
for data_n in data_ns:

    # model1 is only consumed by the report branch below, so it is trained
    # only in that case.  Previously its construction was commented out
    # entirely, which made the report branch fail with a NameError.
    if case == "report":
        model1 = Word2Vec(experimental_df["review"][:data_n],
                          size=size,
                          window=normal_window,
                          min_count=min_count,
                          workers=workers,
                          iter=10)

    model2 = Word2Vec(control_df["review"][:data_n],
                      size=size,
                      window=underscore_window,
                      min_count=min_count,
                      workers=workers,
                      iter=15)

    if case == "report":
        report_underscore(model1, model2)
    elif case == "embedding":
        # Only the underscore model is trained and saved in this mode.
        model2.save("./data/underscore_model")