In [35]:
import pandas as pd
import random
import os
import numpy as np
from sklearn.model_selection import train_test_split
import re

In [36]:
# Seed value handed to seed_everything() before the train/validation split.
seed = 2022

In [37]:
def seed_everything(seed):
    """
    Seed the random number sources used in this notebook.

    Seeds the standard-library ``random`` module and NumPy's global
    generator, and stores the seed in the ``PYTHONHASHSEED`` environment
    variable.

    :param seed: value passed to ``random.seed`` and ``np.random.seed``;
        its ``str()`` form is written to ``PYTHONHASHSEED``
    :return: None
    """
    random.seed(seed)
    # Hash randomization is fixed at interpreter start-up, so this variable is
    # only picked up by Python processes launched after this call.
    os.environ.update(PYTHONHASHSEED=str(seed))
    np.random.seed(seed)

In [38]:
# Regex fragments for the sentence tokenizer (consumed by split_into_sentences).
# Written as raw strings so that escapes such as \s reach the regex engine
# untouched instead of being parsed as (invalid) string escapes.

# A single ASCII letter: used for initials and dotted abbreviations.
alphabets = r"([A-Za-z])"
# Titles whose trailing period does not end a sentence.
prefixes = r"(Mr|St|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|Mt)[.]"
# Domain endings; the period in front of them is not a sentence boundary.
websites = r"[.](com|net|org|io|gov|me|edu)"
# Company / name suffixes that are usually written with a trailing period.
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
# Words that mark the start of a new sentence after an acronym or a suffix.
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
# Two- or three-letter upper-case acronyms such as "U.S." or "U.S.A.".
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
# A single digit: used to detect decimal points.
digits = r"([0-9])"

In [39]:
def split_into_sentences(text):
    """
    Split ``text`` into sentences using rule-based regex heuristics.

    Periods that do not end a sentence (decimals, titles, website endings,
    initials, acronyms, company suffixes, "e.g." / "i.e.") are first masked
    with a ``<prd>`` placeholder. Every remaining ".", "?" and "!" then gets a
    ``<stop>`` marker, the placeholders are turned back into periods, and the
    text is cut at the markers.

    Relies on the module-level pattern fragments ``alphabets``, ``prefixes``,
    ``websites``, ``suffixes``, ``starters``, ``acronyms`` and ``digits``.

    :param text: string to split; newlines are treated as spaces and
        ``<br/>`` / ``<br />`` tags are converted to newlines
    :return: list of sentences with surrounding whitespace removed. Fragments
        of at most two characters (measured before stripping) are discarded,
        as are fragments that are empty once stripped, so the list may be
        empty and callers must handle that case.
    """
    # Padding lets the patterns that require a neighbouring space also match
    # at the very start and end of the text.
    text = " " + text + "  "
    text = text.replace("\n", " ")

    # --- Mask periods that are not sentence boundaries -----------------------
    text = re.sub(digits + "[.]" + digits, r"\1<prd>\2", text)   # decimals: 3.5
    text = re.sub(prefixes, r"\1<prd>", text)                    # titles: Dr.
    text = re.sub(websites, r"<prd>\1", text)                    # domains: .com
    text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + alphabets + "[.] ", r" \1<prd> ", text)  # lone initial
    # An acronym followed by a sentence starter does end the sentence.
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    text = re.sub(
        alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]",
        r"\1<prd>\2<prd>\3<prd>",
        text,
    )
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>", text)
    # A suffix followed by a sentence starter ends the sentence; otherwise the
    # period belongs to the suffix.
    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", r" \1<prd>", text)

    # --- Move terminal punctuation outside closing quotes --------------------
    # so that the quote stays attached to the sentence it closes.
    text = text.replace(".”", "”.")
    text = text.replace('."', '".')
    text = text.replace('!"', '"!')
    text = text.replace('?"', '"?')

    # Ellipses are not special-cased: each period becomes its own boundary and
    # the resulting one-character fragments are removed by the length filter.
    text = text.replace("e.g.", "e<prd>g<prd>")
    text = text.replace("i.e.", "i<prd>e<prd>")

    # --- Mark boundaries, restore masked periods, convert line-break tags ----
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")
    text = text.replace("<br/>", "\n")
    text = text.replace("<br />", "\n")

    sentences = []
    for fragment in text.split("<stop>"):
        if len(fragment) <= 2:
            continue
        sentence = fragment.strip()
        # A fragment made only of whitespace (e.g. a trailing "<br />") must
        # not be reported as an empty sentence.
        if sentence:
            sentences.append(sentence)
    return sentences

In [40]:
# Try the tokenizer on a sample review that contains HTML tags and an ellipsis.
sentences = split_into_sentences(
    "This screen protector is a great value and very well made.<br /><p />"
    "I am 5'9'' so far, but this one does not look like that big of an improvement "
    "from previous generations...it just feels more solid than before "
    "with no scratches or anything else on either side."
)
print(sentences)

['This screen protector is a great value and very well made.', "<p />I am 5'9'' so far, but this one does not look like that big of an improvement from previous generations.", 'it just feels more solid than before with no scratches or anything else on either side.']


In [41]:
# Tab-separated review files, read relative to the working directory.
data_path_80 = "amazon_reviews_80.txt"
data_path_20 = "amazon_reviews_20.txt"

# Column names applied to both files in place of their own first line.
columns = [
    'DOC_ID', 'LABEL', 'RATING',
    'VERIFIED_PURCHASE',
    'PRODUCT_CATEGORY', 'PRODUCT_ID', 'PRODUCT_TITLE',
    'REVIEW_TITLE', 'REVIEW_TEXT',
]

# Both files are parsed with the same options: the first line is skipped and
# the names above are used as the header.
df_full_train, df_test = (
    pd.read_csv(path, sep='\t', header=None, names=columns, skiprows=1)
    for path in (data_path_80, data_path_20)
)

In [42]:
# Preview the first rows of the test frame to sanity-check the parsed columns.
df_test.head()

Unnamed: 0,DOC_ID,LABEL,RATING,VERIFIED_PURCHASE,PRODUCT_CATEGORY,PRODUCT_ID,PRODUCT_TITLE,REVIEW_TITLE,REVIEW_TEXT
0,20982,__label2__,5,Y,Shoes,B00APWMC90,"Vasque Women's Skadia Ultradry Snow Boot,Jet B...",Great boot,Got these just in time to hike on snow-covered...
1,9680,__label1__,5,Y,Jewelry,B00R47VS94,Moonar® 7pcs Gold Skull Bowknot Heart Design M...,"Pretty, Affordable Rings...!",I really love putting some accessories in my f...
2,4740,__label1__,5,Y,PC,B008THTWIW,"iPad Mini Screen Protector, Tech Armor Anti-Gl...",Very easy to use and the best pricing online!,I highly recommend the new Tech Armor Anti-Fin...
3,8597,__label1__,5,N,Tools,B002JM15Z6,Chef Works CSBA-BCS Chalk Stripe Bib Apron wit...,Simple but high quality,I really like this apron it has a very simple ...
4,11535,__label2__,5,N,Outdoors,B00RJM2HTE,Mountain House Chicken Fried Rice,Good Stuff!,This Mountain House &#34;Savor the Adventure&#...


In [43]:
# Fraction of df_full_train kept for training; the remainder becomes df_valid.
# NOTE(review): if the "80" file really holds 80% of the corpus, 7/9 gives
# roughly a 62/18/20 train/valid/test split, whereas 7/8 would give 70/10/20;
# confirm which one is intended.
train_valid_ratio = 7/9
seed_everything(seed)
# NOTE(review): the shuffle is pinned by random_state=1, not by `seed`, so the
# call above does not influence this split; confirm that is intended.
df_train, df_valid = train_test_split(df_full_train, train_size = train_valid_ratio, random_state = 1)

In [44]:
# Marker strings for the generated text files; build_dataset uses the
# bos / sep / eos entries to delimit the fields of each example.
special_tokens = dict(
    unk_token="[UNK]",
    sep_token="[SEP]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    mask_token="[MASK]",
    bos_token="[BOS]",
    eos_token="[EOS]",
)

In [45]:
def build_dataset(df, dest_path):
    """
    Write one formatted example per review to ``dest_path``.

    Each review produces a single line laid out as
    ``<bos>first sentence<sep>last sentence<sep>full review<eos>``, where the
    markers are taken from ``special_tokens`` and the sentences come from
    ``split_into_sentences``.

    :param df: DataFrame with a ``REVIEW_TEXT`` column; every value is passed
        through ``str()`` and stripped before use (so a float NaN ends up as
        the text "nan")
    :param dest_path: path of the output file; it is created or overwritten
    :return: None
    """
    bos = special_tokens['bos_token']
    sep = special_tokens['sep_token']
    eos = special_tokens['eos_token']

    # The context manager guarantees the file is flushed and closed. UTF-8 is
    # requested explicitly so that non-ASCII review text is written the same
    # way regardless of the platform's default encoding.
    with open(dest_path, 'w', encoding='utf-8') as f:
        for review in df['REVIEW_TEXT'].tolist():
            review = str(review).strip()
            # The splitter can return no sentences (e.g. for a review made only
            # of punctuation); treat the whole review as one sentence then,
            # instead of failing on the [0] / [-1] lookups.
            sent_tokens = split_into_sentences(review) or [review]
            # Writing line by line avoids building one huge string in memory.
            f.write(bos + sent_tokens[0] + sep + sent_tokens[-1] + sep + review + eos + '\n')

In [46]:
# Write one text file per data split.
for split_df, out_path in (
    (df_train, 'train.txt'),
    (df_valid, 'valid.txt'),
    (df_test, 'test.txt'),
):
    build_dataset(split_df, out_path)