In [None]:
# Install/upgrade NLTK (its tokenizer is used when building the processed
# JSON files). The explicit %pip magic installs into the running kernel's
# environment and does not depend on IPython's automagic setting.
%pip install -U nltk

In [None]:
# This cell runs before the notebook's main import cell, so import nltk here;
# otherwise a fresh kernel fails with NameError on the first line.
import nltk

# nltk.word_tokenize needs the Punkt sentence tokenizer data. Recent NLTK
# releases load it from the 'punkt_tab' resource, older ones from 'punkt',
# so fetch both to work with whichever version `pip install -U` provided.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

In [1]:
# prepare_vocab for yelp_review sentiment classification dataset
import nltk
from nltk.tag import StanfordNERTagger
from nltk.tag.stanford import StanfordPOSTagger
import json
import pandas as pd
from tqdm import tqdm 


In [2]:
# Peek at the raw test split. header=None tells pandas the CSV has no header
# row, so the columns are addressed by position (0 and 1).
# NOTE(review): the variable is called Yahoo_data although the file read here
# is the Yelp review test set.
Yahoo_data = pd.read_csv('dataset/yelp_review_small/test.csv', header=None)
Yahoo_data.head()

Unnamed: 0,0,1
0,1,I got 'new' tires from them and within two wee...
1,1,Don't waste your time. We had two different p...
2,1,All I can say is the worst! We were the only 2...
3,1,I have been to this restaurant twice and was d...
4,1,Food was NOT GOOD at all! My husband & I ate h...


In [7]:
"""
Create Train and Test Files for the yelp_review sentiment classification experiment
"""
# Raw CSV splits that are converted to tokenized JSON further down.
trainFile = 'dataset/yelp_review_small/train.csv'
testFile = 'dataset/yelp_review_small/test.csv'

# Optional Stanford POS tagger (currently disabled). The paths below are
# machine-specific absolute paths and must be adapted before re-enabling.
# path_to_model_pos = "/data2/Stanford_tools/stanford-postagger-2018-02-27/models/english-bidirectional-distsim.tagger"
# path_to_jar_pos = "/data2/Stanford_tools/stanford-postagger-2018-02-27/stanford-postagger.jar"
# pos_tagger=StanfordPOSTagger(path_to_model_pos, path_to_jar_pos)
# # pos_tagger.java_options='-mx4096m'          ### Setting higher memory limit for long sentences


# Optional Stanford NER tagger (currently disabled); same caveat about paths.
# path_to_model_ner = '/data2/Stanford_tools/stanford-ner-2017-06-09/classifiers/english.all.3class.distsim.crf.ser.gz'
# path_to_jar_ner = '/data2/Stanford_tools/stanford-ner-2017-06-09/stanford-ner.jar'
# ner_tagger = StanfordNERTagger(path_to_model_ner,path_to_jar_ner,  encoding='utf-8')

def createFile(filepath, outputpath):
    """Tokenize a headerless ``label,text`` CSV and dump it as a JSON list.

    Column 0 of the CSV is taken as the label and column 1 as the raw text.
    Each row becomes ``{'label': <label>, 'token': [<tokens>]}`` where the
    tokens come from ``nltk.word_tokenize``. The resulting list is written to
    ``outputpath`` with ``json.dump`` and two summary lines (row count and
    longest token sequence) are printed.

    Args:
        filepath: Path of the input CSV (no header row).
        outputpath: Path of the JSON file to create or overwrite.

    Returns:
        None.
    """
    # Read the text column verbatim as strings: with pandas' default NA
    # handling an empty cell, or a review that is literally "NA"/"null"/"N/A",
    # becomes a float NaN, which has no .replace() and would crash below.
    frame = pd.read_csv(filepath, header=None, dtype={1: str}, keep_default_na=False)
    labels = frame.iloc[:, 0].tolist()
    samples = frame.iloc[:, 1].tolist()

    def preprocess_doc(document):
        """Undo escaped newlines/quotes and neutralise the reserved pad token."""
        # Defensive: treat anything that still is not text as an empty review.
        if not isinstance(document, str):
            return ""
        # The CSV stores line breaks and quotes as literal backslash escapes.
        document = document.replace(u'\\n', u'\n')
        document = document.replace(u'\\r', u'\r')
        document = document.replace(u'\\"', u'\"')
        # Keep a literal "<PAD>" in the text from colliding with the padding symbol.
        document = document.replace("<PAD>", " _PAD_ ")
        return document

    data_json = []
    maxLen_Sentence = 0

    for label, sample in zip(labels, samples):
        tokens = nltk.word_tokenize(preprocess_doc(sample))
        # Stanford POS/NER tagging is disabled; if re-enabled, the tag lists
        # would be stored next to 'token' under 'stanford_pos'/'stanford_ner'.
        data_json.append({'label': label, 'token': tokens})
        maxLen_Sentence = max(maxLen_Sentence, len(tokens))

    num_Sentence = len(data_json)

    # Single, context-managed open: the handle is closed even if dumping fails.
    with open(outputpath, 'w') as outfile:
        json.dump(data_json, outfile)
    print("Number of sentences is", num_Sentence)
    print("Max Length of sentences is", maxLen_Sentence)
    

# Convert each split in turn; the status line is printed before that split
# is tokenized so progress is visible while the conversion runs.
for banner, source_csv, target_json in (
    ("Creating training data...", trainFile, "dataset/yelp_review_small/train_processed.json"),
    ("\nCreating test data...", testFile, "dataset/yelp_review_small/test_processed.json"),
):
    print(banner)
    createFile(source_csv, target_json)
print("Train / Test file created")

Creating training data...
Number of sentences is 650000
Max Length of sentences is 1228

Creating test data...
Number of sentences is 50000
Max Length of sentences is 1157
Train / Test file created


In [8]:
"""
Prepare vocabulary and initial word vectors.
"""
import json
import pickle
import argparse
import numpy as np
from collections import Counter

from utils import vocab, constant, helper


def parse_args(argv=None):
    """Build the option namespace for vocabulary preparation.

    Args:
        argv: Optional list of command-line style tokens, e.g.
            ``['--lower', '--min_freq', '5']``. When omitted, an empty list is
            parsed so that only the defaults are used; the notebook kernel's
            own ``sys.argv`` is never consulted.

    Returns:
        argparse.Namespace with ``data_dir``, ``vocab_dir``, ``glove_dir``,
        ``wv_file``, ``wv_dim``, ``min_freq`` and ``lower``.
    """
    parser = argparse.ArgumentParser(description='Prepare vocab for text classification.')
    parser.add_argument('--data_dir', default='dataset/yelp_review_small',
                        help='Directory containing the processed train/test JSON files.')
    parser.add_argument('--vocab_dir', default='dataset/yelp_review_small', help='Output vocab directory.')
    parser.add_argument('--glove_dir', default='dataset', help='GloVe directory.')
    parser.add_argument('--wv_file', default='glove.840B.300d.txt', help='GloVe vector file.')
    parser.add_argument('--wv_dim', type=int, default=300, help='GloVe vector dimension.')
    parser.add_argument('--min_freq', type=int, default=0, help='If > 0, use min_freq as the cutoff.')
    parser.add_argument('--lower', action='store_true', help='If specified, lowercase all words.')

    # An explicit list keeps argparse away from the kernel's sys.argv.
    return parser.parse_args(args=[] if argv is None else list(argv))
    

def load_tokens(filename):
    """Read a processed JSON file and return all of its tokens as one flat list.

    The file is expected to hold a list of examples, each with a 'token'
    entry; the per-example token sequences are concatenated in file order.
    A one-line summary is printed before returning.
    """
    with open(filename) as handle:
        examples = json.load(handle)
    flat = [tok for example in examples for tok in example['token']]
    print("{} tokens from {} examples loaded from {}.".format(len(flat), len(examples), filename))
    return flat


def build_vocab(tokens, glove_vocab, min_freq):
    """Build the ordered vocabulary list from corpus tokens.

    '_PAD_' tokens are ignored. With ``min_freq > 0`` every word occurring at
    least ``min_freq`` times is kept; otherwise only words present in
    ``glove_vocab`` are kept. Kept words are ordered by descending corpus
    frequency and appended after ``constant.VOCAB_PREFIX``.
    """
    freq = Counter(tok for tok in tokens if tok != '_PAD_')
    if min_freq > 0:
        # Frequency cutoff, regardless of GloVe coverage.
        kept = (word for word, count in freq.items() if count >= min_freq)
    else:
        # No cutoff: restrict to words that have a GloVe vector.
        kept = (word for word in freq if word in glove_vocab)
    ranked = sorted(kept, key=freq.get, reverse=True)
    # Special tokens come first.
    words = constant.VOCAB_PREFIX + ranked
    print("vocab built with {}/{} words.".format(len(words), len(freq)))
    return words


def count_oov(tokens, vocab):
    """Count how many token occurrences fall outside the vocabulary.

    Args:
        tokens: Iterable of tokens (with repetitions) from a dataset.
        vocab: Iterable of vocabulary words.

    Returns:
        Tuple ``(total, oov)``: the number of token occurrences and the number
        of those occurrences whose word is not in ``vocab``.
    """
    c = Counter(tokens)
    total = sum(c.values())
    # Deduplicate first: a word listed twice in `vocab` must only be credited
    # once, otherwise `matched` can exceed `total` and the OOV count turns
    # negative. Counter returns 0 for words that never occur in `tokens`.
    matched = sum(c[t] for t in set(vocab))
    return total, total - matched

In [9]:
# Build the option namespace from the defaults declared in parse_args.
args = parse_args()
# --lower is a store_true flag and therefore already False by default; the
# assignment just makes the "keep original casing" choice explicit.
args.lower = False
print(args)

Namespace(data_dir='dataset/yelp_review_small', glove_dir='/data2/pengfei_data/data', lower=False, min_freq=0, vocab_dir='dataset/yelp_review_small', wv_dim=300, wv_file='glove.840B.300d.txt')


In [10]:
# Inputs: tokenized splits plus the GloVe vector file.
train_file = '/'.join((args.data_dir, 'train_processed.json'))
# dev_file = args.data_dir + '/dev.json'
test_file = '/'.join((args.data_dir, 'test_processed.json'))
wv_file = '/'.join((args.glove_dir, args.wv_file))
wv_dim = args.wv_dim

# Outputs: pickled vocabulary and the embedding matrix.
helper.ensure_dir(args.vocab_dir)
vocab_file = '/'.join((args.vocab_dir, 'vocab.pkl'))
emb_file = '/'.join((args.vocab_dir, 'embedding.npy'))


In [11]:
# Read the flat token lists of both processed splits.
print("loading files...")
train_tokens = load_tokens(train_file)
# dev_tokens = load_tokens(dev_file)
test_tokens = load_tokens(test_file)
if args.lower:
    # Case-fold every token of both splits when --lower is set.
    train_tokens = [tok.lower() for tok in train_tokens]
    test_tokens = [tok.lower() for tok in test_tokens]

# Collect the words available in the GloVe file.
print("loading glove...")
glove_vocab = vocab.load_glove_vocab(wv_file, wv_dim)
print("{} words loaded from glove.".format(len(glove_vocab)))

loading files...
101170944 tokens from 650000 examples loaded from dataset/yelp_review_small/train_processed.json.
7795550 tokens from 50000 examples loaded from dataset/yelp_review_small/test_processed.json.
loading glove...
2195892 words loaded from glove.


In [12]:
# The vocabulary is derived from the training tokens only; the test split is
# used below purely to measure coverage.
print("building vocab...")
v = build_vocab(train_tokens, glove_vocab, args.min_freq)

# Report, per split, how many token occurrences are missing from the vocabulary.
print("calculating oov...")
datasets = {'train': train_tokens, 'test': test_tokens}
for dname, d in datasets.items():
    total, oov = count_oov(d, v)
    # NOTE(review): divides by `total`, so an empty split would raise ZeroDivisionError.
    print("{} oov: {}/{} ({:.2f}%)".format(dname, oov, total, oov*100.0/total))

building vocab...
vocab built with 230902/508651 words.
calculating oov...
train oov: 594450/101170944 (0.59%)
test oov: 52038/7795550 (0.67%)


In [13]:
print("building embeddings...")
embedding = vocab.build_embedding(wv_file, v, wv_dim)
print("embedding size: {} x {}".format(*embedding.shape))

print("dumping to files...")
with open(vocab_file, 'wb') as outfile:
    pickle.dump(v, outfile)
np.save(emb_file, embedding)
print("all done.")

building embeddings...
embedding size: 230902 x 300
dumping to files...
all done.
