In [17]:
import re
import random
import os
import numpy as np
from collections import Counter
from sklearn.feature_extraction import stop_words
from nltk import ngrams
import pickle as pkl

In [18]:
imdb_home = 'aclImdb'

def load_imdb_data(path):
    """Load the labelled IMDB reviews found under ``path``.

    Walks ``path`` recursively and reads every file whose *immediate parent
    directory* is named exactly ``neg`` or ``pos``. All other files are
    ignored. Matching on the directory name (rather than testing whether
    ``'/neg'`` occurs anywhere in the path) keeps an unrelated ancestor such as
    ``/negative_corpus/`` from mislabelling or double-counting files.

    Directories and files are visited in sorted order so the returned lists,
    and therefore any index-based train/validation split built from them, are
    reproducible across filesystems.

    Args:
        path: Directory to search, e.g. ``'aclImdb/train/'``.

    Returns:
        A ``(neg_data, pos_data)`` tuple of lists. Each element is a dict with
        the keys ``'text'`` (file contents with newlines removed), ``'label'``
        (0 for ``neg``, 1 for ``pos``) and ``'path'`` (path of the file read).
    """
    label_by_dirname = {'neg': 0, 'pos': 1}
    examples_by_label = {0: [], 1: []}

    for dirpath, dirnames, files in os.walk(path):
        # Sorting in place makes os.walk descend into subdirectories in a
        # stable order instead of the filesystem's arbitrary one.
        dirnames.sort()

        # normpath drops a trailing separator so basename is never empty.
        parent_name = os.path.basename(os.path.normpath(dirpath))
        label = label_by_dirname.get(parent_name)
        if label is None:
            continue

        for name in sorted(files):
            file_path = os.path.join(dirpath, name)
            # Decode explicitly as UTF-8 instead of relying on the platform's
            # default encoding.
            with open(file_path, 'r', encoding='utf-8') as myfile:
                text = myfile.read().replace('\n', '')
            examples_by_label[label].append(
                {'text': text, 'label': label, 'path': file_path}
            )

    return examples_by_label[0], examples_by_label[1]


In [19]:
# Read the labelled reviews of the two official splits. Each call returns the
# negative examples first and the positive examples second.
train_dir = imdb_home + '/train/'
test_dir = imdb_home + '/test/'

imdb_train_neg, imdb_train_pos = load_imdb_data(train_dir)
imdb_test_neg, imdb_test_pos = load_imdb_data(test_dir)

In [20]:
# Number of negative and positive training reviews, in that order.
tuple(len(reviews) for reviews in (imdb_train_neg, imdb_train_pos))

(12500, 12500)

In [21]:
# Hold out the tail of each class for validation: the first `train_split`
# negative and positive reviews go to training, the remaining ones to
# validation (20,000 train / 5,000 validation examples with 12,500 per class).
# Within every split the negative examples come first, then the positive ones.
train_split = 10000

train_text, train_label = [], []
val_text, val_label = [], []
for class_examples in (imdb_train_neg, imdb_train_pos):
    # Index explicitly so that a class with fewer than `train_split` reviews
    # raises an IndexError instead of silently yielding a smaller training set.
    for idx in range(train_split):
        review = class_examples[idx]
        train_text.append(review['text'])
        train_label.append(review['label'])
    for review in class_examples[train_split:]:
        val_text.append(review['text'])
        val_label.append(review['label'])

# The official test split is used as a whole.
test_text, test_label = [], []
for class_examples in (imdb_test_neg, imdb_test_pos):
    for review in class_examples:
        test_text.append(review['text'])
        test_label.append(review['label'])

In [22]:
# Persist the raw validation texts and labels. The with-blocks guarantee that
# each file is flushed and closed as soon as it has been written; a bare
# open() passed to pkl.dump leaves the handle open until garbage collection.
with open('val_text.p', "wb") as val_text_file:
    pkl.dump(val_text, val_text_file)
with open('val_label.p', "wb") as val_label_file:
    pkl.dump(val_label, val_label_file)


In [90]:
# Labels are 0 (negative) or 1 (positive), so their sum is the number of
# positive validation examples; shown next to the total validation size.
n_val_positive = sum(val_label)
n_val_positive, len(val_label)

(2500, 5000)

In [138]:
import spacy
import string

# Punctuation characters used to filter tokens, and the shared spaCy pipeline
# used for tokenization.
punctuations = string.punctuation
tokenizer = spacy.load('en_core_web_sm')


def ngram_dataset(dataset, n):
    """Turn an iterable of raw texts into lower-cased word n-grams.

    Each text is run through the module-level spaCy ``tokenizer`` with the
    parser, tagger and NER components disabled. Tokens whose text occurs in
    ``punctuations`` are dropped; ``punctuations`` is a ``str``, so this is a
    substring test (a multi-character token is dropped only if it appears
    verbatim inside ``string.punctuation``). The remaining tokens are
    lower-cased, and every window of ``n`` consecutive tokens is joined with a
    single space.

    Args:
        dataset: Iterable of text strings.
        n: Size of the n-grams to build.

    Returns:
        A ``(token_dataset, all_tokens)`` tuple: ``token_dataset`` holds one
        list of n-gram strings per input text, and ``all_tokens`` is the
        concatenation of all of those lists in input order.
    """
    per_document = []
    flattened = []

    docs = tokenizer.pipe(dataset, disable=['parser', 'tagger', 'ner'], batch_size=512, n_threads=4)
    for doc in docs:
        words = []
        for token in doc:
            if token.text in punctuations:
                continue
            words.append(token.text.lower())

        doc_ngrams = [' '.join(gram) for gram in ngrams(words, n)]
        per_document.append(doc_ngrams)
        flattened.extend(doc_ngrams)

    return per_document, flattened


In [14]:
import pickle as pkl
# Tokenize every split once per n-gram order and cache the results as pickles
# under tokens/. open(..., "wb") does not create missing directories, so make
# sure the output directory exists before the (slow) tokenization starts.
os.makedirs('tokens', exist_ok=True)

for n_gram in [1, 2, 3, 4]:
    print('n gram:', n_gram)

    # Each progress message is printed *before* the work it announces, and each
    # result is written as soon as it is available.
    print ("Tokenizing train data")
    train_tokens, all_train_tokens = ngram_dataset(train_text, n_gram)
    with open('tokens/train_set_tokens_%s_gram.p' % n_gram, "wb") as out_file:
        pkl.dump(train_tokens, out_file)
    with open('tokens/all_train_tokens_%s_gram.p' % n_gram, "wb") as out_file:
        pkl.dump(all_train_tokens, out_file)

    # Only the per-document tokens are needed for val/test. Indexing the result
    # avoids assigning to `_`, which IPython uses for the last cell output.
    print ("Tokenizing val data")
    val_tokens = ngram_dataset(val_text, n_gram)[0]
    with open('tokens/val_set_tokens_%s_gram.p' % n_gram, "wb") as out_file:
        pkl.dump(val_tokens, out_file)

    print ("Tokenizing test data")
    test_tokens = ngram_dataset(test_text, n_gram)[0]
    with open('tokens/test_set_tokens_%s_gram.p' % n_gram, "wb") as out_file:
        pkl.dump(test_tokens, out_file)

    print('===========================================================')

n gram: 1
Tokenizing train data
Tokenizing val data
Tokenizing test data
n gram: 2
Tokenizing train data
Tokenizing val data
Tokenizing test data
n gram: 3
Tokenizing train data
Tokenizing val data
Tokenizing test data
n gram: 4
Tokenizing train data
Tokenizing val data
Tokenizing test data


In [85]:
# Store the labels of each split next to the token files. The with-blocks
# close (and thereby flush) every file right after it has been written.
with open('tokens/train_labels.p', "wb") as labels_file:
    pkl.dump(train_label, labels_file)
with open('tokens/val_labels.p', "wb") as labels_file:
    pkl.dump(val_label, labels_file)
with open('tokens/test_labels.p', "wb") as labels_file:
    pkl.dump(test_label, labels_file)

Merge the lower-order n-grams (i < n) into each n-gram dataset, so that the n-gram files contain all 1- to n-grams.

In [16]:
def _load_pickle(path):
    """Return the object pickled at ``path``, closing the file afterwards."""
    # NOTE: unpickling can execute arbitrary code; only load files that this
    # notebook wrote itself.
    with open(path, "rb") as in_file:
        return pkl.load(in_file)


def _dump_pickle(obj, path):
    """Pickle ``obj`` to ``path``, closing (and flushing) the file afterwards."""
    with open(path, "wb") as out_file:
        pkl.dump(obj, out_file)


def _merge_per_document(lower_order, current):
    """Concatenate two per-document token lists, document by document."""
    if len(lower_order) != len(current):
        raise ValueError(
            'cannot merge token datasets of different sizes: %d vs %d documents'
            % (len(lower_order), len(current))
        )
    return [low + cur for low, cur in zip(lower_order, current)]


# WARNING: this cell overwrites its own input files, so it is NOT idempotent.
# Run it exactly once after the tokenization cell; running it again appends the
# lower-order n-grams a second time. Re-run the tokenization cell first to
# reset the files.
#
# The orders are processed in increasing order on purpose: the (i-1)-gram file
# already contains all 1..(i-1)-grams when the i-gram file is merged.
for i in range(2, 5):
    print('dealing with %s_gram dataset'%i)
    print('load dataset')
    current_train_tokens = _load_pickle('tokens/train_set_tokens_%s_gram.p' % i)
    before_train_tokens = _load_pickle('tokens/train_set_tokens_%s_gram.p' % (i - 1))
    current_all_train_tokens = _load_pickle('tokens/all_train_tokens_%s_gram.p' % i)
    before_all_train_tokens = _load_pickle('tokens/all_train_tokens_%s_gram.p' % (i - 1))
    current_val_tokens = _load_pickle('tokens/val_set_tokens_%s_gram.p' % i)
    before_val_tokens = _load_pickle('tokens/val_set_tokens_%s_gram.p' % (i - 1))
    current_test_tokens = _load_pickle('tokens/test_set_tokens_%s_gram.p' % i)
    before_test_tokens = _load_pickle('tokens/test_set_tokens_%s_gram.p' % (i - 1))

    print('merge train tokens')
    new_train_tokens = _merge_per_document(before_train_tokens, current_train_tokens)
    print('merge all train tokens')
    new_all_train_tokens = before_all_train_tokens + current_all_train_tokens
    print('merge val tokens')
    new_val_tokens = _merge_per_document(before_val_tokens, current_val_tokens)
    print('merge test tokens')
    new_test_tokens = _merge_per_document(before_test_tokens, current_test_tokens)

    # Nothing is written until every merge has succeeded, so a failure above
    # leaves the files of this n-gram order untouched.
    _dump_pickle(new_train_tokens, 'tokens/train_set_tokens_%s_gram.p' % i)
    _dump_pickle(new_all_train_tokens, 'tokens/all_train_tokens_%s_gram.p' % i)
    _dump_pickle(new_val_tokens, 'tokens/val_set_tokens_%s_gram.p' % i)
    _dump_pickle(new_test_tokens, 'tokens/test_set_tokens_%s_gram.p' % i)
    print('===========================================================')

dealing with 2_gram dataset
load dataset
merge train tokens
merge all train tokens
merge val tokens
merge test tokens
dealing with 3_gram dataset
load dataset
merge train tokens
merge all train tokens
merge val tokens
merge test tokens
dealing with 4_gram dataset
load dataset
merge train tokens
merge all train tokens
merge val tokens
merge test tokens
