In [7]:
import re
import random
import os
import numpy as np
from collections import Counter
from sklearn.feature_extraction import stop_words
from nltk import ngrams

In [8]:
# Directory holding the IMDB data (relative path); its 'train/' and 'test/'
# sub-folders are the ones passed to load_imdb_data further down.
imdb_home = 'aclImdb'

def load_imdb_data(path):
    """Load labelled reviews from a directory tree with ``neg`` and ``pos`` folders.

    Every file located below a directory named exactly ``neg`` (label 0) or
    ``pos`` (label 1) is read as one review; files anywhere else under
    ``path`` are ignored.

    Only the part of each file path *below* ``path`` is inspected, so the
    location of the dataset on disk (e.g. ``/home/me/negotiation/aclImdb``)
    cannot influence the labels.

    Args:
        path: Root directory to scan recursively, e.g. ``'aclImdb/train/'``.

    Returns:
        A list of ``{'text': str, 'label': int}`` dicts: all negative
        examples first, followed by all positive ones. Within each class the
        order is deterministic (sorted directory walk). Because the classes
        are not interleaved, callers that split the list should shuffle it
        first.
    """
    label_by_dir = (('neg', 0), ('pos', 1))
    examples_by_label = {0: [], 1: []}

    for dirpath, dirnames, files in os.walk(path):
        # Sorting in place makes os.walk visit sub-directories in a stable order.
        dirnames.sort()

        # Path components between the scan root and the current directory.
        rel_parts = os.path.relpath(dirpath, path).split(os.sep)

        for dir_name, label in label_by_dir:
            if dir_name not in rel_parts:
                continue
            for name in sorted(files):
                # Explicit encoding: the platform default is not UTF-8 everywhere.
                with open(os.path.join(dirpath, name), 'r', encoding='utf-8') as review_file:
                    text = review_file.read().replace('\n', '')
                examples_by_label[label].append({'text': text, 'label': label})

    return examples_by_label[0] + examples_by_label[1]


In [9]:
# Read the labelled reviews of both dataset splits into memory.
imdb_train = load_imdb_data('{}/train/'.format(imdb_home))
imdb_test = load_imdb_data('{}/test/'.format(imdb_home))


In [10]:
# Split the labelled training reviews into 20,000 train examples and a
# validation set made of the remainder (5,000 for the full IMDB train split).
#
# load_imdb_data returns every negative review followed by every positive one,
# so slicing the list as-is would put only positive reviews in the validation
# set. Shuffle a copy first; the fixed seed keeps the split reproducible for a
# given load order.
train_split = 20000
split_seed = 42

shuffled_train = list(imdb_train)  # shallow copy: imdb_train itself stays untouched
random.Random(split_seed).shuffle(shuffled_train)

train_text = [example['text'] for example in shuffled_train[:train_split]]
train_label = [example['label'] for example in shuffled_train[:train_split]]

val_text = [example['text'] for example in shuffled_train[train_split:]]
val_label = [example['label'] for example in shuffled_train[train_split:]]

# The test set is only evaluated on, so its order does not matter.
test_text = [example['text'] for example in imdb_test]
test_label = [example['label'] for example in imdb_test]

In [11]:
import spacy
import string

# spaCy pipeline used by ngram_dataset below, which disables the parser, tagger
# and NER components when it runs texts through it.
tokenizer = spacy.load('en_core_web_sm')
# A single str of ASCII punctuation characters (not a set): membership tests
# against it are therefore substring tests.
punctuations = string.punctuation


def ngram_dataset(dataset, n):
    """Tokenize every text in ``dataset`` and convert it to lower-cased n-grams.

    Args:
        dataset: Iterable of raw text strings.
        n: Size of the n-grams passed to ``nltk.ngrams`` (1 = unigrams,
            2 = bigrams, ...).

    Returns:
        A ``(token_dataset, all_tokens)`` tuple. ``token_dataset`` holds one
        list of n-grams per input text, in input order; ``all_tokens`` is the
        concatenation of all those lists.
    """
    token_dataset = []
    all_tokens = []

    # Only the token texts are used, so the heavier pipeline components are
    # disabled. `n_threads` is deliberately not passed: it has been a
    # deprecated no-op since spaCy 2.1 and spaCy 3 no longer accepts it.
    docs = tokenizer.pipe(dataset, disable=['parser', 'tagger', 'ner'], batch_size=512)

    for doc in docs:
        # NOTE(review): `punctuations` is a str, so this is a substring test.
        # Single punctuation marks are dropped, but multi-character tokens
        # such as '...' are kept -- confirm this is the intended filter.
        words = [token.text.lower() for token in doc if token.text not in punctuations]
        grams = list(ngrams(words, n))
        token_dataset.append(grams)
        all_tokens.extend(grams)

    return token_dataset, all_tokens


In [15]:
import pickle as pkl

# open(..., "wb") does not create missing directories, so make sure the output
# folder exists before the (slow) tokenization starts.
os.makedirs('tokens', exist_ok=True)

# Tokenize each split for every n-gram size and pickle the result. Each progress
# message is printed *before* the work it announces, and every file is written
# inside a `with` block so the handle is flushed and closed deterministically.
for n_gram in [1, 2, 3, 4]:
    print('n gram:', n_gram)

    print ("Tokenizing train data")
    train_tokens, all_train_tokens = ngram_dataset(train_text, n_gram)
    with open('tokens/train_set_tokens_%s_gram.p' % n_gram, "wb") as out_file:
        pkl.dump(train_tokens, out_file)
    with open('tokens/all_train_tokens_%s_gram.p' % n_gram, "wb") as out_file:
        pkl.dump(all_train_tokens, out_file)

    print ("Tokenizing val data")
    val_tokens, _ = ngram_dataset(val_text, n_gram)
    with open('tokens/val_set_tokens_%s_gram.p' % n_gram, "wb") as out_file:
        pkl.dump(val_tokens, out_file)

    print ("Tokenizing test data")
    test_tokens, _ = ngram_dataset(test_text, n_gram)
    with open('tokens/test_set_tokens_%s_gram.p' % n_gram, "wb") as out_file:
        pkl.dump(test_tokens, out_file)

    print('===========================================================')

n gram: 1
Tokenizing train data
Tokenizing val data
Tokenizing test data
n gram: 2
Tokenizing train data
Tokenizing val data
Tokenizing test data
n gram: 3
Tokenizing train data
Tokenizing val data
Tokenizing test data
n gram: 4
Tokenizing train data
Tokenizing val data
Tokenizing test data


In [None]:
# Persist the label lists next to the token files.
# NOTE(review): the test file is named 'test_label.p' (singular) while the other
# two use '..._labels.p'. The name is kept as-is because code that loads these
# files may depend on it.
os.makedirs('tokens', exist_ok=True)

label_files = [
    (train_label, 'tokens/train_labels.p'),
    (val_label, 'tokens/val_labels.p'),
    (test_label, 'tokens/test_label.p'),
]
for labels, file_path in label_files:
    # `with` closes the handle, so the data is fully written before moving on.
    with open(file_path, "wb") as out_file:
        pkl.dump(labels, out_file)