In [1]:
import os
import urllib.request
import tarfile

# Fetch the IMDb sentiment archive once and unpack it under data/.
# Both steps are guarded by existence checks, so re-running this cell is cheap.
os.makedirs("data", exist_ok=True)

if not os.path.exists("data/aclImdb_v1.tar.gz"):
    # Download to a temporary name and rename only on success; otherwise an
    # interrupted download would leave a truncated archive that the existence
    # check above would accept on the next run.
    print("downloading database...")
    urllib.request.urlretrieve("http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", "data/aclImdb_v1.tar.gz.part")
    os.replace("data/aclImdb_v1.tar.gz.part", "data/aclImdb_v1.tar.gz")
    print("download complete")

if not os.path.exists("data/aclImdb/"):
    # extract database
    print("extracting database...")
    with tarfile.open("data/aclImdb_v1.tar.gz", "r:gz") as tar:
        if hasattr(tarfile, "data_filter"):
            # The "data" filter rejects absolute paths, path traversal and
            # special files, and avoids the warning raised when no filter is given.
            tar.extractall(path="data", filter="data")
        else:
            # Older Python without extraction filters: keep the previous behaviour.
            tar.extractall(path="data")
    print("database extracted")


downloading database...
download complete
extracting database...


  tar.extractall(path="data")


database extracted


In [4]:
import os
import glob

def read_imdb_data(data_dir='data/aclImdb'):
    """Read review texts and sentiment labels from a directory tree.

    The function looks for ``<data_dir>/<split>/<sentiment>/*.txt`` where
    ``split`` is ``'train'`` or ``'test'`` and ``sentiment`` is ``'pos'`` or
    ``'neg'``.

    Parameters
    ----------
    data_dir : str
        Root directory containing the ``train`` and ``test`` sub-directories.

    Returns
    -------
    tuple of (dict, dict)
        ``data[split][sentiment]`` is a list with the text of each file and
        ``labels[split][sentiment]`` is a list of the same length holding
        1 for ``'pos'`` and 0 for ``'neg'``. A missing or empty directory
        yields empty lists.
    """
    data = {}
    labels = {}

    # Loop over the two splits: training and testing
    for data_type in ['train', 'test']:
        data[data_type] = {}
        labels[data_type] = {}

        # Loop over both sentiment categories: positive and negative
        for sentiment in ['pos', 'neg']:
            # Construct path to all text files of the current split and sentiment
            path = os.path.join(data_dir, data_type, sentiment, '*.txt')
            # glob returns files in arbitrary order; sort so the review order
            # is reproducible across runs and machines.
            files = sorted(glob.glob(path))

            reviews = []
            for f in files:
                # Decode explicitly as UTF-8 instead of relying on the platform
                # default encoding, which differs between systems.
                with open(f, encoding='utf-8') as review:
                    reviews.append(review.read())

            data[data_type][sentiment] = reviews
            # Assign label 1 for 'pos' and 0 for 'neg', one per review
            labels[data_type][sentiment] = [1 if sentiment == 'pos' else 0] * len(reviews)

            # Sanity check: ensure that every text has a matching label
            assert len(data[data_type][sentiment]) == len(labels[data_type][sentiment]), \
                    "{}/{} data size does not match labels size".format(data_type, sentiment)

    return data, labels

In [5]:
# Load every review and its label from disk, then report how many reviews
# each split/sentiment combination contains.
data, labels = read_imdb_data()
review_counts = [
    len(data[split][sentiment])
    for split in ('train', 'test')
    for sentiment in ('pos', 'neg')
]
print("IMDb reviews: train = {} pos / {} neg, test = {} pos / {} neg".format(*review_counts))

IMDb reviews: train = 12500 pos / 12500 neg, test = 12500 pos / 12500 neg


In [6]:
import json
# Save the loaded review texts into a JSON file.
# The context manager closes (and therefore flushes) the file as soon as the
# dump finishes, so the content is fully on disk before anything reads it back.
with open('data/data.json', 'w') as data_file:
    json.dump(data, data_file)
# Save the sentiment labels into another JSON file
with open('data/labels.json', 'w') as labels_file:
    json.dump(labels, labels_file)

In [8]:
# Open and load the movie review data from 'data.json'.
# Using `with` guarantees the file is closed even if json.load raises.
with open('data/data.json') as data_file:
    data = json.load(data_file)

# Open and load the sentiment labels from 'labels.json'
with open('data/labels.json') as labels_file:
    labels = json.load(labels_file)