In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import zipfile


In [3]:
%mkdir ../data
!wget -O ../data/aclImdb_v1.tar.gz http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -zxf ../data/aclImdb_v1.tar.gz -C ../data

mkdir: ../data: File exists
--2023-04-30 23:36:57--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
正在解析主机 ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
正在连接 ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... 已连接。
已发出 HTTP 请求，正在等待回应... 200 OK
长度：84125825 (80M) [application/x-gzip]
正在保存至: “../data/aclImdb_v1.tar.gz”


2023-04-30 23:38:12 (1.07 MB/s) - 已保存 “../data/aclImdb_v1.tar.gz” [84125825/84125825])



In [6]:
import os
import glob


def read_imdb_data(data_dir='../data/aclImdb'):
    """Read the raw review texts and their sentiment labels from disk.

    Expects the layout ``<data_dir>/<train|test>/<pos|neg>/*.txt`` and reads
    every matching file in full.

    Args:
        data_dir: Root directory that contains the ``train`` and ``test``
            sub-directories.

    Returns:
        A ``(data, labels)`` tuple of nested dicts indexed as
        ``[data_type][sentiment]`` with ``data_type`` in ``('train', 'test')``
        and ``sentiment`` in ``('pos', 'neg')``. ``data`` holds lists of review
        strings; ``labels`` holds lists of ints of the same length
        (1 for ``'pos'``, 0 for ``'neg'``).
    """
    data = {}
    labels = {}

    for data_type in ['train', 'test']:
        data[data_type] = {}
        labels[data_type] = {}

        for sentiment in ['pos', 'neg']:
            path = os.path.join(data_dir, data_type, sentiment, '*.txt')
            # glob yields files in a filesystem-dependent order; sorting makes
            # the returned lists reproducible across machines and runs.
            files = sorted(glob.glob(path))

            reviews = []
            for f in files:
                # Decode explicitly as UTF-8 instead of relying on the platform
                # default encoding, which differs between systems.
                with open(f, encoding='utf-8') as review:
                    reviews.append(review.read())

            data[data_type][sentiment] = reviews
            # Here we represent a positive review by '1' and a negative review by '0'
            labels[data_type][sentiment] = [
                1 if sentiment == 'pos' else 0] * len(reviews)

            assert len(data[data_type][sentiment]) == len(labels[data_type][sentiment]), \
                "{}/{} data size does not match labels size".format(
                    data_type, sentiment)

    return data, labels


In [7]:
# Load every review from disk, then report how many fall into each split/sentiment bucket.
data, labels = read_imdb_data()
review_counts = [
    len(data[split][polarity])
    for split in ('train', 'test')
    for polarity in ('pos', 'neg')
]
print("IMDB reviews: train = {} pos / {} neg, test = {} pos / {} neg".format(*review_counts))


IMDB reviews: train = 12500 pos / 12500 neg, test = 12500 pos / 12500 neg


In [8]:
from sklearn.utils import shuffle


def prepare_imdb_data(data, labels, random_state=None):
    """Prepare training and test sets from IMDb movie reviews.

    Concatenates the positive and negative reviews of each split and shuffles
    every split so that reviews and labels stay aligned.

    Args:
        data: Nested dict indexed as ``data[split][sentiment]`` holding lists
            of reviews, with ``split`` in ``('train', 'test')`` and
            ``sentiment`` in ``('pos', 'neg')``.
        labels: Nested dict with the same structure holding the labels.
        random_state: Forwarded to ``sklearn.utils.shuffle``. Pass an int to
            get the same ordering on every run; the default ``None`` keeps the
            previous unseeded behaviour.

    Returns:
        ``(data_train, data_test, labels_train, labels_test)``.
    """

    # Combine positive and negative reviews and labels
    data_train = data['train']['pos'] + data['train']['neg']
    data_test = data['test']['pos'] + data['test']['neg']
    labels_train = labels['train']['pos'] + labels['train']['neg']
    labels_test = labels['test']['pos'] + labels['test']['neg']

    # Shuffle reviews and corresponding labels within training and test sets.
    # Each call permutes its inputs consistently, so review i still matches label i.
    data_train, labels_train = shuffle(
        data_train, labels_train, random_state=random_state)
    data_test, labels_test = shuffle(
        data_test, labels_test, random_state=random_state)

    # Return unified training data, test data, training labels, test labels
    return data_train, data_test, labels_train, labels_test


In [9]:
# Merge pos/neg reviews per split and report the resulting split sizes.
train_X, test_X, train_y, test_y = prepare_imdb_data(data, labels)
n_train, n_test = len(train_X), len(test_X)
print("IMDb reviews (combined): train = {}, test = {}".format(n_train, n_test))

IMDb reviews (combined): train = 25000, test = 25000


In [10]:
# Show one training review followed by its label as a quick sanity check.
sample_idx = 100
for column in (train_X, train_y):
    print(column[sample_idx])

A strong woman oriented subject after long, director Krishna Vamsi's Shakti- The Power, the Desi version of the Hollywood hit Not Without My Daughter is actress Sridevi's first home-production. A story about a woman's fight against harsh injustice.<br /><br />The story of the film revolves around Nandini (Karisma Kapoor) who lives in Canada with her two uncles (Tiku Talsania, Jaspal Bhatti). There she meets Shekhar (Sanjay Kapoor), falls in love with him and they soon marry. Their family is complete when Nandini has a boy, Raja (Master Jai Gidwani). But their happiness is short lived, as the news of Shekhar's ailing mother (Deepti Naval)makes them leave their perfect life in Canada and come to India. And that's when the problems start. From the moment they reach<br /><br />India, both are shocked to see the pollution and the vast throngs of people everywhere. They take a crowded train to reach Shekhar's village and when they finally reach the station, they have to catch a long bus driv

In [12]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *

import re
from bs4 import BeautifulSoup

def review_to_words(review):
    """Convert a raw review string into a list of stemmed word tokens.

    Steps: strip HTML markup, lower-case the text, replace every character
    other than ASCII letters and digits with a space, split on whitespace,
    drop English stopwords, and stem the remaining words with the Porter
    stemmer.

    Args:
        review: Raw review text, possibly containing HTML tags.

    Returns:
        List of stemmed tokens in their original order.
    """
    # Load the stopword list once per call and keep it in a set. The previous
    # version re-read the list for every token and searched it linearly.
    # The corpus is downloaded only if it is not already installed locally.
    try:
        stop_words = set(stopwords.words("english"))
    except LookupError:
        nltk.download("stopwords", quiet=True)
        stop_words = set(stopwords.words("english"))

    # Reuse a single stemmer instead of constructing one per word.
    stemmer = PorterStemmer()

    text = BeautifulSoup(review, "html.parser").get_text()  # Remove HTML tags
    # Lower-case, then blank out anything that is not an ASCII letter or digit
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    words = text.split()  # Split string into words

    # Remove stopwords, then stem what is left
    return [stemmer.stem(w) for w in words if w not in stop_words]

ModuleNotFoundError: No module named 'nltk'