In [3]:
import pyprind
import pandas as pd
import os 

basepath = "./data/aclImdb"

labels = {"pos": 1, "neg": 0}
pbar = pyprind.ProgBar(50000)
df = pd.DataFrame()

for s in ("test", "train"):
    for l in ("pos", "neg"):
        path = os.path.join(basepath, s, l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), "r", encoding="utf-8") as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], ignore_index=True)
            pbar.update()
df.columns = ["review", "sentiment"]

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:00:59


In [6]:
import numpy as np

np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))

df.to_csv("./movie_data.csv", index=False)

df = pd.read_csv("./movie_data.csv")
df.head(3)

Unnamed: 0,review,sentiment
0,"OK,I've seen over 100 Troma films, and some of...",0
1,"How important is the director, anyway? In this...",1
2,Until the 1990s there had never been a film ba...,1


In [7]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer()
docs = np.array(["The sun is shining", 
                 "The weather is sweet", 
                 "The sun is shining the weather is sweet, and one and one is two"])
bag = count.fit_transform(docs)

In [8]:
print(count.vocabulary_)

{'the': 6, 'shining': 3, 'two': 7, 'and': 0, 'sun': 4, 'is': 1, 'one': 2, 'sweet': 5, 'weather': 8}


In [9]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf = TfidfTransformer(use_idf=True, norm="l2", smooth_idf=True)
np.set_printoptions(precision=2)

print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[ 0.    0.43  0.    0.56  0.56  0.    0.43  0.    0.  ]
 [ 0.    0.43  0.    0.    0.    0.56  0.43  0.    0.56]
 [ 0.5   0.45  0.5   0.19  0.19  0.19  0.3   0.25  0.19]]


In [12]:
def tokenizer(text):
    return text.split()

tokenizer("runner like running and thus they run")

['runner', 'like', 'running', 'and', 'thus', 'they', 'run']

In [13]:
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

tokenizer_porter("runner like running and thus they run")

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [14]:
import nltk

nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [15]:
from nltk.corpus import stopwords

stop = stopwords.words("english")
[w for w in tokenizer_porter("a runner likes running and runs a lot")[-10:] if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

# Training a logistic regression model for document classification

In [16]:
X_train = df.loc[:25000, "review"].values
y_train = df.loc[:25000, "sentiment"].values
X_test = df.loc[25000:, "review"].values
y_test = df.loc[25000:, "sentiment"].values

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.grid_search import GridSearchCV

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessessor=None)

param_grid = [{"vect__ngram_range": [(1, 1)],
               "vect__stop_words": [(stop, None)],
               "vect__tokenizer": [(tokenizer, tokenizer_porter)],
               "clf_penalty": ["l1", "l2"],
               "clf_C": [1.0, 10.0, 100.0]},
              {"vect__ngram_range": [(1, 1)],
               "vect__stop_words": [(stop, None)],
               "vect__tokenizer": [(tokenizer, tokenizer_porter)],
               "vect__use_idf": [False]
               "clf_penalty": ["l1", "l2"],
               "clf_C": [1.0, 10.0, 100.0]},             
             ]

lr_tfidf = Pipeline([("vect", tfidf),
                    ("clf", LogisticRegression(random_state=0))])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring="accuracy",
                          cv=5,
                          verbose=1,
                          n_jobs=-1)

SyntaxError: invalid syntax (<ipython-input-19-c7fddecaeaa9>, line 19)

In [20]:
def stream_docs(path):
    with open(path, "r", encoding="utf-8") as csv:
        next(csv)
        for line in csv:
            text , label = line[:-3], int(line[-2])
            yield text, label