In [2]:
import pyprind
import pandas as pd
import os 

basepath = "./data/aclImdb"

# Sub-directory name -> integer sentiment label.
labels = {"pos": 1, "neg": 0}
pbar = pyprind.ProgBar(50000)

# Collect the rows in a plain list and build the DataFrame once at the end.
# Appending to a DataFrame inside the loop copies the whole frame on every
# iteration (quadratic time), and DataFrame.append is gone in pandas 2.x.
rows = []
for s in ("test", "train"):
    for l in ("pos", "neg"):
        path = os.path.join(basepath, s, l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), "r", encoding="utf-8") as infile:
                txt = infile.read()
            rows.append([txt, labels[l]])
            pbar.update()

# Naming the columns in the constructor also works when no files were found,
# where assigning df.columns on an empty frame would raise.
df = pd.DataFrame(rows, columns=["review", "sentiment"])

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:00:59


In [3]:
import numpy as np

# Shuffle the rows reproducibly: seed NumPy's global RNG, then reorder the
# frame by a random permutation of its index.
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))

# Persist the shuffled reviews; index=False keeps the row index out of the file.
df.to_csv("./movie_data.csv", index=False)

# Reload from disk so later cells work from the saved file, then preview
# the first three rows.
df = pd.read_csv("./movie_data.csv")
df.head(3)

Unnamed: 0,review,sentiment
0,Who ARE the people that star in this thing? Ne...,1
1,"Really, average is the only word that comes to...",0
2,First off... I never considered myself an Uwe ...,0


In [4]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Three toy sentences used to illustrate the bag-of-words representation.
docs = np.array([
    "The sun is shining",
    "The weather is sweet",
    "The sun is shining the weather is sweet, and one and one is two",
])

# Learn the vocabulary from the sentences and turn them into a count matrix.
count = CountVectorizer()
bag = count.fit_transform(docs)

In [5]:
# Show the fitted vocabulary: a dict mapping each token to an integer index.
print(count.vocabulary_)

{'sweet': 5, 'the': 6, 'is': 1, 'weather': 8, 'sun': 4, 'shining': 3, 'and': 0, 'one': 2, 'two': 7}


In [6]:
from sklearn.feature_extraction.text import TfidfTransformer

# Show two decimals so the tf-idf matrix below stays readable.
np.set_printoptions(precision=2)

# Re-weight the raw term counts of the toy sentences with smoothed idf and
# L2-normalise each row, then print the result as a dense array.
tfidf = TfidfTransformer(norm="l2", use_idf=True, smooth_idf=True)
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[ 0.    0.43  0.    0.56  0.56  0.    0.43  0.    0.  ]
 [ 0.    0.43  0.    0.    0.    0.56  0.43  0.    0.56]
 [ 0.5   0.45  0.5   0.19  0.19  0.19  0.3   0.25  0.19]]


In [7]:
def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the pieces as a list."""
    tokens = text.split()
    return tokens

tokenizer("runner like running and thus they run")

['runner', 'like', 'running', 'and', 'thus', 'they', 'run']

In [8]:
from nltk.stem.porter import PorterStemmer

# One shared stemmer instance, reused by every call below.
porter = PorterStemmer()

def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of each word."""
    stems = []
    for word in text.split():
        stems.append(porter.stem(word))
    return stems

tokenizer_porter("runner like running and thus they run")

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [9]:
import nltk

# Fetch the NLTK "stopwords" corpus so that nltk.corpus.stopwords can be used.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [10]:
from nltk.corpus import stopwords

# English stop-word list; later cells reuse `stop`.
stop = stopwords.words("english")

# Stem the sample sentence, keep at most its last ten tokens, and drop every
# token that is a stop word.
list(filter(lambda w: w not in stop,
            tokenizer_porter("a runner likes running and runs a lot")[-10:]))

['runner', 'like', 'run', 'run', 'lot']

# Training a logistic regression model for document classification

In [11]:
# First 25,000 rows train the model; every remaining row is held out for testing.
# The split is positional (.iloc) on purpose: .loc slices are label-based and
# include the end label, so `.loc[:25000]` together with `.loc[25000:]` would
# put row 25000 into both sets and leak one test sample into training.
X_train = df["review"].iloc[:25000].values
y_train = df["sentiment"].iloc[:25000].values
X_test = df["review"].iloc[25000:].values
y_test = df["sentiment"].iloc[25000:].values

In [22]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.grid_search import GridSearchCV

# Lowercasing and accent stripping are switched off on the vectorizer.
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False)

# Each dict is one sub-grid; every value is the list of candidates to try.
# - The candidates are listed directly (`[stop, None]`), not wrapped in a
#   tuple: `[(stop, None)]` would be ONE candidate whose value is the tuple.
# - Pipeline parameters are addressed as `<step>__<param>` with a double
#   underscore, hence `clf__penalty` / `clf__C`.
param_grid = [{"vect__ngram_range": [(1, 1)],
               "vect__stop_words": [stop, None],
               "vect__tokenizer": [tokenizer, tokenizer_porter],
               "clf__penalty": ["l1", "l2"],
               "clf__C": [1.0, 10.0, 100.0]},
              # Same search, but with idf weighting switched off.
              {"vect__ngram_range": [(1, 1)],
               "vect__stop_words": [stop, None],
               "vect__tokenizer": [tokenizer, tokenizer_porter],
               "vect__use_idf": [False],
               "clf__penalty": ["l1", "l2"],
               "clf__C": [1.0, 10.0, 100.0]},
              ]

# Step names "vect" and "clf" are the prefixes used in param_grid above.
lr_tfidf = Pipeline([("vect", tfidf),
                     ("clf", LogisticRegression(random_state=0))])

# 5-fold cross-validated search scored by accuracy, using all CPU cores.
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring="accuracy",
                           cv=5,
                           verbose=1,
                           n_jobs=-1)

In [None]:
# Run the grid search on the training reviews and their sentiment labels.
gs_lr_tfidf.fit(X_train, y_train)

In [None]:
from sklearn.linear_model import LogisticRegression
import numpy as np
# The class is spelled StratifiedKFold; the misspelt names used previously
# ("StatifiedKFold" in the import, "StratifiedFold" in the call) made this
# cell fail before anything ran.
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import cross_val_score

# Small synthetic 3-class problem: 25 random labels and one noisy feature
# equal to the label plus standard-normal noise.
np.random.seed(0)
np.set_printoptions(precision=6)
y = [np.random.randint(3) for i in range(25)]
X = (y + np.random.randn(25)).reshape(-1, 1)

# Materialise the five stratified folds once so that the very same splits
# can be handed to more than one evaluation call.
cv5_idx = list(StratifiedKFold(y, n_folds=5, shuffle=False, random_state=0))

cross_val_score(LogisticRegression(random_state=123), X, y, cv=cv5_idx)

In [None]:
from sklearn.grid_search import GridSearchCV

# Grid search with an empty parameter grid, evaluated on the precomputed
# folds in cv5_idx; verbose=3 makes the search report its progress.
gs = GridSearchCV(estimator=LogisticRegression(),
                  param_grid={},
                  cv=cv5_idx,
                  verbose=3)
gs = gs.fit(X, y)

In [None]:
# Best score found by the grid search ...
print(gs.best_score_)
# ... next to the mean cross_val_score over the same folds.
print(np.mean(cross_val_score(LogisticRegression(), X, y, cv=cv5_idx)))

In [23]:
import numpy as np
import re
from nltk.corpus import stopwords

def tokenizer(text, stop_words=None):
    """Clean a raw review and split it into tokens with stop words removed.

    Processing steps, in order:

    1. remove anything that looks like a markup tag (``<...>``);
    2. collect emoticons such as ``:)``, ``;-(``, ``=D`` or ``:P``;
    3. lowercase the text and replace every run of non-word characters
       with a single space;
    4. append the emoticons, with any ``-`` nose removed;
    5. split on whitespace and drop every token found in ``stop_words``.

    This redefines the whitespace-only ``tokenizer`` from earlier in the
    notebook.

    Parameters
    ----------
    text : str
        Raw document text.
    stop_words : collection of str, optional
        Tokens to discard. When omitted, the notebook-level ``stop`` list is
        used.

    Returns
    -------
    list of str
        Lowercased word tokens followed by the emoticons found in ``text``.
    """
    if stop_words is None:
        stop_words = stop
    text = re.sub(r"<[^>]*>", "", text)
    # Search the original-case text: the pattern's `D` and `P` alternatives
    # are uppercase and can never match once the text has been lowercased.
    emoticons = re.findall(r"(?::|;|=)(?:-)?(?:\)|\(|D|P)", text)
    # The explicit " " keeps the first emoticon from fusing with the last
    # word when the cleaned text does not already end in a space.
    text = (re.sub(r"[\W]+", " ", text.lower())
            + " "
            + " ".join(emoticons).replace("-", ""))
    return [w for w in text.split() if w not in stop_words]

def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from a CSV file, one per line.

    The first line is treated as a header and skipped. Every other non-empty
    line is split at its last comma: the part before it is returned verbatim
    as the text (CSV quoting is not decoded) and the part after it is parsed
    as the integer label.

    Parameters
    ----------
    path : str
        Location of a UTF-8 encoded CSV file whose last column is an integer.

    Yields
    ------
    tuple of (str, int)
        The raw text and its label.

    Raises
    ------
    ValueError
        If the part after a line's last comma is not an integer.
    """
    with open(path, "r", encoding="utf-8") as infile:
        # A default avoids StopIteration escaping the generator (which would
        # surface as RuntimeError) when the file is completely empty.
        next(infile, None)
        for line in infile:
            # Strip the line ending explicitly, not by fixed offsets, so
            # a final line without a trailing newline parses correctly.
            line = line.rstrip("\r\n")
            if not line:
                continue
            text, _, label = line.rpartition(",")
            yield text, int(label)

In [24]:
# Pull the first (text, label) pair from the saved CSV as a quick check.
next(stream_docs(path="./movie_data.csv"))

('"Who ARE the people that star in this thing? Never heard of them!! But this is one of the funniest comedies I have run across. It should win the Putz Puller Prize for Parody. The absurd starts with Dr. Jeykl snorting his powder and turning into a sex fiend.He is pursued by libido driven nurse early in the movie in one of the funniest scenes of the movie. Pay attention to the hospital PA system in the background; rather like the system in MASH. The final scene with Hyde accepting the award has had me laughing for years. Oh... and the ""Busty Nurse"" is Cassandra Peterson, who went on to become Elvira, Mistress of the Dark. <br /><br />If you liked the Mel Brooks classic movies (Blazing Saddles, etc.), I suspect you\'d like this one.<br /><br />Damn shame you can\'t get it on DVD anywhere.<br /><br />It\'s available on DVD now !!!!! Good thing DVDs don\'t wear out from use !!!!!"',
 1)

In [None]:
def get_minibatch(doc_stream. size):
    docs