In [1]:
import pyprind
import pandas as pd
import os 

# Root of the extracted aclImdb archive; reviews are read from
# <basepath>/<split>/<label>/<file>.
basepath = "./data/aclImdb"

# Sub-directory name -> integer sentiment label.
labels = {"pos": 1, "neg": 0}
# One tick per review file; 50000 is presumably the total file count -- verify
# against the dataset if the bar finishes early or overruns.
pbar = pyprind.ProgBar(50000)

# Collect the rows in a plain list and build the DataFrame once at the end.
# Growing a DataFrame row by row copies the whole frame on every iteration
# (quadratic in the number of files), and DataFrame.append does not exist in
# pandas >= 2.0.
rows = []
for split in ("test", "train"):
    for label in ("pos", "neg"):
        path = os.path.join(basepath, split, label)
        # NOTE(review): os.listdir returns entries in arbitrary order, so the
        # row order of `df` can differ between machines/filesystems.
        for fname in os.listdir(path):
            with open(os.path.join(path, fname), "r", encoding="utf-8") as infile:
                rows.append((infile.read(), labels[label]))
            pbar.update()

df = pd.DataFrame(rows, columns=["review", "sentiment"])

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:00:56


In [2]:
import numpy as np

# Seed NumPy's global RNG so the permutation below is the same on every run.
np.random.seed(0)
# Shuffle the rows: reindex the frame with a random permutation of its own index.
df = df.reindex(np.random.permutation(df.index))

# Persist the shuffled reviews; index=False keeps the (now shuffled) index
# labels out of the file.
df.to_csv("./movie_data.csv", index=False)

# Reload from disk. No index column was written, so the reloaded frame gets a
# fresh default index that follows the shuffled row order.
df = pd.read_csv("./movie_data.csv")
df.head(3)

Unnamed: 0,review,sentiment
0,Who ARE the people that star in this thing? Ne...,1
1,"Really, average is the only word that comes to...",0
2,First off... I never considered myself an Uwe ...,0


In [3]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Three toy documents used to illustrate the bag-of-words representation.
docs = np.array([
    "The sun is shining",
    "The weather is sweet",
    "The sun is shining the weather is sweet, and one and one is two",
])

# Learn the vocabulary from the documents and turn them into a
# document-by-term count matrix in a single step.
count = CountVectorizer()
bag = count.fit_transform(docs)

In [4]:
# Show the fitted vocabulary: each term mapped to its integer column index.
print(count.vocabulary_)

{'sweet': 5, 'weather': 8, 'sun': 4, 'the': 6, 'is': 1, 'shining': 3, 'two': 7, 'one': 2, 'and': 0}


In [5]:
from sklearn.feature_extraction.text import TfidfTransformer

# Print arrays with two decimals so the tf-idf matrix stays readable.
np.set_printoptions(precision=2)

# Transformer that re-weights raw term counts: smoothed idf, L2-normalised rows.
tfidf = TfidfTransformer(norm="l2", use_idf=True, smooth_idf=True)

# Re-fit the count vectorizer on the toy documents, weight the counts and
# print the result as a dense array.
print(
    tfidf.fit_transform(
        count.fit_transform(docs)
    ).toarray()
)

[[ 0.    0.43  0.    0.56  0.56  0.    0.43  0.    0.  ]
 [ 0.    0.43  0.    0.    0.    0.56  0.43  0.    0.56]
 [ 0.5   0.45  0.5   0.19  0.19  0.19  0.3   0.25  0.19]]


In [6]:
def tokenizer(text):
    """Return the whitespace-separated tokens of ``text`` as a list."""
    tokens = text.split()
    return tokens


tokenizer("runner like running and thus they run")

['runner', 'like', 'running', 'and', 'thus', 'they', 'run']

In [7]:
from nltk.stem.porter import PorterStemmer

# Single shared stemmer instance, reused by every call to tokenizer_porter.
porter = PorterStemmer()


def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of each token."""
    stems = []
    for word in text.split():
        stems.append(porter.stem(word))
    return stems


tokenizer_porter("runner like running and thus they run")

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [8]:
import nltk

# Fetch NLTK's stop-word corpus so that nltk.corpus.stopwords can be loaded.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [9]:
from nltk.corpus import stopwords

# NLTK's English stop-word list.
stop = stopwords.words("english")

# Stem the sample sentence, then keep only the stems that are not stop words.
[w for w in tokenizer_porter("a runner likes running and runs a lot") if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

# Training a logistic regression model for document classification

In [10]:
# Size of the training split; every remaining row is held out for testing.
n_train = 25000

# Slice by position with .iloc, which is half-open like normal Python slicing,
# so train and test are disjoint. A label-based df.loc[:25000] would include
# row 25000 as well, giving 25001 training rows and putting that row in both
# splits.
X_train = df["review"].iloc[:n_train].values
y_train = df["sentiment"].iloc[:n_train].values
X_test = df["review"].iloc[n_train:].values
y_test = df["sentiment"].iloc[n_train:].values

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.grid_search import GridSearchCV

# Vectorizer that leaves the text untouched before tokenisation: no accent
# stripping, no lower-casing, no custom preprocessor.
tfidf = TfidfVectorizer(preprocessor=None,
                        lowercase=False,
                        strip_accents=None)

# Two grids that share every setting. The first keeps the vectorizer's own
# idf/norm settings; the second additionally switches off idf weighting and
# normalisation.
param_grid = [
    dict({'vect__ngram_range': [(1, 1)],
          'vect__stop_words': [stop, None],
          'vect__tokenizer': [tokenizer, tokenizer_porter],
          'clf__penalty': ['l1', 'l2'],
          'clf__C': [1.0, 10.0, 100.0]},
         **vectorizer_overrides)
    for vectorizer_overrides in (
        {},
        {'vect__use_idf': [False], 'vect__norm': [None]},
    )
]

# Vectorise the raw text, then classify with logistic regression.
lr_tfidf = Pipeline([
    ('vect', tfidf),
    ('clf', LogisticRegression(random_state=0)),
])

# Exhaustive 5-fold search over both grids, scored by accuracy, on all cores.
gs_lr_tfidf = GridSearchCV(estimator=lr_tfidf,
                           param_grid=param_grid,
                           cv=5,
                           scoring='accuracy',
                           n_jobs=-1,
                           verbose=1)

In [12]:
# Run the cross-validated grid search on the training split
# (48 candidates x 5 folds = 240 pipeline fits, spread over all cores).
# NOTE(review): the recorded run of this cell ended in a JoblibIndexError
# raised in a worker while fitting a candidate that used tokenizer_porter;
# the root cause is not visible here -- investigate before relying on results.
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:   18.3s


JoblibIndexError: JoblibIndexError
___________________________________________________________________________
Multiprocessing exception:
    ...........................................................................
/usr/lib/python3.4/runpy.py in _run_module_as_main(mod_name='IPython.kernel.__main__', alter_argv=1)
    165         sys.exit(msg)
    166     main_globals = sys.modules["__main__"].__dict__
    167     if alter_argv:
    168         sys.argv[0] = mod_spec.origin
    169     return _run_code(code, main_globals, None,
--> 170                      "__main__", mod_spec)
        mod_spec = ModuleSpec(name='IPython.kernel.__main__', loade...hon3.4/dist-packages/IPython/kernel/__main__.py')
    171 
    172 def run_module(mod_name, init_globals=None,
    173                run_name=None, alter_sys=False):
    174     """Execute a module's code without importing it

...........................................................................
/usr/lib/python3.4/runpy.py in _run_code(code=<code object <module> at 0x7f341c93a4b0, file "/...ist-packages/IPython/kernel/__main__.py", line 1>, run_globals={'__builtins__': <module 'builtins' (built-in)>, '__cached__': '/usr/local/lib/python3.4/dist-packages/IPython/kernel/__pycache__/__main__.cpython-34.pyc', '__doc__': None, '__file__': '/usr/local/lib/python3.4/dist-packages/IPython/kernel/__main__.py', '__loader__': <_frozen_importlib.SourceFileLoader object>, '__name__': '__main__', '__package__': 'IPython.kernel', '__spec__': ModuleSpec(name='IPython.kernel.__main__', loade...hon3.4/dist-packages/IPython/kernel/__main__.py'), 'app': <module 'IPython.kernel.zmq.kernelapp' from '/us...4/dist-packages/IPython/kernel/zmq/kernelapp.py'>}, init_globals=None, mod_name='__main__', mod_spec=ModuleSpec(name='IPython.kernel.__main__', loade...hon3.4/dist-packages/IPython/kernel/__main__.py'), pkg_name='IPython.kernel', script_name=None)
     80                        __cached__ = cached,
     81                        __doc__ = None,
     82                        __loader__ = loader,
     83                        __package__ = pkg_name,
     84                        __spec__ = mod_spec)
---> 85     exec(code, run_globals)
        code = <code object <module> at 0x7f341c93a4b0, file "/...ist-packages/IPython/kernel/__main__.py", line 1>
        run_globals = {'__builtins__': <module 'builtins' (built-in)>, '__cached__': '/usr/local/lib/python3.4/dist-packages/IPython/kernel/__pycache__/__main__.cpython-34.pyc', '__doc__': None, '__file__': '/usr/local/lib/python3.4/dist-packages/IPython/kernel/__main__.py', '__loader__': <_frozen_importlib.SourceFileLoader object>, '__name__': '__main__', '__package__': 'IPython.kernel', '__spec__': ModuleSpec(name='IPython.kernel.__main__', loade...hon3.4/dist-packages/IPython/kernel/__main__.py'), 'app': <module 'IPython.kernel.zmq.kernelapp' from '/us...4/dist-packages/IPython/kernel/zmq/kernelapp.py'>}
     86     return run_globals
     87 
     88 def _run_module_code(code, init_globals=None,
     89                     mod_name=None, mod_spec=None,

...........................................................................
/usr/local/lib/python3.4/dist-packages/IPython/kernel/__main__.py in <module>()
      1 
      2 
----> 3 
      4 if __name__ == '__main__':
      5     from IPython.kernel.zmq import kernelapp as app
      6     app.launch_new_instance()
      7 
      8 
      9 
     10 

...........................................................................
/usr/local/lib/python3.4/dist-packages/IPython/config/application.py in launch_instance(cls=<class 'IPython.kernel.zmq.kernelapp.IPKernelApp'>, argv=None, **kwargs={})
    569         
    570         If a global instance already exists, this reinitializes and starts it
    571         """
    572         app = cls.instance(**kwargs)
    573         app.initialize(argv)
--> 574         app.start()
        app.start = <bound method IPKernelApp.start of <IPython.kernel.zmq.kernelapp.IPKernelApp object>>
    575 
    576 #-----------------------------------------------------------------------------
    577 # utility functions, for convenience
    578 #-----------------------------------------------------------------------------

...........................................................................
/usr/local/lib/python3.4/dist-packages/IPython/kernel/zmq/kernelapp.py in start(self=<IPython.kernel.zmq.kernelapp.IPKernelApp object>)
    368     def start(self):
    369         if self.poller is not None:
    370             self.poller.start()
    371         self.kernel.start()
    372         try:
--> 373             ioloop.IOLoop.instance().start()
    374         except KeyboardInterrupt:
    375             pass
    376 
    377 launch_new_instance = IPKernelApp.launch_instance

...........................................................................
/usr/local/lib/python3.4/dist-packages/zmq/eventloop/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    157             PollIOLoop.configure(ZMQIOLoop)
    158         return PollIOLoop.current(*args, **kwargs)
    159     
    160     def start(self):
    161         try:
--> 162             super(ZMQIOLoop, self).start()
        self.start = <bound method ZMQIOLoop.start of <zmq.eventloop.ioloop.ZMQIOLoop object>>
    163         except ZMQError as e:
    164             if e.errno == ETERM:
    165                 # quietly return on ETERM
    166                 pass

...........................................................................
/usr/local/lib/python3.4/dist-packages/tornado/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    878                 self._events.update(event_pairs)
    879                 while self._events:
    880                     fd, events = self._events.popitem()
    881                     try:
    882                         fd_obj, handler_func = self._handlers[fd]
--> 883                         handler_func(fd_obj, events)
        handler_func = <function wrap.<locals>.null_wrapper>
        fd_obj = <zmq.sugar.socket.Socket object>
        events = 5
    884                     except (OSError, IOError) as e:
    885                         if errno_from_exception(e) == errno.EPIPE:
    886                             # Happens when the client closes the connection
    887                             pass

...........................................................................
/usr/local/lib/python3.4/dist-packages/tornado/stack_context.py in null_wrapper(*args=(<zmq.sugar.socket.Socket object>, 5), **kwargs={})
    270         # Fast path when there are no active contexts.
    271         def null_wrapper(*args, **kwargs):
    272             try:
    273                 current_state = _state.contexts
    274                 _state.contexts = cap_contexts[0]
--> 275                 return fn(*args, **kwargs)
        args = (<zmq.sugar.socket.Socket object>, 5)
        kwargs = {}
    276             finally:
    277                 _state.contexts = current_state
    278         null_wrapper._wrapped = True
    279         return null_wrapper

...........................................................................
/usr/local/lib/python3.4/dist-packages/zmq/eventloop/zmqstream.py in _handle_events(self=<zmq.eventloop.zmqstream.ZMQStream object>, fd=<zmq.sugar.socket.Socket object>, events=5)
    435             # dispatch events:
    436             if events & IOLoop.ERROR:
    437                 gen_log.error("got POLLERR event on ZMQStream, which doesn't make sense")
    438                 return
    439             if events & IOLoop.READ:
--> 440                 self._handle_recv()
        self._handle_recv = <bound method ZMQStream._handle_recv of <zmq.eventloop.zmqstream.ZMQStream object>>
    441                 if not self.socket:
    442                     return
    443             if events & IOLoop.WRITE:
    444                 self._handle_send()

...........................................................................
/usr/local/lib/python3.4/dist-packages/zmq/eventloop/zmqstream.py in _handle_recv(self=<zmq.eventloop.zmqstream.ZMQStream object>)
    467                 gen_log.error("RECV Error: %s"%zmq.strerror(e.errno))
    468         else:
    469             if self._recv_callback:
    470                 callback = self._recv_callback
    471                 # self._recv_callback = None
--> 472                 self._run_callback(callback, msg)
        self._run_callback = <bound method ZMQStream._run_callback of <zmq.eventloop.zmqstream.ZMQStream object>>
        callback = <function wrap.<locals>.null_wrapper>
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    473                 
    474         # self.update_state()
    475         
    476 

...........................................................................
/usr/local/lib/python3.4/dist-packages/zmq/eventloop/zmqstream.py in _run_callback(self=<zmq.eventloop.zmqstream.ZMQStream object>, callback=<function wrap.<locals>.null_wrapper>, *args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    409         close our socket."""
    410         try:
    411             # Use a NullContext to ensure that all StackContexts are run
    412             # inside our blanket exception handler rather than outside.
    413             with stack_context.NullContext():
--> 414                 callback(*args, **kwargs)
        callback = <function wrap.<locals>.null_wrapper>
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    415         except:
    416             gen_log.error("Uncaught exception, closing connection.",
    417                           exc_info=True)
    418             # Close the socket on an uncaught exception from a user callback

...........................................................................
/usr/local/lib/python3.4/dist-packages/tornado/stack_context.py in null_wrapper(*args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    270         # Fast path when there are no active contexts.
    271         def null_wrapper(*args, **kwargs):
    272             try:
    273                 current_state = _state.contexts
    274                 _state.contexts = cap_contexts[0]
--> 275                 return fn(*args, **kwargs)
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    276             finally:
    277                 _state.contexts = current_state
    278         null_wrapper._wrapped = True
    279         return null_wrapper

...........................................................................
/usr/local/lib/python3.4/dist-packages/IPython/kernel/zmq/kernelbase.py in dispatcher(msg=[<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>])
    247         if self.control_stream:
    248             self.control_stream.on_recv(self.dispatch_control, copy=False)
    249 
    250         def make_dispatcher(stream):
    251             def dispatcher(msg):
--> 252                 return self.dispatch_shell(stream, msg)
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    253             return dispatcher
    254 
    255         for s in self.shell_streams:
    256             s.on_recv(make_dispatcher(s), copy=False)

...........................................................................
/usr/local/lib/python3.4/dist-packages/IPython/kernel/zmq/kernelbase.py in dispatch_shell(self=<IPython.kernel.zmq.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, msg={'buffers': [], 'content': {'allow_stdin': True, 'code': 'gs_lr_tfidf.fit(X_train, y_train)', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'msg_id': '5EB6E9184D4847FC81967D8606E502E8', 'msg_type': 'execute_request', 'session': '097527A6D76A4C3085695492836F77F4', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': '5EB6E9184D4847FC81967D8606E502E8', 'msg_type': 'execute_request', 'parent_header': {}})
    208         else:
    209             # ensure default_int_handler during handler call
    210             sig = signal(SIGINT, default_int_handler)
    211             self.log.debug("%s: %s", msg_type, msg)
    212             try:
--> 213                 handler(stream, idents, msg)
        handler = <bound method IPythonKernel.execute_request of <IPython.kernel.zmq.ipkernel.IPythonKernel object>>
        stream = <zmq.eventloop.zmqstream.ZMQStream object>
        idents = [b'097527A6D76A4C3085695492836F77F4']
        msg = {'buffers': [], 'content': {'allow_stdin': True, 'code': 'gs_lr_tfidf.fit(X_train, y_train)', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'msg_id': '5EB6E9184D4847FC81967D8606E502E8', 'msg_type': 'execute_request', 'session': '097527A6D76A4C3085695492836F77F4', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': '5EB6E9184D4847FC81967D8606E502E8', 'msg_type': 'execute_request', 'parent_header': {}}
    214             except Exception:
    215                 self.log.error("Exception in message handler:", exc_info=True)
    216             finally:
    217                 signal(SIGINT, sig)

...........................................................................
/usr/local/lib/python3.4/dist-packages/IPython/kernel/zmq/kernelbase.py in execute_request(self=<IPython.kernel.zmq.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, ident=[b'097527A6D76A4C3085695492836F77F4'], parent={'buffers': [], 'content': {'allow_stdin': True, 'code': 'gs_lr_tfidf.fit(X_train, y_train)', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'msg_id': '5EB6E9184D4847FC81967D8606E502E8', 'msg_type': 'execute_request', 'session': '097527A6D76A4C3085695492836F77F4', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': '5EB6E9184D4847FC81967D8606E502E8', 'msg_type': 'execute_request', 'parent_header': {}})
    357         if not silent:
    358             self.execution_count += 1
    359             self._publish_execute_input(code, parent, self.execution_count)
    360         
    361         reply_content = self.do_execute(code, silent, store_history,
--> 362                                         user_expressions, allow_stdin)
        user_expressions = {}
        allow_stdin = True
    363 
    364         # Flush output before sending the reply.
    365         sys.stdout.flush()
    366         sys.stderr.flush()

...........................................................................
/usr/local/lib/python3.4/dist-packages/IPython/kernel/zmq/ipkernel.py in do_execute(self=<IPython.kernel.zmq.ipkernel.IPythonKernel object>, code='gs_lr_tfidf.fit(X_train, y_train)', silent=False, store_history=True, user_expressions={}, allow_stdin=True)
    176 
    177         reply_content = {}
    178         # FIXME: the shell calls the exception handler itself.
    179         shell._reply_content = None
    180         try:
--> 181             shell.run_cell(code, store_history=store_history, silent=silent)
        shell.run_cell = <bound method ZMQInteractiveShell.run_cell of <I....kernel.zmq.zmqshell.ZMQInteractiveShell object>>
        code = 'gs_lr_tfidf.fit(X_train, y_train)'
        store_history = True
        silent = False
    182         except:
    183             status = u'error'
    184             # FIXME: this code right now isn't being used yet by default,
    185             # because the run_cell() call above directly fires off exception

...........................................................................
/usr/local/lib/python3.4/dist-packages/IPython/core/interactiveshell.py in run_cell(self=<IPython.kernel.zmq.zmqshell.ZMQInteractiveShell object>, raw_cell='gs_lr_tfidf.fit(X_train, y_train)', store_history=True, silent=False, shell_futures=True)
   2866                 self.displayhook.exec_result = result
   2867 
   2868                 # Execute the user code
   2869                 interactivity = "none" if silent else self.ast_node_interactivity
   2870                 self.run_ast_nodes(code_ast.body, cell_name,
-> 2871                    interactivity=interactivity, compiler=compiler, result=result)
        interactivity = 'last_expr'
        compiler = <IPython.core.compilerop.CachingCompiler object>
   2872 
   2873                 # Reset this so later displayed values do not modify the
   2874                 # ExecutionResult
   2875                 self.displayhook.exec_result = None

...........................................................................
/usr/local/lib/python3.4/dist-packages/IPython/core/interactiveshell.py in run_ast_nodes(self=<IPython.kernel.zmq.zmqshell.ZMQInteractiveShell object>, nodelist=[<_ast.Expr object>], cell_name='<ipython-input-12-7c8b397eb30b>', interactivity='last', compiler=<IPython.core.compilerop.CachingCompiler object>, result=<IPython.core.interactiveshell.ExecutionResult object>)
   2976                     return True
   2977 
   2978             for i, node in enumerate(to_run_interactive):
   2979                 mod = ast.Interactive([node])
   2980                 code = compiler(mod, cell_name, "single")
-> 2981                 if self.run_code(code, result):
        self.run_code = <bound method ZMQInteractiveShell.run_code of <I....kernel.zmq.zmqshell.ZMQInteractiveShell object>>
        code = <code object <module> at 0x7f33c8ac0d20, file "<ipython-input-12-7c8b397eb30b>", line 1>
        result = <IPython.core.interactiveshell.ExecutionResult object>
   2982                     return True
   2983 
   2984             # Flush softspace
   2985             if softspace(sys.stdout, 0):

...........................................................................
/usr/local/lib/python3.4/dist-packages/IPython/core/interactiveshell.py in run_code(self=<IPython.kernel.zmq.zmqshell.ZMQInteractiveShell object>, code_obj=<code object <module> at 0x7f33c8ac0d20, file "<ipython-input-12-7c8b397eb30b>", line 1>, result=<IPython.core.interactiveshell.ExecutionResult object>)
   3030         outflag = 1  # happens in more places, so it's easier as default
   3031         try:
   3032             try:
   3033                 self.hooks.pre_run_code_hook()
   3034                 #rprint('Running code', repr(code_obj)) # dbg
-> 3035                 exec(code_obj, self.user_global_ns, self.user_ns)
        code_obj = <code object <module> at 0x7f33c8ac0d20, file "<ipython-input-12-7c8b397eb30b>", line 1>
        self.user_global_ns = {'CountVectorizer': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'GridSearchCV': <class 'sklearn.grid_search.GridSearchCV'>, 'In': ['', 'import pyprind\nimport pandas as pd\nimport os \n\nb...bar.update()\ndf.columns = ["review", "sentiment"]', 'import numpy as np\n\nnp.random.seed(0)\ndf = df.re...\n\ndf = pd.read_csv("./movie_data.csv")\ndf.head(3)', 'import numpy as np\nfrom sklearn.feature_extracti...and one is two"])\nbag = count.fit_transform(docs)', 'print(count.vocabulary_)', 'from sklearn.feature_extraction.text import Tfid...t_transform(count.fit_transform(docs)).toarray())', 'def tokenizer(text):\n    return text.split()\n\ntokenizer("runner like running and thus they run")', 'from nltk.stem.porter import PorterStemmer\n\nport...r_porter("runner like running and thus they run")', 'import nltk\n\nnltk.download("stopwords")', 'from nltk.corpus import stopwords\n\nstop = stopwo... running and runs a lot")[-10:] if w not in stop]', 'X_train = df.loc[:25000, "review"].values\ny_trai...alues\ny_test = df.loc[25000:, "sentiment"].values', 'from sklearn.pipeline import Pipeline\nfrom sklea... verbose=1,\n                           n_jobs=-1)', 'gs_lr_tfidf.fit(X_train, y_train)'], 'LogisticRegression': <class 'sklearn.linear_model.logistic.LogisticRegression'>, 'Out': {2:                                               re.... I never considered myself an Uwe ...          0, 6: ['runner', 'like', 'running', 'and', 'thus', 'they', 'run'], 7: ['runner', 'like', 'run', 'and', 'thu', 'they', 'run'], 8: True, 9: ['runner', 'like', 'run', 'run', 'lot']}, 'Pipeline': <class 'sklearn.pipeline.Pipeline'>, 'PorterStemmer': <class 'nltk.stem.porter.PorterStemmer'>, 'TfidfTransformer': <class 'sklearn.feature_extraction.text.TfidfTransformer'>, 'TfidfVectorizer': <class 'sklearn.feature_extraction.text.TfidfVectorizer'>, 'X_test': array([ "I really liked the idea of traveling be... now. 
This movie WILL be on DVD."], dtype=object), ...}
        self.user_ns = {'CountVectorizer': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'GridSearchCV': <class 'sklearn.grid_search.GridSearchCV'>, 'In': ['', 'import pyprind\nimport pandas as pd\nimport os \n\nb...bar.update()\ndf.columns = ["review", "sentiment"]', 'import numpy as np\n\nnp.random.seed(0)\ndf = df.re...\n\ndf = pd.read_csv("./movie_data.csv")\ndf.head(3)', 'import numpy as np\nfrom sklearn.feature_extracti...and one is two"])\nbag = count.fit_transform(docs)', 'print(count.vocabulary_)', 'from sklearn.feature_extraction.text import Tfid...t_transform(count.fit_transform(docs)).toarray())', 'def tokenizer(text):\n    return text.split()\n\ntokenizer("runner like running and thus they run")', 'from nltk.stem.porter import PorterStemmer\n\nport...r_porter("runner like running and thus they run")', 'import nltk\n\nnltk.download("stopwords")', 'from nltk.corpus import stopwords\n\nstop = stopwo... running and runs a lot")[-10:] if w not in stop]', 'X_train = df.loc[:25000, "review"].values\ny_trai...alues\ny_test = df.loc[25000:, "sentiment"].values', 'from sklearn.pipeline import Pipeline\nfrom sklea... verbose=1,\n                           n_jobs=-1)', 'gs_lr_tfidf.fit(X_train, y_train)'], 'LogisticRegression': <class 'sklearn.linear_model.logistic.LogisticRegression'>, 'Out': {2:                                               re.... I never considered myself an Uwe ...          0, 6: ['runner', 'like', 'running', 'and', 'thus', 'they', 'run'], 7: ['runner', 'like', 'run', 'and', 'thu', 'they', 'run'], 8: True, 9: ['runner', 'like', 'run', 'run', 'lot']}, 'Pipeline': <class 'sklearn.pipeline.Pipeline'>, 'PorterStemmer': <class 'nltk.stem.porter.PorterStemmer'>, 'TfidfTransformer': <class 'sklearn.feature_extraction.text.TfidfTransformer'>, 'TfidfVectorizer': <class 'sklearn.feature_extraction.text.TfidfVectorizer'>, 'X_test': array([ "I really liked the idea of traveling be... now. This movie WILL be on DVD."], dtype=object), ...}
   3036             finally:
   3037                 # Reset our crash handler in place
   3038                 sys.excepthook = old_excepthook
   3039         except SystemExit as e:

...........................................................................
/home/<ipython-input-12-7c8b397eb30b> in <module>()
----> 1 
      2 
      3 
      4 
      5 
      6 gs_lr_tfidf.fit(X_train, y_train)
      7 
      8 
      9 
     10 

...........................................................................
/usr/local/lib/python3.4/dist-packages/sklearn/grid_search.py in fit(self=GridSearchCV(cv=5,
       estimator=Pipeline(ste..._func=None,
       scoring='accuracy', verbose=1), X=array([ 'Who ARE the people that star in this th...an a tired band of knit pickers."], dtype=object), y=array([1, 0, 0, ..., 1, 1, 0]))
    591         y : array-like, shape = [n_samples] or [n_samples, n_output], optional
    592             Target relative to X for classification or regression;
    593             None for unsupervised learning.
    594 
    595         """
--> 596         return self._fit(X, y, ParameterGrid(self.param_grid))
        self._fit = <bound method GridSearchCV._fit of GridSearchCV(...func=None,
       scoring='accuracy', verbose=1)>
        X = array([ 'Who ARE the people that star in this th...an a tired band of knit pickers."], dtype=object)
        y = array([1, 0, 0, ..., 1, 1, 0])
        self.param_grid = [{'clf__C': [1.0, 10.0, 100.0], 'clf__penalty': ['l1', 'l2'], 'vect__ngram_range': [(1, 1)], 'vect__stop_words': [['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', ...], None], 'vect__tokenizer': [<function tokenizer>, <function tokenizer_porter>]}, {'clf__C': [1.0, 10.0, 100.0], 'clf__penalty': ['l1', 'l2'], 'vect__ngram_range': [(1, 1)], 'vect__norm': [None], 'vect__stop_words': [['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', ...], None], 'vect__tokenizer': [<function tokenizer>, <function tokenizer_porter>], 'vect__use_idf': [False]}]
    597 
    598 
    599 class RandomizedSearchCV(BaseSearchCV):
    600     """Randomized search on hyper parameters.

...........................................................................
/usr/local/lib/python3.4/dist-packages/sklearn/grid_search.py in _fit(self=GridSearchCV(cv=5,
       estimator=Pipeline(ste..._func=None,
       scoring='accuracy', verbose=1), X=array([ 'Who ARE the people that star in this th...an a tired band of knit pickers."], dtype=object), y=array([1, 0, 0, ..., 1, 1, 0]), parameter_iterable=<sklearn.grid_search.ParameterGrid object>)
    373             pre_dispatch=pre_dispatch
    374         )(
    375             delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
    376                                     train, test, self.verbose, parameters,
    377                                     self.fit_params, return_parameters=True)
--> 378             for parameters in parameter_iterable
        parameters = undefined
        parameter_iterable = <sklearn.grid_search.ParameterGrid object>
    379             for train, test in cv)
    380 
    381         # Out is a list of triplet: score, estimator, n_test_samples
    382         n_fits = len(out)

...........................................................................
/usr/local/lib/python3.4/dist-packages/sklearn/externals/joblib/parallel.py in __call__(self=Parallel(n_jobs=-1), iterable=<itertools.islice object>)
    655             if pre_dispatch == "all" or n_jobs == 1:
    656                 # The iterable was consumed all at once by the above for loop.
    657                 # No need to wait for async callbacks to trigger to
    658                 # consumption.
    659                 self._iterating = False
--> 660             self.retrieve()
        self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=-1)>
    661             # Make sure that we get a last message telling us we are done
    662             elapsed_time = time.time() - self._start_time
    663             self._print('Done %3i out of %3i | elapsed: %s finished',
    664                         (len(self._output),

    ---------------------------------------------------------------------------
    Sub-process traceback:
    ---------------------------------------------------------------------------
    IndexError                                         Mon Mar 27 22:22:23 2017
PID: 51                                      Python 3.4.3: /usr/bin/python3
...........................................................................
/usr/local/lib/python3.4/dist-packages/sklearn/cross_validation.py in _fit_and_score(estimator=Pipeline(steps=[('vect', TfidfVectorizer(analyze...g=1, penalty='l1', random_state=0, tol=0.0001))]), X=array([ 'Who ARE the people that star in this th...an a tired band of knit pickers."], dtype=object), y=array([1, 0, 0, ..., 1, 1, 0]), scorer=make_scorer(accuracy_score), train=array([ 4942,  4948,  4949, ..., 24998, 24999, 25000]), test=array([   0,    1,    2, ..., 5066, 5067, 5068]), verbose=1, parameters={'clf__C': 1.0, 'clf__penalty': 'l1', 'vect__ngram_range': (1, 1), 'vect__stop_words': ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', ...], 'vect__tokenizer': <function tokenizer_porter>}, fit_params={}, return_train_score=False, return_parameters=True)
   1234     X_train, y_train = _safe_split(estimator, X, y, train)
   1235     X_test, y_test = _safe_split(estimator, X, y, test, train)
   1236     if y_train is None:
   1237         estimator.fit(X_train, **fit_params)
   1238     else:
-> 1239         estimator.fit(X_train, y_train, **fit_params)
        estimator.fit = <bound method Pipeline.fit of Pipeline(steps=[('...=1, penalty='l1', random_state=0, tol=0.0001))])>
        X_train = array([ 'I have nothing to comment on this movie...an a tired band of knit pickers."], dtype=object)
        y_train = array([0, 0, 0, ..., 1, 1, 0])
        fit_params = {}
   1240     test_score = _score(estimator, X_test, y_test, scorer)
   1241     if return_train_score:
   1242         train_score = _score(estimator, X_train, y_train, scorer)
   1243 

...........................................................................
/usr/local/lib/python3.4/dist-packages/sklearn/pipeline.py in fit(self=Pipeline(steps=[('vect', TfidfVectorizer(analyze...g=1, penalty='l1', random_state=0, tol=0.0001))]), X=array([ 'I have nothing to comment on this movie...an a tired band of knit pickers."], dtype=object), y=array([0, 0, 0, ..., 1, 1, 0]), **fit_params={})
    124 
    125     def fit(self, X, y=None, **fit_params):
    126         """Fit all the transforms one after the other and transform the
    127         data, then fit the transformed data using the final estimator.
    128         """
--> 129         Xt, fit_params = self._pre_transform(X, y, **fit_params)
        Xt = undefined
        fit_params = {}
        self._pre_transform = <bound method Pipeline._pre_transform of Pipelin...=1, penalty='l1', random_state=0, tol=0.0001))])>
        X = array([ 'I have nothing to comment on this movie...an a tired band of knit pickers."], dtype=object)
        y = array([0, 0, 0, ..., 1, 1, 0])
    130         self.steps[-1][-1].fit(Xt, y, **fit_params)
    131         return self
    132 
    133     def fit_transform(self, X, y=None, **fit_params):

...........................................................................
/usr/local/lib/python3.4/dist-packages/sklearn/pipeline.py in _pre_transform(self=Pipeline(steps=[('vect', TfidfVectorizer(analyze...g=1, penalty='l1', random_state=0, tol=0.0001))]), X=array([ 'I have nothing to comment on this movie...an a tired band of knit pickers."], dtype=object), y=array([0, 0, 0, ..., 1, 1, 0]), **fit_params={})
    114             step, param = pname.split('__', 1)
    115             fit_params_steps[step][param] = pval
    116         Xt = X
    117         for name, transform in self.steps[:-1]:
    118             if hasattr(transform, "fit_transform"):
--> 119                 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
        Xt = array([ 'I have nothing to comment on this movie...an a tired band of knit pickers."], dtype=object)
        transform.fit_transform = <bound method TfidfVectorizer.fit_transform of T...f202378>,
        use_idf=True, vocabulary=None)>
        y = array([0, 0, 0, ..., 1, 1, 0])
        fit_params_steps = {'clf': {}, 'vect': {}}
        name = 'vect'
    120             else:
    121                 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \
    122                               .transform(Xt)
    123         return Xt, fit_params_steps[self.steps[-1][0]]

...........................................................................
/usr/local/lib/python3.4/dist-packages/sklearn/feature_extraction/text.py in fit_transform(self=TfidfVectorizer(analyzer='word', binary=False, c...cf202378>,
        use_idf=True, vocabulary=None), raw_documents=array([ 'I have nothing to comment on this movie...an a tired band of knit pickers."], dtype=object), y=array([0, 0, 0, ..., 1, 1, 0]))
   1277         Returns
   1278         -------
   1279         X : sparse matrix, [n_samples, n_features]
   1280             Tf-idf-weighted document-term matrix.
   1281         """
-> 1282         X = super(TfidfVectorizer, self).fit_transform(raw_documents)
        X = undefined
        self.fit_transform = <bound method TfidfVectorizer.fit_transform of T...f202378>,
        use_idf=True, vocabulary=None)>
        raw_documents = array([ 'I have nothing to comment on this movie...an a tired band of knit pickers."], dtype=object)
   1283         self._tfidf.fit(X)
   1284         # X is already a transformed view of raw_documents so
   1285         # we set copy to False
   1286         return self._tfidf.transform(X, copy=False)

...........................................................................
/usr/local/lib/python3.4/dist-packages/sklearn/feature_extraction/text.py in fit_transform(self=TfidfVectorizer(analyzer='word', binary=False, c...cf202378>,
        use_idf=True, vocabulary=None), raw_documents=array([ 'I have nothing to comment on this movie...an a tired band of knit pickers."], dtype=object), y=None)
    812         max_df = self.max_df
    813         min_df = self.min_df
    814         max_features = self.max_features
    815 
    816         vocabulary, X = self._count_vocab(raw_documents,
--> 817                                           self.fixed_vocabulary_)
        self.fixed_vocabulary_ = False
    818 
    819         if self.binary:
    820             X.data.fill(1)
    821 

...........................................................................
/usr/local/lib/python3.4/dist-packages/sklearn/feature_extraction/text.py in _count_vocab(self=TfidfVectorizer(analyzer='word', binary=False, c...cf202378>,
        use_idf=True, vocabulary=None), raw_documents=array([ 'I have nothing to comment on this movie...an a tired band of knit pickers."], dtype=object), fixed_vocab=False)
    743         analyze = self.build_analyzer()
    744         j_indices = _make_int_array()
    745         indptr = _make_int_array()
    746         indptr.append(0)
    747         for doc in raw_documents:
--> 748             for feature in analyze(doc):
        feature = 'watch.'
        analyze = <function VectorizerMixin.build_analyzer.<locals>.<lambda>>
        doc = "This mini-series is actually more entertaining t...er's and the OED consider it an alternative form."
    749                 try:
    750                     j_indices.append(vocabulary[feature])
    751                 except KeyError:
    752                     # Ignore out-of-vocabulary items for fixed_vocab=True

...........................................................................
/usr/local/lib/python3.4/dist-packages/sklearn/feature_extraction/text.py in <lambda>(doc="This mini-series is actually more entertaining t...er's and the OED consider it an alternative form.")
    229         elif self.analyzer == 'word':
    230             stop_words = self.get_stop_words()
    231             tokenize = self.build_tokenizer()
    232 
    233             return lambda doc: self._word_ngrams(
--> 234                 tokenize(preprocess(self.decode(doc))), stop_words)
        doc = "This mini-series is actually more entertaining t...er's and the OED consider it an alternative form."
    235 
    236         else:
    237             raise ValueError('%s is not a valid tokenization scheme/analyzer' %
    238                              self.analyzer)

...........................................................................
/home/<ipython-input-7-0a0b85ac469f> in tokenizer_porter(text="This mini-series is actually more entertaining t...er's and the OED consider it an alternative form.")
      1 from nltk.stem.porter import PorterStemmer
      2 
      3 porter = PorterStemmer()
      4 
      5 def tokenizer_porter(text):
----> 6     return [porter.stem(word) for word in text.split()]
      7 
      8 tokenizer_porter("runner like running and thus they run")
      9 
     10 

...........................................................................
/home/<ipython-input-7-0a0b85ac469f> in <listcomp>(.0=<list_iterator object>)
      1 from nltk.stem.porter import PorterStemmer
      2 
      3 porter = PorterStemmer()
      4 
      5 def tokenizer_porter(text):
----> 6     return [porter.stem(word) for word in text.split()]
      7 
      8 tokenizer_porter("runner like running and thus they run")
      9 
     10 

...........................................................................
/usr/local/lib/python3.4/dist-packages/nltk/stem/porter.py in stem(self=<PorterStemmer>, word='OED')
    660             # the stemming process, although no mention is made of this
    661             # in the published algorithm.
    662             return word
    663 
    664         stem = self._step1a(stem)
--> 665         stem = self._step1b(stem)
        stem = 'oed'
        self._step1b = <bound method PorterStemmer._step1b of <PorterStemmer>>
    666         stem = self._step1c(stem)
    667         stem = self._step2(stem)
    668         stem = self._step3(stem)
    669         stem = self._step4(stem)

...........................................................................
/usr/local/lib/python3.4/dist-packages/nltk/stem/porter.py in _step1b(self=<PorterStemmer>, word='oed')
    371             ),
    372             # (m=1 and *o) -> E
    373             (
    374                 '',
    375                 'e',
--> 376                 lambda stem: (self._measure(stem) == 1 and
        stem = undefined
        self._measure = <bound method PorterStemmer._measure of <PorterStemmer>>
        self._ends_cvc = <bound method PorterStemmer._ends_cvc of <PorterStemmer>>
    377                               self._ends_cvc(stem))
    378             ),
    379         ])
    380     

...........................................................................
/usr/local/lib/python3.4/dist-packages/nltk/stem/porter.py in _apply_rule_list(self=<PorterStemmer>, word='o', rules=[('at', 'ate', None), ('bl', 'ble', None), ('iz', 'ize', None), ('*d', 'o', <function PorterStemmer._step1b.<locals>.<lambda>>), ('', 'e', <function PorterStemmer._step1b.<locals>.<lambda>>)])
    253         final element being the condition for the rule to be applicable,
    254         or None if the rule is unconditional.
    255         """
    256         for rule in rules:
    257             suffix, replacement, condition = rule
--> 258             if suffix == '*d' and self._ends_double_consonant(word):
        suffix = '*d'
        self._ends_double_consonant = <bound method PorterStemmer._ends_double_consonant of <PorterStemmer>>
        word = 'o'
    259                 stem = word[:-2]
    260                 if condition is None or condition(stem):
    261                     return stem + replacement
    262                 else:

...........................................................................
/usr/local/lib/python3.4/dist-packages/nltk/stem/porter.py in _ends_double_consonant(self=<PorterStemmer>, word='o')
    209         """Implements condition *d from the paper
    210         
    211         Returns True if word ends with a double consonant
    212         """
    213         return (
--> 214             word[-1] == word[-2] and
        word = 'o'
    215             self._is_consonant(word, len(word)-1)
    216         )
    217 
    218     def _ends_cvc(self, word):

IndexError: string index out of range
___________________________________________________________________________

In [13]:
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import cross_val_score

# Build a tiny synthetic 3-class problem; the global NumPy seed is fixed so
# the random draws below are the same on every run.
np.random.seed(0)
np.set_printoptions(precision=6)
y = [np.random.randint(3) for i in range(25)]  # 25 integer labels drawn from {0, 1, 2}
X = (y + np.random.randn(25)).reshape(-1, 1)  # single feature column: label plus standard-normal noise

# Materialise the 5 stratified splits as a list so that exactly the same
# folds can be handed to the later cells that also take `cv=cv5_idx`.
cv5_idx = list(StratifiedKFold(y, n_folds=5, shuffle=False, random_state=0))

# Last expression of the cell: per-fold scores of a logistic regression
# evaluated on those precomputed folds.
cross_val_score(LogisticRegression(random_state=123), X, y, cv=cv5_idx)

array([ 0.6,  0.4,  0.6,  0.2,  0.6])

In [14]:
from sklearn.grid_search import GridSearchCV

# Grid search with an empty parameter grid: the only candidate is the default
# LogisticRegression, evaluated on the same precomputed folds (cv5_idx).
# verbose=3 makes the search log the score of every individual fold.
gs = GridSearchCV(LogisticRegression(), {}, cv=cv5_idx, verbose=3).fit(X, y)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................
[CV] ....................................... , score=0.600000 -   0.0s
[CV]  ................................................................
[CV] ....................................... , score=0.400000 -   0.0s
[CV]  ................................................................
[CV] ....................................... , score=0.600000 -   0.0s
[CV]  ................................................................
[CV] ....................................... , score=0.200000 -   0.0s
[CV]  ................................................................
[CV] ....................................... , score=0.600000 -   0.0s


[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished


In [15]:
# Compare the grid search's best score with the mean of the per-fold scores
# that cross_val_score produces on the same folds; the recorded output shows
# the same value for both.
print(gs.best_score_)
print(cross_val_score(LogisticRegression(), X, y, cv=cv5_idx).mean())

0.48
0.48


In [16]:
import numpy as np
import re
from nltk.corpus import stopwords

def tokenizer(text):
    text = re.sub("<[^>]*>", "", text)
    emoticons = re.findall("(?::|;|=)(?:-)?(?:\)|\(|D|P)", text.lower())
    text = re.sub("[\W]+", " ", text.lower()) + " ".join(emoticons).replace("-", "")
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from a CSV file, one per line.

    The first line is treated as a header and skipped. Every following
    non-empty line is split at its LAST comma: everything before it is
    returned verbatim as the text (CSV quoting is not undone), and
    everything after it is converted with ``int`` to give the label.

    The file is read line by line, so a record is assumed to occupy exactly
    one physical line.

    Parameters
    ----------
    path : str
        Path of the UTF-8 encoded CSV file.

    Yields
    ------
    tuple of (str, int)
        The raw text field and its integer label.

    Raises
    ------
    ValueError
        If the part after the last comma of a line is not an integer.
    """
    with open(path, "r", encoding="utf-8") as infile:
        # Skip the header; the default keeps an empty file from raising
        # StopIteration inside the generator.
        next(infile, None)
        for line in infile:
            line = line.rstrip("\n")
            if not line:
                continue  # tolerate blank lines, e.g. at the end of the file
            # Split on the last comma instead of using fixed offsets so a
            # final line without a trailing newline is parsed correctly.
            text, _, label = line.rpartition(",")
            yield text, int(label)

In [17]:
# Sanity check: pull the first (text, label) pair from a fresh stream over the
# shuffled CSV and display it as the cell output.
next(stream_docs(path="./movie_data.csv"))

('"Who ARE the people that star in this thing? Never heard of them!! But this is one of the funniest comedies I have run across. It should win the Putz Puller Prize for Parody. The absurd starts with Dr. Jeykl snorting his powder and turning into a sex fiend.He is pursued by libido driven nurse early in the movie in one of the funniest scenes of the movie. Pay attention to the hospital PA system in the background; rather like the system in MASH. The final scene with Hyde accepting the award has had me laughing for years. Oh... and the ""Busty Nurse"" is Cassandra Peterson, who went on to become Elvira, Mistress of the Dark. <br /><br />If you liked the Mel Brooks classic movies (Blazing Saddles, etc.), I suspect you\'d like this one.<br /><br />Damn shame you can\'t get it on DVD anywhere.<br /><br />It\'s available on DVD now !!!!! Good thing DVDs don\'t wear out from use !!!!!"',
 1)

In [18]:
def get_minibatch(doc_stream, size):
    """Take up to ``size`` ``(text, label)`` pairs from an iterator.

    Parameters
    ----------
    doc_stream : iterator
        Iterator yielding ``(text, label)`` pairs; it is advanced in place.
    size : int
        Maximum number of documents to take.

    Returns
    -------
    tuple
        ``(docs, y)``: two parallel lists of texts and labels. If the stream
        runs out part-way through a batch, the documents collected so far
        are returned as a shorter batch instead of being thrown away.
        ``(None, None)`` is returned only when the stream is already
        exhausted and nothing could be collected.
    """
    docs, y = [], []
    for _ in range(size):
        try:
            text, label = next(doc_stream)
        except StopIteration:
            if not docs:
                # Nothing left at all: signal exhaustion to the caller.
                return None, None
            break  # keep the partial final batch
        docs.append(text)
        y.append(label)
    return docs, y

In [19]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

# Hashing-based vectorizer with 2**21 output features that delegates
# tokenisation to the custom `tokenizer` function; in the cells below only
# `transform` is ever called on it.
vect = HashingVectorizer(decode_error="ignore",
                         n_features=2**21,
                         preprocessor=None,
                        tokenizer=tokenizer)

# SGD-trained linear classifier with logistic ("log") loss; it is updated
# incrementally through partial_fit in the cells below.
clf = SGDClassifier(loss="log", random_state=1, n_iter=1)
# One shared document generator: the training and evaluation cells below both
# consume from it, so each of them continues where the previous one stopped.
doc_stream = stream_docs(path="./movie_data.csv")

In [20]:
import pyprind
pbar = pyprind.ProgBar(45)  # one progress tick per mini-batch

classes = np.array([0, 1])  # the complete label set, passed to every partial_fit call
# Out-of-core training: up to 45 mini-batches of 1000 documents each are
# pulled from the shared stream and used for one incremental update apiece.
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        # A falsy batch (None or empty) means there is nothing left to train on.
        break
    X_train = vect.transform(X_train)  # raw texts -> hashed feature matrix
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:00:16


In [21]:
# Evaluate on the next 5000 documents of the same stream, i.e. documents the
# training loop above did not consume.
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print("Accuracy: %.3f" % clf.score(X_test, y_test))

Accuracy: 0.872


In [22]:
# After the evaluation, fold the held-out batch into the model as well with
# one more incremental update.
clf = clf.partial_fit(X_test, y_test)

In [27]:
import pickle
import os 

# Serialise the stop-word collection and the trained classifier into
# movieclassifier/pkl_objects so they can be reloaded later.
dest = os.path.join("movieclassifier", "pkl_objects")
# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(dest, exist_ok=True)

# Context managers guarantee both files are flushed and closed, even if
# pickling raises part-way through.
with open(os.path.join(dest, "stopwords.pkl"), "wb") as stop_file:
    pickle.dump(stop, stop_file)
with open(os.path.join(dest, "classifier.pkl"), "wb") as clf_file:
    pickle.dump(clf, clf_file, protocol=4)

In [30]:
import pickle
import re
import os
from vectorizer import vect
# Reload the pickled classifier. Only unpickle files you created yourself:
# pickle.load can execute arbitrary code from an untrusted file.
with open(os.path.join("movieclassifier", "pkl_objects", "classifier.pkl"), "rb") as clf_file:
    cl = pickle.load(clf_file)
# The prediction cell that follows refers to the model as `clf`; bind that
# name as well so the cell also works in a fresh kernel (`cl` is kept for
# backward compatibility).
clf = cl

In [None]:
import numpy as np
# Classify one example sentence and report the predicted class together with
# the probability the model assigns to it.
label = {0: "negative", 1: "positive"}  # class index -> human-readable name
example = ["I love this movie"]
X = vect.transform(example)
# predict() returns one class index per document; the largest entry of
# predict_proba() is the probability of that predicted class.
print("Prediction: %s\nProbability: %.2f%%" %
      (label[clf.predict(X)[0]], np.max(clf.predict_proba(X)) * 100))