"""
Updated 2019.09.13

@author: Kristian Hovde Liland
"""

"""
EXERCISE: Exchange Logistic regression with Naïve Bayes in sentiment analysis
(1) Use the code from the lecture to
   - Read the IMDb review dataframe
   - Run the preprocessor
   - Load the Pickled object (gs_lr_tfidf) from file
(2) Find the optimal combination of text processing from this object.
(3) Create a pipeline with the text processing steps, but add a 
(Gaussian) Naïve Bayes model at the end instead of logistic regression.
   - Fit the model on the training data
   - Check performance on test data
"""

"""
BONUS EXERCISE: Latent Dirichlet Allocation playground
Three parameters in the code from the lecture that will have a great impact on
the results:
    - max_df: maximum document frequency of words
    - max_features: maximum vocabulary size
    - n_components: number of topics

What happens to the topics if you reduce the number of components to 3?

Make an extreme case by reducing the vocabulary to a few words per topic, 
e.g. a vocabulary of 5 words x 3 topics = 15 (max_features). How does it
affect the resulting categories, and can you still interpret the 
accompanying words?
"""

# Topics

Cleaning and preparing text data

Building feature vectors from text documents

Training a machine learning model to classify positive and negative movie reviews


# Preparing the IMDb movie review data for text processing

Obtaining the IMDb movie review dataset

The IMDb movie review set can be downloaded from http://ai.stanford.edu/~amaas/data/sentiment/. After downloading the dataset, decompress the files.

0) Use the code in the following cells to retreive and extact automatically.

A) If you are working with Linux or MacOS X, open a new terminal window cd into the download directory and execute

tar -zxf aclImdb_v1.tar.gz

B) If you are working with Windows, download an archiver such as 7Zip to extract the files from the download archive.

Optional code to download and unzip the dataset via Python:

In [None]:
import os
import sys
import tarfile
import time


source = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
target = '/home/abhishesh/anaconda3/envs/anaconda/h2019/aclImdb_v1.tar.gz'


def reporthook(count, block_size, total_size):
    global start_time
    if count == 0:
        start_time = time.time()
        return
    duration = time.time() - start_time
    if duration == 0:
        duration = 10**-3
    progress_size = int(count * block_size)
    speed = progress_size / (1024.**2 * duration)
    percent = count * block_size * 100. / total_size
    sys.stdout.write("\r%d%% | %d MB | %.2f MB/s | %d sec elapsed" %
                    (percent, progress_size / (1024.**2), speed, duration))
    sys.stdout.flush()

In [None]:
# This download takes a couple of seconds at NMBU (<30)
if not os.path.isdir('aclImdb') and not os.path.isfile('aclImdb_v1.tar.gz'):
    
    if (sys.version_info < (3, 0)):
        import urllib
        urllib.urlretrieve(source, target, reporthook)
    
    else:
        import urllib.request
        urllib.request.urlretrieve(source, target, reporthook)

In [None]:
# The extraction can take several minutes as all 50,000 reviews are stored as separate text files
# (101,111 files). 
# Extracting to a synced folder (Dropbox, Google Drive, OneDrive, ...) may slow the process further.
if not os.path.isdir('https://drive.google.com/drive/u/0/folders/1dDGocaPbO-Nk8VbugTB_NS4HDja77De8/aclImdb'):

    with tarfile.open(target, 'r:gz') as tar:
        tar.extractall()

# Preprocessing the movie dataset into more convenient format
Read all review files and append them sequentially into a Pandas dataframe.

In [1]:
#import pyprind       # pip install pyprind, if you haven't used it before
import pandas as pd
import os

# change the `basepath` to the directory of the
# unzipped movie dataset

basepath = '/home/abhishesh/anaconda3/envs/anaconda/h2019/aclImdb'

labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000)
df = pd.DataFrame()
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        for file in sorted(os.listdir(path)):
            with open(os.path.join(path, file), 
                      'r', encoding='utf-8') as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], 
                           ignore_index=True)
            pbar.update()
df.columns = ['review', 'sentiment']

NameError: name 'pyprind' is not defined

# Read The CSV file from the database in PC

In [2]:
import pandas as pd

df = pd.read_csv('/home/abhishesh/anaconda3/envs/anaconda/h2019/python-machine-learning-book-2nd-edition/code/ch08/movie_data.csv', encoding='utf-8')
df.head()

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [3]:
df.shape

(50000, 2)

# """ EXERCISE: Exchange Logistic regression with Naïve Bayes in sentiment analysis 

(1) Use the code from the lecture to Read the IMDb review dataframe , Run the preprocessor, Load the Pickled object (gs_lr_tfidf) from file 

(2) Find the optimal combination of text processing from this object. 

(3) Create a pipeline with the text processing steps, but add a (Gaussian) Naïve Bayes model at the end instead of logistic regression.

Fit the model on the training data

Check performance on test data """


# Run the preprocessor


1. Clean the text data
2. Use BAG-of-WORDS methods to create the features 
    - n-grams & K-mers 
    - use fit_transform
    - Optimization of data using tf-idf
    - Tokenization
3. Identifying STOP-WORDS using NLTK

In [13]:
#Example before using func 'preprocessor'
(df.loc[5, 'review'][-70:])

"don't like it. Hey there were tons who hated it and tons who loved it."

# 1. Clean the text data

In [14]:
import re

In [15]:
def preprocessor(text):
    # Regular expression for HTML tags
    text = re.sub('<[^>]*>', '', text)
    
    # Most typical emoticons (smileys)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text)
    
    # Remove all non-word characters, convert to lower-case and add possible emoticons to the end.
    text = (re.sub('[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text

In [17]:
# Example after preprocessor is implemented
preprocessor(df.loc[5, 'review'][-70:])



'don t like it hey there were tons who hated it and tons who loved it '

In [18]:
# This takes a few seconds
df['review'] = df['review'].apply(preprocessor)

In [28]:
df.head()
df['review']
df.shape()

Unnamed: 0,review,sentiment
0,in 1974 the teenager martha moxley maggie grac...,1
1,ok so i really like kris kristofferson and his...,0
2,spoiler do not read this if you think about w...,0
3,hi for all the people who have seen this wonde...,1
4,i recently bought the dvd forgetting just how ...,0


# 2. Use BAG-of-WORDS methods to create the features

In [24]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer


In [35]:
df_rev = df.loc[:, 'review']
df_rev.head()

0    in 1974 the teenager martha moxley maggie grac...
1    ok so i really like kris kristofferson and his...
2     spoiler do not read this if you think about w...
3    hi for all the people who have seen this wonde...
4    i recently bought the dvd forgetting just how ...
Name: review, dtype: object

In [36]:

df_sent = df.loc[:, 'sentiment']
df_sent.head()

0    1
1    0
2    0
3    1
4    0
Name: sentiment, dtype: int64

In [50]:
count = CountVectorizer(ngram_range=[2,2])
bag = count.fit_transform(df_rev)

In [51]:
# Vocabulary with ordering (as dictionary)
#print(count.vocabulary_)
#print(sorted(count.vocabulary_))

In [52]:
print(type(bag))
print(bag.shape)

<class 'scipy.sparse.csr.csr_matrix'>
(50000, 2383728)


In [49]:
#bag.toarray()
#print (bag.toarray())

In [54]:
# Transform tf (which is bag.toarray()) to tf-idf:

from sklearn.feature_extraction.text import TfidfTransformer

tfidf = TfidfTransformer(use_idf=True, 
                         norm='l2', 
                         smooth_idf=True)

np.set_printoptions(precision=2)
bag.toarray()
sorted(count.vocabulary_)
tfidf.fit_transform(count.fit_transform(df_rev)).toarray() # Word in many documents => less variation in tf-idf

#print(bag.toarray())
#print(sorted(count.vocabulary_))
#print(tfidf.fit_transform(count.fit_transform(docs))
#      .toarray()) # Word in many documents => less variation in tf-idf

MemoryError: 

In [55]:
import pandas as pd

df = pd.read_csv('/home/abhishesh/anaconda3/envs/anaconda/h2019/python-machine-learning-book-2nd-edition/code/ch08/movie_data.csv', encoding='utf-8')
df.head()

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [59]:
import nltk

# Update to most resent stop-words
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/abhishesh/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [61]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()

# Define basic tokenizer and Porter stemmer version
def tokenizer(text):
    return text.split()

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [62]:
from nltk.corpus import stopwords

# Combine tokenizer with Porter stemmer and stop-word removal
stop = stopwords.words('english')
[w for w in tokenizer_porter('a runner likes running and runs a lot')
if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

In [63]:
X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

In [64]:
stops = []
for s in stop:
    stops.append(tokenizer(s)[0])
stopsPorter = []
for s in stop:
    stopsPorter.append(tokenizer_porter(s)[0])

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

# TfidfVectorizer combines CountVectorizer and TfidTransformer with a single function.
tfidf = TfidfVectorizer(strip_accents=None, # Already preprocessed
                        lowercase=False,
                        preprocessor=None)

param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stops, None], # Not this time, but use idf with normalization
               'vect__tokenizer': [tokenizer],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stops, None], # Not this time
               'vect__tokenizer': [tokenizer],
               'vect__use_idf':[False],       # Raw counts without normalization 
               'vect__norm':[None],           # --------------||----------------
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
             {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stopsPorter, None], # Not this time, but use idf with normalization
               'vect__tokenizer': [tokenizer_porter],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stopsPorter, None], # Not this time
               'vect__tokenizer': [tokenizer_porter],
               'vect__use_idf':[False],       # Raw counts without normalization 
               'vect__norm':[None],           # --------------||----------------
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]}]

lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state=0, solver='saga'))])
# Solver specified to silence warning and to enable l1 regularization

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=2,
                           n_jobs=1) # Number of jobs different from 1 sometimes crashes on Windows.

In [None]:
# The fitting of 2*2*2*3*5*2 models took around 30-60 minutes to fit in 2018. In 2019 it takes several hours. :(.
# Lowering the number of samples or parameters will make it quicker, but may reduce the performance greatly.
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] vect__ngram_range=(1, 1), clf__C=1.0, vect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  vect__ngram_range=(1, 1), clf__C=1.0, vect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   15.3s remaining:    0.0s


[CV]  vect__ngram_range=(1, 1), clf__C=1.0, vect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such

[CV]  vect__ngram_range=(1, 1), clf__C=1.0, vect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such

[CV]  vect__ngram_range=(1, 1), clf__C=1.0, vect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such

[CV]  vect__ngram_range=(1, 1), clf__C=1.0, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, clf__penalty=l2, total=   6.3s
[CV] vect__ngram_range=(1, 1), clf__C=1.0, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, clf__penalty=l2 
[CV]  vect__ngram_range=(1, 1), clf__C=1.0, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, clf__penalty=l2, total=   5.9s
[CV] vect__ngram_range=(1, 1), clf__C=1.0, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, clf__penalty=l2 
[CV]  vect__ngram_range=(1, 1), clf__C=1.0, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, clf__penalty=l2, total=   5.7s
[CV] vect__ngram_range=(1, 1), clf__C=1.0, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, clf__penalty=l2 
[CV]  vect__ngram_range=(1, 1), clf__C=1.0, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>



[CV]  vect__ngram_range=(1, 1), clf__C=10.0, vect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'suc

[CV]  vect__ngram_range=(1, 1), clf__C=10.0, vect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'suc

[CV]  vect__ngram_range=(1, 1), clf__C=10.0, vect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'suc

[CV]  vect__ngram_range=(1, 1), clf__C=10.0, vect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'suc

[CV]  vect__ngram_range=(1, 1), clf__C=100.0, vect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'su

[CV]  vect__ngram_range=(1, 1), clf__C=100.0, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, clf__penalty=l1, total=10.4min
[CV] vect__ngram_range=(1, 1), clf__C=100.0, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, clf__penalty=l1 
[CV]  vect__ngram_range=(1, 1), clf__C=100.0, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, clf__penalty=l1, total=10.1min
[CV] vect__ngram_range=(1, 1), clf__C=100.0, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, clf__penalty=l1 
[CV]  vect__ngram_range=(1, 1), clf__C=100.0, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, clf__penalty=l1, total= 9.8min
[CV] vect__ngram_range=(1, 1), clf__C=100.0, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, clf__penalty=l1 
[CV]  vect__ngram_range=(1, 1), clf__C=100.0, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0

[CV]  vect__ngram_range=(1, 1), clf__C=100.0, vect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'su

[CV]  vect__ngram_range=(1, 1), clf__C=100.0, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, clf__penalty=l2, total=   9.7s
[CV] vect__ngram_range=(1, 1), clf__C=100.0, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, clf__penalty=l2 
[CV]  vect__ngram_range=(1, 1), clf__C=100.0, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, clf__penalty=l2, total=   9.3s
[CV] vect__ngram_range=(1, 1), clf__C=100.0, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, clf__penalty=l2 
[CV]  vect__ngram_range=(1, 1), clf__C=100.0, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, clf__penalty=l2, total=   9.3s
[CV] vect__ngram_range=(1, 1), clf__C=100.0, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, clf__penalty=l2 
[CV]  vect__ngram_range=(1, 1), clf__C=100.0, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0

[CV]  vect__ngram_range=(1, 1), clf__C=1.0, vect__norm=None, vect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'othe

[CV]  vect__ngram_range=(1, 1), clf__C=1.0, vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, vect__use_idf=False, clf__penalty=l1, total= 5.8min
[CV] vect__ngram_range=(1, 1), clf__C=1.0, vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, vect__use_idf=False, clf__penalty=l1 
[CV]  vect__ngram_range=(1, 1), clf__C=1.0, vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, vect__use_idf=False, clf__penalty=l1, total= 6.3min
[CV] vect__ngram_range=(1, 1), clf__C=1.0, vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, vect__use_idf=False, clf__penalty=l1 
[CV]  vect__ngram_range=(1, 1), clf__C=1.0, vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, vect__use_idf=False, clf__penalty=l1, total= 6.3min
[CV] vect__ngram_range=(1, 1), clf__C=1.0, vect__norm=None, vect__stop

[CV]  vect__ngram_range=(1, 1), clf__C=1.0, vect__norm=None, vect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'othe

[CV]  vect__ngram_range=(1, 1), clf__C=1.0, vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, vect__use_idf=False, clf__penalty=l2, total=   9.1s
[CV] vect__ngram_range=(1, 1), clf__C=1.0, vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, vect__use_idf=False, clf__penalty=l2 
[CV]  vect__ngram_range=(1, 1), clf__C=1.0, vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, vect__use_idf=False, clf__penalty=l2, total=   9.4s
[CV] vect__ngram_range=(1, 1), clf__C=1.0, vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, vect__use_idf=False, clf__penalty=l2 
[CV]  vect__ngram_range=(1, 1), clf__C=1.0, vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, vect__use_idf=False, clf__penalty=l2, total=   9.4s
[CV] vect__ngram_range=(1, 1), clf__C=1.0, vect__norm=None, vect__stop

[CV]  vect__ngram_range=(1, 1), clf__C=10.0, vect__norm=None, vect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'oth

[CV]  vect__ngram_range=(1, 1), clf__C=10.0, vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, vect__use_idf=False, clf__penalty=l1, total=36.4min
[CV] vect__ngram_range=(1, 1), clf__C=10.0, vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, vect__use_idf=False, clf__penalty=l1 
[CV]  vect__ngram_range=(1, 1), clf__C=10.0, vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, vect__use_idf=False, clf__penalty=l1, total=38.2min
[CV] vect__ngram_range=(1, 1), clf__C=10.0, vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, vect__use_idf=False, clf__penalty=l1 
[CV]  vect__ngram_range=(1, 1), clf__C=10.0, vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, vect__use_idf=False, clf__penalty=l1, total=38.1min
[CV] vect__ngram_range=(1, 1), clf__C=10.0, vect__norm=None, vect

[CV]  vect__ngram_range=(1, 1), clf__C=10.0, vect__norm=None, vect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'oth

[CV]  vect__ngram_range=(1, 1), clf__C=10.0, vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, vect__use_idf=False, clf__penalty=l2, total=   9.1s
[CV] vect__ngram_range=(1, 1), clf__C=10.0, vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, vect__use_idf=False, clf__penalty=l2 
[CV]  vect__ngram_range=(1, 1), clf__C=10.0, vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, vect__use_idf=False, clf__penalty=l2, total=   9.0s
[CV] vect__ngram_range=(1, 1), clf__C=10.0, vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, vect__use_idf=False, clf__penalty=l2 
[CV]  vect__ngram_range=(1, 1), clf__C=10.0, vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, vect__use_idf=False, clf__penalty=l2, total=   9.0s
[CV] vect__ngram_range=(1, 1), clf__C=10.0, vect__norm=None, vect

[CV]  vect__ngram_range=(1, 1), clf__C=100.0, vect__norm=None, vect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'ot

[CV]  vect__ngram_range=(1, 1), clf__C=100.0, vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, vect__use_idf=False, clf__penalty=l1, total=45.5min
[CV] vect__ngram_range=(1, 1), clf__C=100.0, vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, vect__use_idf=False, clf__penalty=l1 
[CV]  vect__ngram_range=(1, 1), clf__C=100.0, vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, vect__use_idf=False, clf__penalty=l1, total=46.2min
[CV] vect__ngram_range=(1, 1), clf__C=100.0, vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, vect__use_idf=False, clf__penalty=l1 
[CV]  vect__ngram_range=(1, 1), clf__C=100.0, vect__norm=None, vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x7fb11ccf1950>, vect__use_idf=False, clf__penalty=l1, total=46.2min
[CV] vect__ngram_range=(1, 1), clf__C=100.0, vect__norm=None