In [1]:
from nltk.corpus import movie_reviews as mr
from collections import defaultdict

import nltk.corpus
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.model_selection import GridSearchCV
#deprecated: from sklearn.grid_search import GridSearchCV

#modules for features creation in texts
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline

#### CountVectorizer => Convert a collection of text documents to a matrix of token counts <br>TfidfTransformer => Transform a count matrix to a normalized tf or tf-idf representation <br> TfidfVectorizer => Convert a collection of raw documents to a matrix of TF-IDF features. Equivalent to CountVectorizer followed by TfidfTransformer.
##### SGDClassifier => Linear classifiers (SVM, logistic regression, a.o.) with SGD training. <br> LinearSVC => Linear Support Vector Classification <br> Pipeline => Pipeline of transforms with a final estimator.

In [10]:
print dir(mr)

['CorpusView', '_LazyCorpusLoader__args', '_LazyCorpusLoader__kwargs', '_LazyCorpusLoader__name', '_LazyCorpusLoader__reader_cls', '__class__', '__delattr__', '__dict__', '__doc__', '__format__', '__getattribute__', '__hash__', '__init__', '__module__', '__name__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__unicode__', '__weakref__', '_add', '_get_root', '_init', '_read_para_block', '_read_sent_block', '_read_word_block', '_resolve', 'abspath', 'abspaths', 'categories', 'citation', 'encoding', 'ensure_loaded', 'fileids', 'license', 'open', 'paras', 'raw', 'readme', 'root', 'sents', 'subdir', 'unicode_repr', 'words']


In [11]:
negids = mr.fileids('neg')
posids = mr.fileids('pos')

negfeats = [" ".join(mr.words(fileids=[f])) for f in negids]
posfeats = [" ".join(mr.words(fileids=[f])) for f in posids]

texts = negfeats + posfeats
labels = [0] * len(negfeats) + [1] * len(posfeats)

print texts[1]

the happy bastard ' s quick movie review damn that y2k bug . it ' s got a head start in this movie starring jamie lee curtis and another baldwin brother ( william this time ) in a story regarding a crew of a tugboat that comes across a deserted russian tech ship that has a strangeness to it when they kick the power back on . little do they know the power within . . . going for the gore and bringing on a few action sequences here and there , virus still feels very empty , like a movie going for all flash and no substance . we don ' t know why the crew was really out in the middle of nowhere , we don ' t know the origin of what took over the ship ( just that a big pink flashy thing hit the mir ) , and , of course , we don ' t know why donald sutherland is stumbling around drunkenly throughout . here , it ' s just " hey , let ' s chase these people around with some robots " . the acting is below average , even from the likes of curtis . you ' re more likely to get a kick out of her work i

In [12]:
def text_classifier(vectorizer, classifier):
    return Pipeline(
            [("vectorizer", vectorizer),
             ("classifier", classifier)]
        )

In [13]:
for vect in [CountVectorizer, TfidfVectorizer]:
    score = cross_val_score(text_classifier(vect(),  LogisticRegression()), texts, labels)
    print vect, " mean: %f, std: %f " %(score.mean(), score.std())

<class 'sklearn.feature_extraction.text.CountVectorizer'>  mean: 0.836022, std: 0.015309 
<class 'sklearn.feature_extraction.text.TfidfVectorizer'>  mean: 0.813511, std: 0.010356 


In [14]:
print "min_df=10: ", cross_val_score(text_classifier(CountVectorizer(min_df=10),  LogisticRegression()), texts, labels)
print "min_df=50: ", cross_val_score(text_classifier(CountVectorizer(min_df=50),  LogisticRegression()), texts, labels)

min_df=10:  [ 0.81287425  0.83333333  0.84534535]
min_df=50:  [ 0.80239521  0.81831832  0.81531532]


In [22]:
for cls in [LogisticRegression, LinearSVC, SGDClassifier]:
    print cls, cross_val_score(text_classifier(CountVectorizer(),  cls()), texts, labels).mean()

<class 'sklearn.linear_model.logistic.LogisticRegression'> 0.836021650393
<class 'sklearn.svm.classes.LinearSVC'> 0.827517637398
<class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'> 0.789009068949


In [17]:
stop_words =  nltk.corpus.stopwords.words('english')
print type(stop_words)

<type 'list'>


In [18]:
print stop_words
print len(stop_words)

[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your', u'yours', u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u'her', u'hers', u'herself', u'it', u'its', u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what', u'which', u'who', u'whom', u'this', u'that', u'these', u'those', u'am', u'is', u'are', u'was', u'were', u'be', u'been', u'being', u'have', u'has', u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an', u'the', u'and', u'but', u'if', u'or', u'because', u'as', u'until', u'while', u'of', u'at', u'by', u'for', u'with', u'about', u'against', u'between', u'into', u'through', u'during', u'before', u'after', u'above', u'below', u'to', u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over', u'under', u'again', u'further', u'then', u'once', u'here', u'there', u'when', u'where', u'why', u'how', u'all', u'any', u'both', u'each', u'few', u'more', u'most', u'other', u'some', u'such', u'no', u'nor', u

In [19]:
print CountVectorizer().get_stop_words()

None


In [20]:
print "corpora stop words ", cross_val_score(text_classifier(CountVectorizer(stop_words = stop_words, analyzer = 'word'),  
                                                             cls()), texts, labels).mean()
print "sklearn stop words ", cross_val_score(text_classifier(CountVectorizer(stop_words = "english"),  
                                                             cls()), texts, labels).mean()

corpora stop words  0.821511631392
sklearn stop words  0.806008103912


In [21]:
print "word bigram: ", cross_val_score(text_classifier(CountVectorizer(ngram_range=(1, 2)),  
                                                       LogisticRegression()), texts, labels)
print "character n-gram: ", cross_val_score(text_classifier(CountVectorizer(analyzer='char_wb', ngram_range=(1, 5)),  
                                                            LogisticRegression()), texts, labels)

word bigram:  [ 0.81137725  0.84684685  0.85285285]
character n-gram:  [ 0.80838323  0.81381381  0.79129129]


In [9]:
corpus = [
    "The blue dog Blue",
    "Green the green cat",
    "The green mouse",
]

# CountVectorizer character 2-grams with word boundaries
vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(1, 5), min_df=1) 
X = vectorizer.fit_transform(corpus)
vectorizer.get_feature_names()

[u' ',
 u' b',
 u' bl',
 u' blu',
 u' blue',
 u' c',
 u' ca',
 u' cat',
 u' cat ',
 u' d',
 u' do',
 u' dog',
 u' dog ',
 u' g',
 u' gr',
 u' gre',
 u' gree',
 u' m',
 u' mo',
 u' mou',
 u' mous',
 u' t',
 u' th',
 u' the',
 u' the ',
 u'a',
 u'at',
 u'at ',
 u'b',
 u'bl',
 u'blu',
 u'blue',
 u'blue ',
 u'c',
 u'ca',
 u'cat',
 u'cat ',
 u'd',
 u'do',
 u'dog',
 u'dog ',
 u'e',
 u'e ',
 u'ee',
 u'een',
 u'een ',
 u'en',
 u'en ',
 u'g',
 u'g ',
 u'gr',
 u'gre',
 u'gree',
 u'green',
 u'h',
 u'he',
 u'he ',
 u'l',
 u'lu',
 u'lue',
 u'lue ',
 u'm',
 u'mo',
 u'mou',
 u'mous',
 u'mouse',
 u'n',
 u'n ',
 u'o',
 u'og',
 u'og ',
 u'ou',
 u'ous',
 u'ouse',
 u'ouse ',
 u'r',
 u're',
 u'ree',
 u'reen',
 u'reen ',
 u's',
 u'se',
 u'se ',
 u't',
 u't ',
 u'th',
 u'the',
 u'the ',
 u'u',
 u'ue',
 u'ue ',
 u'us',
 u'use',
 u'use ']