In [1]:
from nltk.corpus import movie_reviews as mr
from collections import defaultdict

import nltk.corpus
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.model_selection import GridSearchCV
#deprecated: from sklearn.grid_search import GridSearchCV

#modules for features creation in texts
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline

#### CountVectorizer => Convert a collection of text documents to a matrix of token counts. <br> TfidfTransformer => Transform a count matrix to a normalized tf or tf-idf representation. <br> TfidfVectorizer => Convert a collection of raw documents to a matrix of TF-IDF features. Equivalent to CountVectorizer followed by TfidfTransformer.
##### SGDClassifier => Linear classifiers (SVM, logistic regression, among others) trained with stochastic gradient descent (SGD). <br> LinearSVC => Linear Support Vector Classification. <br> Pipeline => Pipeline of transforms with a final estimator.

In [10]:
# List the attribute names available on `mr` (nltk's movie_reviews corpus).
# A single-argument print(...) call is valid on both Python 2 and Python 3.
print(dir(mr))

In [11]:
# File ids of the reviews in the 'neg' and 'pos' categories.
negids = mr.fileids('neg')
posids = mr.fileids('pos')

# One string per review: the review's tokens re-joined with single spaces.
negfeats = [" ".join(mr.words(fileids=[f])) for f in negids]
posfeats = [" ".join(mr.words(fileids=[f])) for f in posids]

# Negative reviews come first, then positive ones; `labels` follows the same
# order (0 for every 'neg' text, 1 for every 'pos' text).
texts = negfeats + posfeats
labels = [0] * len(negfeats) + [1] * len(posfeats)

# Show one sample document. The single-argument print(...) form works on both
# Python 2 and Python 3.
print(texts[1])

In [12]:
def text_classifier(vectorizer, classifier):
    """Chain a vectorizer and a classifier into a single Pipeline.

    Parameters:
        vectorizer: object used as the first pipeline step, named "vectorizer".
        classifier: object used as the final pipeline step, named "classifier".

    Returns:
        A Pipeline whose steps are, in order, "vectorizer" and "classifier".
    """
    steps = [
        ("vectorizer", vectorizer),
        ("classifier", classifier),
    ]
    return Pipeline(steps)

In [13]:
# Cross-validate LogisticRegression on top of each vectorizer class and report
# the mean and standard deviation of the returned scores.
for vect in [CountVectorizer, TfidfVectorizer]:
    score = cross_val_score(text_classifier(vect(), LogisticRegression()), texts, labels)
    # One pre-formatted string keeps the printed text identical on Python 2 and
    # Python 3 (the old print statement put a space between its two items).
    print("%s  mean: %f, std: %f " % (vect, score.mean(), score.std()))

In [14]:
# Cross-validate CountVectorizer + LogisticRegression for each min_df value and
# print the scores returned by cross_val_score.
for min_df in (10, 50):
    scores = cross_val_score(
        text_classifier(CountVectorizer(min_df=min_df), LogisticRegression()),
        texts,
        labels,
    )
    # Two spaces after the colon reproduce the output of the original
    # `print "min_df=10: ", scores` statement; this form also runs on Python 3.
    print("min_df=%d:  %s" % (min_df, scores))

In [22]:
# Cross-validate each classifier class on plain token counts and print its mean score.
# The loop variable keeps the name `cls` because it stays bound at notebook
# scope after the loop (to the last class in the list).
for cls in [LogisticRegression, LinearSVC, SGDClassifier]:
    mean_score = cross_val_score(text_classifier(CountVectorizer(), cls()), texts, labels).mean()
    # Single formatted argument: same text as `print cls, mean_score`, valid on
    # both Python 2 and Python 3.
    print("%s %s" % (cls, mean_score))

In [17]:
# English stop-word list from the nltk stopwords corpus.
stop_words = nltk.corpus.stopwords.words('english')
# Single-argument print(...) works on both Python 2 and Python 3.
print(type(stop_words))

In [18]:
# Show the stop words and how many there are.
# Single-argument print(...) works on both Python 2 and Python 3.
print(stop_words)
print(len(stop_words))

In [19]:
# Show what a CountVectorizer built with default arguments reports as its stop words.
# Single-argument print(...) works on both Python 2 and Python 3.
print(CountVectorizer().get_stop_words())

In [20]:
# Compare the nltk stop-word list with scikit-learn's built-in "english" list.
#
# The classifier is named explicitly rather than read from `cls`, the loop
# variable left behind by the classifier-comparison cell. Once that loop has
# finished, `cls` is SGDClassifier, so a top-to-bottom run scores the same
# model as before, but this cell no longer depends on leftover kernel state.
corpora_score = cross_val_score(
    text_classifier(CountVectorizer(stop_words=stop_words, analyzer='word'), SGDClassifier()),
    texts,
    labels,
).mean()
sklearn_score = cross_val_score(
    text_classifier(CountVectorizer(stop_words="english"), SGDClassifier()),
    texts,
    labels,
).mean()

# Two spaces after each label reproduce the original print-statement output;
# the single-argument form also runs on Python 3.
print("corpora stop words  %s" % (corpora_score,))
print("sklearn stop words  %s" % (sklearn_score,))

In [21]:
# Word-level features with ngram_range=(1, 2): unigrams and bigrams.
word_ngram_scores = cross_val_score(
    text_classifier(CountVectorizer(ngram_range=(1, 2)), LogisticRegression()),
    texts,
    labels,
)
# Character-level features ('char_wb' analyzer) with ngram_range=(1, 5).
char_ngram_scores = cross_val_score(
    text_classifier(CountVectorizer(analyzer='char_wb', ngram_range=(1, 5)), LogisticRegression()),
    texts,
    labels,
)

# Two spaces after each label reproduce the original print-statement output;
# the single-argument form also runs on Python 3.
print("word bigram:  %s" % (word_ngram_scores,))
print("character n-gram:  %s" % (char_ngram_scores,))

In [9]:
# Toy corpus for inspecting which features the vectorizer extracts.
corpus = ["The blue dog Blue", "Green the green cat", "The green mouse"]

# Character n-grams with ngram_range=(1, 5), using the 'char_wb' analyzer.
vectorizer = CountVectorizer(
    analyzer='char_wb',
    ngram_range=(1, 5),
    min_df=1,
)
X = vectorizer.fit_transform(corpus)
vectorizer.get_feature_names()