In [3]:
from nltk.corpus import movie_reviews as mr
from collections import defaultdict

import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.grid_search import GridSearchCV

# modules for feature creation on texts
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline

#### CountVectorizer => Convert a collection of text documents to a matrix of token counts <br> TfidfTransformer => Transform a count matrix to a normalized tf or tf-idf representation <br> TfidfVectorizer => Convert a collection of raw documents to a matrix of TF-IDF features. Equivalent to CountVectorizer followed by TfidfTransformer <br> SGDClassifier => Linear classifiers (SVM, logistic regression, a.o.) with SGD training <br> LinearSVC => Linear Support Vector Classification <br> Pipeline => Pipeline of transforms with a final estimator

In [5]:
# Show the movie_reviews corpus reader object, then its docstring.
print(mr)
print(mr.__doc__)

In [6]:
# List the attributes and methods available on the corpus reader.
print(dir(mr))

##### first way to get positive and negative reviews 

In [5]:
# Bucket every corpus file id under its leading path component
# (the part before the first '/', e.g. 'pos' or 'neg').
documents = defaultdict(list)
for file_id in mr.fileids():
    category = file_id.split('/')[0]
    documents[category].append(file_id)

# Preview the first ten file ids of each sentiment class.
print(documents['pos'][:10])
print(" ")
print(documents['neg'][:10])

In [7]:
# Reassemble the second negative review from its tokens and print it.
second_neg_id = documents['neg'][1]
second_neg_text = " ".join(mr.words(fileids=[second_neg_id]))
print(second_neg_text, end=" ")

##### second way to get positive and negative reviews and their labels

In [9]:
# File ids of each sentiment class, taken straight from the corpus reader.
negids, posids = mr.fileids('neg'), mr.fileids('pos')

# One space-joined string per review.
negfeats = [" ".join(mr.words(fileids=[file_id])) for file_id in negids]
posfeats = [" ".join(mr.words(fileids=[file_id])) for file_id in posids]

# Negative reviews come first with label 0; positive reviews follow with label 1.
texts = negfeats + posfeats
labels = [0 for _ in negfeats] + [1 for _ in posfeats]

print(texts[1])

In [10]:
# Dataset size and the share of positive (label 1) reviews.
n_reviews = len(labels)
positive_fraction = float(len(posfeats)) / n_reviews
print("total amount of reviews: ", n_reviews)
print("fraction of class 1 in dataset: ", positive_fraction)

In [20]:
# Fit a bag-of-words vocabulary on all reviews and build the
# document-by-token count matrix used in the inspection cells below.
token_counts = CountVectorizer()
token_matrix = token_counts.fit_transform(texts)

#### try to select parameters for CountVectorizer

In [12]:
# Vectorizer + logistic regression chained so both can be tuned together.
pipeline_ = Pipeline(
    steps=[
        ("vectorizer", CountVectorizer()),
        ("classifier", LogisticRegression()),
    ]
)
# Names of every tunable parameter (displayed as the cell output).
pipeline_.get_params().keys()

#### select parameters from: 'vectorizer__max_df' : [0.85, 0.9, 0.95, 1.0],     'vectorizer__min_df' : [1, 10, 20, 30],      'vectorizer__ngram_range' : [(1, 1), (1, 2)]
#### vectorizer__max_df - discard words that appear in more than 85%, 90%, 95% or 100% of the documents; vectorizer__min_df - discard words that appear in fewer than 1, 10, 20 or 30 documents; vectorizer__ngram_range - build the vocabulary from single words only, or from single words plus bigrams

#### scoring = 'accuracy', 'roc_auc'

In [12]:
# Candidate values for the pipeline's "vectorizer" step, addressed with the
# <step>__<parameter> naming convention.
parameters_grid = dict(
    vectorizer__max_df=[0.85, 0.9, 0.95, 1.0],
    vectorizer__min_df=[1, 10, 20, 30],
    vectorizer__ngram_range=[(1, 1), (1, 2)],
)

In [22]:
%%time
grid_cv = GridSearchCV(pipeline_, parameters_grid, scoring = 'accuracy', cv = 4)
grid_cv.fit(texts, labels)

In [23]:
# Best score found by the accuracy-scored search and the parameters that achieved it.
print(grid_cv.best_score_)
print(grid_cv.best_params_)

In [14]:
# Same pipeline and grid, but candidates are ranked by ROC AUC.
grid_cv_ = GridSearchCV(
    pipeline_,
    parameters_grid,
    scoring='roc_auc',
    cv=4,
)

In [15]:
%%time
# Run the ROC-AUC-scored grid search on all reviews; %%time reports the cost.
grid_cv_.fit(texts, labels)

In [16]:
# Best score found by the ROC-AUC-scored search and the parameters that achieved it.
print(grid_cv_.best_score_)
print(grid_cv_.best_params_)

#### transform the reviews into a feature matrix (number of documents x number of words) in which each entry is the number of times that word occurs in that document

In [36]:
# Look at the row for the first review: its type, then its stored entries.
first_row = token_matrix[0]
print(type(first_row))
print(first_row)

#### 2000 reviews and 39659 distinct words

In [37]:
# Shape of the whole matrix, then of the first two rows.
print(token_matrix.shape)
for row_idx in (0, 1):
    print(token_matrix[row_idx].shape)

#### number of unique words in a particular document

In [38]:
# Stored (non-zero) entries in each of the first two rows.
for row_idx in (0, 1):
    print(token_matrix[row_idx].nnz)

In [39]:
# Densify only the first row so it can be viewed as a one-row DataFrame.
first_review_counts = pd.DataFrame(token_matrix[0].todense())
print(first_review_counts)

In [15]:
print("total amount of words in first review: ")
# Row sum of the counts = total number of counted tokens in the review.
first_review_counts = pd.DataFrame(token_matrix[0].todense())
print(first_review_counts.sum(axis=1))

In [16]:
print("unique words in first review: ")
# Cast counts to booleans so each present word contributes exactly one to the row sum.
first_review_presence = pd.DataFrame(token_matrix[0].todense()).astype(bool)
print(first_review_presence.sum(axis=1))

In [42]:
# Cross-validate a fresh count-vectorizer + logistic-regression pipeline
# with the estimator's default scoring.
count_logreg = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', LogisticRegression()),
])
print(cross_val_score(count_logreg, texts, labels))

In [43]:
# Same cross-validation, scored by ROC AUC.
count_logreg = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', LogisticRegression()),
])
print(cross_val_score(count_logreg, texts, labels, scoring='roc_auc'))

In [44]:
# Fit the count-vectorizer + logistic-regression pipeline on every review
# so its fitted steps can be inspected in the following cells.
clf_pipeline = Pipeline([
    ("vectorizer", CountVectorizer()),
    ("classifier", LogisticRegression()),
])
clf_pipeline.fit(texts, labels)

print(clf_pipeline)

In [69]:
# List the attributes and methods of the fitted pipeline.
print(dir(clf_pipeline))

In [83]:
# The pipeline's (name, estimator) pairs, in execution order.
print(clf_pipeline.steps)

In [86]:
# The second step is the ("classifier", estimator) pair; show the estimator.
step_name, fitted_step = clf_pipeline.steps[1]
print(fitted_step)

#### get_feature_names()   list of all words

In [22]:
# Reminder of how the objects used here were built in an earlier cell:
#   token_counts = CountVectorizer()
#   token_matrix = token_counts.fit_transform(texts)
feature_names = token_counts.get_feature_names()
print(feature_names[500:600])

#### Attributes: vocabulary_ : dict  A mapping of word terms to feature indices.

In [25]:
# Look up the feature index of each word in the same 500:600 slice.
for word in token_counts.get_feature_names()[500:600]:
    feature_index = token_counts.vocabulary_[word]
    print(feature_index, end=" ")

In [19]:
# Size of the word -> index mapping, then length of the feature-name list.
print(len(token_counts.vocabulary_))
print(len(token_counts.get_feature_names()))

In [48]:
# Class labels as exposed by the fitted pipeline.
labe = clf_pipeline.classes_
print(labe)

#### coefficients of each feature-word

In [87]:
# Coefficients of the pipeline's fitted "classifier" step.
_step_name, fitted_classifier = clf_pipeline.steps[1]
print(fitted_classifier.coef_)

In [58]:
# Fit a standalone logistic regression on the count matrix built earlier,
# so that coefficient positions line up with token_counts' feature indices.
classif = LogisticRegression()
classif.fit(token_matrix, labels)

In [61]:
# Coefficient array, then the number of coefficients in its first row.
coefficients = classif.coef_
print(coefficients)
print(len(coefficients[0]))

In [65]:
# Coefficients in ascending order: the first five are the most negative,
# the last five the most positive.
sorted_coeff = sorted(classif.coef_[0])
print(sorted_coeff[0:5])
# print() must be called to emit the blank separator line; a bare `print`
# is a no-op expression in Python 3.
print()
print(sorted_coeff[-5:])

#### find the most informative words for the negative and the positive class - the words with the most negative and the most positive coefficients

In [62]:
def most_informative_feature_for_binary_classification(vectorizer, classifier, n=10):
    """Print the words with the n most negative and n most positive coefficients.

    Parameters
    ----------
    vectorizer : object
        Must provide ``get_feature_names()`` returning the feature names in
        the same order as the classifier's coefficients.
    classifier : object
        Fitted binary linear model exposing ``classes_`` (two labels) and
        ``coef_``; only the first coefficient row, ``coef_[0]``, is used.
    n : int, default 10
        Number of features to print per class. Values <= 0 print no features.

    Output
    ------
    One ``label coefficient word`` line per feature: first the n smallest
    coefficients (labelled ``classes_[0]``, most negative first), then a blank
    line, then the n largest coefficients (labelled ``classes_[1]``, most
    positive first). Nothing is returned.
    """
    class_labels = classifier.classes_
    feature_names = vectorizer.get_feature_names()

    # Sort the (coefficient, word) pairs once and slice both ends.
    ranked = sorted(zip(classifier.coef_[0], feature_names))

    # Guard the slices: ranked[-0:] would be the whole list, not an empty one.
    n = max(n, 0)
    topn_class1 = ranked[:n]
    topn_class2 = ranked[-n:] if n else []

    for coef, feat in topn_class1:
        print(class_labels[0], coef, feat)

    print()

    # Reverse so the strongest positive feature is listed first.
    for coef, feat in reversed(topn_class2):
        print(class_labels[1], coef, feat)


# Report the ten (default n=10) most negative and ten most positive words
# of the logistic regression fitted on the count matrix.
most_informative_feature_for_binary_classification(token_counts, classif)

In [67]:
# Feature index of the word 'bad'. print is called as a function, matching
# the other cells; the Python 2 statement form is a SyntaxError in Python 3.
print(token_counts.vocabulary_[u'bad'])

### supplementary material

#### article from habr site: https://habr.com/post/264339/

In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB

In [4]:
# Restrict the 20 newsgroups data to four categories.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]
# Fixed random_state so the shuffled order is reproducible.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)
print(twenty_train.target_names)

In [5]:
# Number of posts and of file names in the training subset.
print(len(twenty_train.data))
print(len(twenty_train.filenames))

# First three lines of the first post.
first_post = twenty_train.data[0]
leading_lines = first_post.split("\n")[:3]
print("\n".join(leading_lines))

# Category name of the first post.
first_target = twenty_train.target[0]
print(twenty_train.target_names[first_target])

In [6]:
# Targets of the first ten training posts; each is an index into target_names.
twenty_train.target[:10]

In [7]:
# Translate the first ten targets into their category names.
for target_idx in twenty_train.target[:10]:
    category_name = twenty_train.target_names[target_idx]
    print(category_name)

In [12]:
# Learn the vocabulary of the training posts and count token occurrences per post.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
# Shape of the count matrix (displayed as the cell output).
X_train_counts.shape

In [13]:
# Feature index of the token 'algorithm'; .get yields None if it is absent.
print(count_vect.vocabulary_.get(u'algorithm'))

In [16]:
# Fit a tf-idf transformer on the raw counts and re-weight them,
# then print the shape of the resulting matrix.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_tfidf.shape)

In [18]:
# Train multinomial naive Bayes on the tf-idf features of the training posts.
clf_ = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

# New documents go through the already-fitted vectorizer and transformer
# (transform only, no refitting).
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf_.predict(X_new_tfidf)

# Pair each document with the name of its predicted category.
for position, doc in enumerate(docs_new):
    category = predicted[position]
    print('%r => %s' % (doc, twenty_train.target_names[category]))

In [21]:
# Counts -> tf-idf -> naive Bayes, bundled so raw text can be passed directly.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)

# Evaluate on the test subset of the same four categories.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)

# Fraction of test posts whose predicted target equals the true target.
is_correct = predicted == twenty_test.target
print(np.mean(is_correct))

In [22]:
# Same feature extraction, with an SGD-trained linear model (hinge loss,
# L2 penalty) as the final step.
sgd_classifier = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    n_iter=5,
    random_state=42,
)
text_clf_1 = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', sgd_classifier),
])
_ = text_clf_1.fit(twenty_train.data, twenty_train.target)
predicted = text_clf_1.predict(docs_test)

# Fraction of test posts whose predicted target equals the true target.
is_correct = predicted == twenty_test.target
print(np.mean(is_correct))

In [24]:
# Per-category report for the most recent predictions on the test subset.
report = metrics.classification_report(
    twenty_test.target,
    predicted,
    target_names=twenty_test.target_names,
)
print(report)

In [26]:
# Search space, keyed by <pipeline step>__<parameter>.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}
# n_jobs=-1 is passed through to GridSearchCV to parallelise the search.
gs_clf = GridSearchCV(text_clf_1, parameters, n_jobs=-1)

In [27]:
# Grid-search the naive Bayes pipeline on the first 400 training posts only,
# which keeps the search quick.
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(twenty_train.data[:400], twenty_train.target[:400])

# predict() returns an array with one entry per input document; take the
# single entry before using it to index the target_names list (a list cannot
# be indexed with an array).
predicted_category = gs_clf.predict(['God is love'])[0]
print(twenty_train.target_names[predicted_category])

In [28]:
# Pick the grid entry whose second field (the score) is highest, then list
# its parameter values in alphabetical order of parameter name.
best_parameters, score, _ = max(gs_clf.grid_scores_, key=lambda entry: entry[1])
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, best_parameters[param_name]))