**[ Section 1.0 ] Import packages and workspase setting**

In [1]:
## Features
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import HashingVectorizer
from gensim.models import Doc2Vec

## Models
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

## Tools for loading/ evaluation
from sklearn.datasets import load_files
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn import metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn import utils

## Other modules
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models.doc2vec import TaggedDocument
import numpy as np
import gensim
import re

**[ Section 1.1 ] Loading data**

In [2]:
news20groups = load_files('./20_newsgroup', encoding='latin-1', shuffle = True, random_state=42)
print ('Number of documents  = ' , len(news20groups.data))
print ('Number of categories = ', len(news20groups.target_names))

Number of documents  =  19997
Number of categories =  20


**[ Section 1.2 ] Important functions and declarations**

In [3]:
## Scoring metrics array.
scoring = {'precision': 'precision_macro',
           'recall': 'recall_macro',
           'F1_score': 'f1_macro',
           'accuracy': make_scorer(accuracy_score)}

def printScores (scores):
    print ('recall = ', np.mean(scores['test_recall']))
    print ('precision = ', np.mean(scores['test_precision']))
    print ('F1_score = ', np.mean(scores['test_F1_score']))
    print ('accuracy = ', np.mean(scores['test_accuracy']))

**[ Section 2.1 ] Bag of words feature using CountVectorizer**

SVM with linear kernel (SGDClassifier) is used to clasify which is the last argument to the pipeline.

In [4]:
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None)),
])

scores = cross_validate(text_clf, news20groups.data, news20groups.target, scoring=scoring, cv=10, return_train_score=False)
printScores(scores)



recall =  0.9123494949494949
precision =  0.9249523597217981
F1_score =  0.9096568264778186
accuracy =  0.912337468734367


**[ Section 2.2 ] TF-iDF feature using TfidfTransformer (with Naive bayes)**

Count of BoW model is used as the first transformer. Second transformer is the Tf-idf and Naive bayes classifier (MultinomialNB) is the last argument to the pipeline.

In [5]:
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),])

scores = cross_validate(text_clf, news20groups.data, news20groups.target, scoring=scoring, cv=10, return_train_score=False)
printScores(scores)

recall =  0.88735
precision =  0.8882495071171721
F1_score =  0.8852631713940566
accuracy =  0.8873319409704852


**[ Section 2.3 ] TF-iDF feature using TfidfTransformer (with SVM)**

Use of SGDClassifier (Linear SVM with Stochastic gradient decent optimizer)

In [6]:
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None)),])

scores = cross_validate(text_clf, news20groups.data, news20groups.target, scoring=scoring, cv=10, return_train_score=False)
printScores(scores)



recall =  0.9362994949494947
precision =  0.936986663073885
F1_score =  0.934926206514114
accuracy =  0.9362900450225112


**[ Section 2.4 ] TF-iDF feature using TfidfTransformer (with Random forest)**

Random forest classifier with default parameter is used to classify

In [7]:
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier()),])

scores = cross_validate(text_clf, news20groups.data, news20groups.target, scoring=scoring, cv=10, return_train_score=False)
printScores(scores)



recall =  0.8274984848484849
precision =  0.8274553438548822
F1_score =  0.8255827077650626
accuracy =  0.8274755127563782


**[ Section 2.5 ] Hashing vector feature (with SVM)**

In [8]:
text_clf = Pipeline([
    ('hash', HashingVectorizer(n_features=2 ** 20)),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None)),])

scores = cross_validate(text_clf, news20groups.data, news20groups.target, scoring=scoring, cv=10, return_train_score=False)
printScores(scores)



recall =  0.9291494949494948
precision =  0.9339017025226699
F1_score =  0.9279595188641487
accuracy =  0.9291390445222613


**[ Section 2.6 ] Hashing vector feature integrated to TF-Idf (with SVM)**

SVM Classifier is used.

In [9]:
text_clf = Pipeline([
    ('hash', HashingVectorizer(n_features=2 ** 20)),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None)),])

scores = cross_validate(text_clf, news20groups.data, news20groups.target, scoring=scoring, cv=10, return_train_score=False)
printScores(scores)



recall =  0.936299494949495
precision =  0.9369787419537339
F1_score =  0.9349424652994628
accuracy =  0.9362900700350176


**[ Section 2.7 ] Doc2vec model with logistic regressoin**

In [10]:
# Doc2Vec model training example were taken from following article.
# https://towardsdatascience.com/multi-class-text-classification-model-comparison-and-selection-5eb066197568
def label_sentences(corpus, label_type):
    """
    Gensim's Doc2Vec implementation requires each document/paragraph to have a label associated with it.
    We do this by using the TaggedDocument method. The format will be "TRAIN_i" or "TEST_i" where "i" is
    a dummy index of the post.
    """
    labeled = []
    for i, v in enumerate(corpus):
        label = label_type + '_' + str(i)
        labeled.append(TaggedDocument(v.split(), [label]))
    return labeled

X_train, X_test, y_train, y_test = train_test_split(news20groups.data, news20groups.target, random_state=0, test_size=0.3)
X_train = label_sentences(X_train, 'Train')
X_test = label_sentences(X_test, 'Test')
all_data = X_train + X_test

In [11]:
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, min_count=1, alpha=0.065, min_alpha=0.065)
model_dbow.build_vocab([x for x in tqdm(all_data)])

for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(all_data)]), total_examples=len(all_data), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|████████████████████████████████████████████████████████████████████████████████████████| 19997/19997 [00:00<00:00, 1000447.27it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████| 19997/19997 [00:00<00:00, 697510.10it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████| 19997/19997 [00:00<00:00, 661410.75it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████| 19997/19997 [00:00<00:00, 518163.04it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████| 19997/19997 [00:00<00:00, 647586.78it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████| 19997/19997 [00:00<00:00, 714631.98it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████| 19997/19997 [00:00<00:00, 925970.67it/s]
100%|████████████████████████████████████

In [12]:
def get_vectors(model, corpus_size, vectors_size, vectors_type):
    """
    Get vectors from trained doc2vec model
    :param doc2vec_model: Trained Doc2Vec model
    :param corpus_size: Size of the data
    :param vectors_size: Size of the embedding vectors
    :param vectors_type: Training or Testing vectors
    :return: list of vectors
    """
    vectors = np.zeros((corpus_size, vectors_size))
    for i in range(0, corpus_size):
        prefix = vectors_type + '_' + str(i)
        vectors[i] = model.docvecs[prefix]
    return vectors
    
train_vectors_dbow = get_vectors(model_dbow, len(X_train), 300, 'Train')
test_vectors_dbow = get_vectors(model_dbow, len(X_test), 300, 'Test')

In [13]:
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(train_vectors_dbow, y_train)
logreg = logreg.fit(train_vectors_dbow, y_train)
y_pred = logreg.predict(test_vectors_dbow)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=news20groups.target_names))



accuracy 0.8425
                          precision    recall  f1-score   support

             alt.atheism       0.76      0.77      0.77       303
           comp.graphics       0.78      0.77      0.78       315
 comp.os.ms-windows.misc       0.85      0.83      0.84       320
comp.sys.ibm.pc.hardware       0.84      0.83      0.83       314
   comp.sys.mac.hardware       0.87      0.91      0.89       291
          comp.windows.x       0.87      0.87      0.87       296
            misc.forsale       0.84      0.86      0.85       304
               rec.autos       0.90      0.84      0.87       309
         rec.motorcycles       0.93      0.91      0.92       285
      rec.sport.baseball       0.96      0.97      0.97       299
        rec.sport.hockey       0.98      0.97      0.98       311
               sci.crypt       0.91      0.89      0.90       286
         sci.electronics       0.80      0.82      0.81       272
                 sci.med       0.89      0.89      0.89    

**[ Section 2.8 ] Doc2vec model with SVM classifier**

In [14]:
sgd = SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None)
sgd.fit(train_vectors_dbow, y_train)
sgd = sgd.fit(train_vectors_dbow, y_train)
y_pred = sgd.predict(test_vectors_dbow)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=news20groups.target_names))



accuracy 0.8721666666666666
                          precision    recall  f1-score   support

             alt.atheism       0.75      0.87      0.81       303
           comp.graphics       0.82      0.81      0.82       315
 comp.os.ms-windows.misc       0.89      0.84      0.87       320
comp.sys.ibm.pc.hardware       0.85      0.89      0.87       314
   comp.sys.mac.hardware       0.87      0.95      0.91       291
          comp.windows.x       0.93      0.92      0.93       296
            misc.forsale       0.88      0.84      0.86       304
               rec.autos       0.93      0.92      0.92       309
         rec.motorcycles       0.96      0.96      0.96       285
      rec.sport.baseball       0.96      0.98      0.97       299
        rec.sport.hockey       0.94      0.99      0.96       311
               sci.crypt       0.90      0.97      0.93       286
         sci.electronics       0.88      0.82      0.85       272
                 sci.med       0.94      0.90  