In [1]:
from sklearn.datasets import fetch_20newsgroups_vectorized
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils.multiclass import unique_labels

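Plan: build a multinomial logistic regression over 5 categories ('sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', and 'na', a catch-all for every other newsgroup, encoded as -1). Compare it to a linear classifier trained with SGD on the hinge loss.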

In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline

In [3]:
# Parts of each post to strip before the text is handed to the models.
remove = ('headers', 'footers', 'quotes')

# Fetch the train split first, then the test split, with identical stripping.
train, test = (
    fetch_20newsgroups(subset=split, remove=remove)
    for split in ('train', 'test')
)

In [4]:
# Lookup from newsgroup name to its integer class id (its position in target_names).
{name: class_id for class_id, name in enumerate(train.target_names)}

{'alt.atheism': 0,
 'comp.graphics': 1,
 'comp.os.ms-windows.misc': 2,
 'comp.sys.ibm.pc.hardware': 3,
 'comp.sys.mac.hardware': 4,
 'comp.windows.x': 5,
 'misc.forsale': 6,
 'rec.autos': 7,
 'rec.motorcycles': 8,
 'rec.sport.baseball': 9,
 'rec.sport.hockey': 10,
 'sci.crypt': 11,
 'sci.electronics': 12,
 'sci.med': 13,
 'sci.space': 14,
 'soc.religion.christian': 15,
 'talk.politics.guns': 16,
 'talk.politics.mideast': 17,
 'talk.politics.misc': 18,
 'talk.religion.misc': 19}

In [5]:
def map_target(x, keep=(11, 12, 13, 14), other=-1):
    """Collapse an original 20-newsgroups class id into the reduced label set.

    Parameters
    ----------
    x : int
        Original class id.
    keep : collection of int, optional
        Class ids that pass through unchanged. Defaults to the four ``sci.*``
        groups: 11 'sci.crypt', 12 'sci.electronics', 13 'sci.med',
        14 'sci.space'.
    other : int, optional
        Label returned for every id that is not in ``keep`` (the catch-all
        class). Defaults to -1.

    Returns
    -------
    int
        ``x`` itself when it is in ``keep``, otherwise ``other``.
    """
    # One membership test replaces the chain of identical per-id branches.
    return x if x in keep else other

In [6]:
# Element-wise version of map_target for whole label arrays.
# Declaring the output type up front means np.vectorize does not have to infer
# it by calling map_target on the first element, and it also lets vmap accept
# empty arrays (which raise a ValueError when otypes is not set).
vmap = np.vectorize(map_target, otypes=[int])

In [7]:
# Relabel the training targets with the reduced scheme from map_target
# (ids 11-14 kept, every other id becomes -1); train.target itself is left untouched.
train.new_target = vmap(train.target)

In [8]:
# Apply the same relabelling to the test targets so both splits share one label set
# (ids 11-14 kept, every other id becomes -1); test.target itself is left untouched.
test.new_target = vmap(test.target)

In [9]:
# Token counts -> TF-IDF weights -> multinomial logistic regression.
logreg_pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(solver='lbfgs', multi_class='multinomial')),
])

# Train on the raw training documents against the collapsed labels;
# the fitted pipeline is the cell's displayed value.
logreg_pipeline.fit(X=train.data, y=train.new_target)



Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
   

In [10]:
# Hard class predictions for the test documents from the logistic-regression pipeline.
# Note: the name `preds` is reassigned further down by the SGD pipeline's predictions.
preds = logreg_pipeline.predict(X=test.data)
# Per-class probability estimates for the same documents.
preds_proba = logreg_pipeline.predict_proba(X=test.data)

In [11]:
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, roc_curve, confusion_matrix

In [12]:
# Weighted-average F1 of the logistic-regression predictions on the collapsed test labels.
f1_score(y_true=test.new_target, y_pred=preds, average='weighted')

0.7977787272552624

In [13]:
# Confusion matrix for the logistic-regression predictions:
# rows are true labels, columns are predicted labels, in sorted label order (-1, 11, 12, 13, 14).
confusion_matrix(y_true=test.new_target, y_pred=preds)

array([[5942,    1,    4,    1,    5],
       [ 273,  120,    3,    0,    0],
       [ 344,    1,   48,    0,    0],
       [ 302,    0,    1,   93,    0],
       [ 270,    0,    1,    1,  122]])

In [14]:
# Same bag-of-words -> TF-IDF features as the logistic-regression pipeline,
# but with a linear model trained by SGD on the hinge loss.
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # SGD training is stochastic; pin the seed so the F1 score and confusion
    # matrix reported below are reproducible after Restart & Run All.
    ('clf', SGDClassifier(loss='hinge', random_state=42)),
])

In [15]:
# Train the SGD pipeline on the raw training documents against the collapsed labels;
# the fitted pipeline is the cell's displayed value.
pipeline.fit(X=train.data, y=train.new_target)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                               max_iter=1000,

In [16]:
# Hard class predictions from the SGD pipeline.
# This overwrites the `preds` produced earlier by the logistic-regression pipeline.
preds = pipeline.predict(X=test.data)

In [17]:
# Weighted-average F1 of the SGD pipeline's predictions on the collapsed test labels.
f1_score(y_true=test.new_target, y_pred=preds, average='weighted')

0.8587850016405336

In [18]:
# Confusion matrix for the SGD pipeline's predictions:
# rows are true labels, columns are predicted labels, in sorted label order (-1, 11, 12, 13, 14).
confusion_matrix(y_true=test.new_target, y_pred=preds)

array([[5923,    4,   12,    5,    9],
       [ 193,  201,    2,    0,    0],
       [ 284,    3,  101,    2,    3],
       [ 211,    0,    3,  181,    1],
       [ 177,    0,    1,    3,  213]])