In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold

from sklearn.naive_bayes import MultinomialNB

from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score, f1_score


# Pipeline

In [11]:
# Load the newsgroup posts spreadsheet (relative path, resolved against the
# current working directory).
df = pd.read_excel('dataset/20_newgroups.xlsx')


def _decode_escapes(cell):
    """Turn literal backslash escapes in a string cell into real characters.

    Non-string cells (numbers, NaN, ...) are returned unchanged.
    """
    if not isinstance(cell, str):
        return cell
    # NOTE(review): the "unicode_escape" codec interprets the encoded bytes as
    # Latin-1, so non-ASCII text in a cell would come back garbled — confirm
    # the spreadsheet only holds ASCII plus backslash escapes.
    return bytes(cell, "utf-8").decode("unicode_escape")


# this is a quick fixup for illegal characters
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map.
# Check the class (not the instance) so a column named "map" cannot shadow it,
# and fall back to applymap on older pandas versions.
_apply_elementwise = df.map if hasattr(pd.DataFrame, "map") else df.applymap
df = _apply_elementwise(_decode_escapes)

In [15]:
# Labels are the 'target' column and features the raw 'content' text,
# both coerced to plain strings.
y = df['target'].astype(str).values
X = df['content'].astype(str).values

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1235
)

# Baseline model: bag-of-words counts fed into multinomial Naive Bayes.
clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])

# fit() returns the fitted pipeline, so training and prediction can be chained.
y_pred = clf.fit(X_train, y_train).predict(X_test)

In [16]:
# Accuracy by hand: the fraction of test rows whose prediction equals the true label.
np.mean(y_pred == y_test)

0.8453380468404772

# Classification Report

In [18]:
# Per-class precision / recall / F1 on the held-out test rows.
print(classification_report(y_true=y_test, y_pred=y_pred))

                          precision    recall  f1-score   support

             alt.atheism       0.96      0.90      0.93       100
           comp.graphics       0.73      0.81      0.77       109
 comp.os.ms-windows.misc       0.89      0.26      0.40       123
comp.sys.ibm.pc.hardware       0.55      0.80      0.65       116
   comp.sys.mac.hardware       0.87      0.74      0.80       113
          comp.windows.x       0.74      0.91      0.82       122
            misc.forsale       0.85      0.63      0.73       104
               rec.autos       0.86      0.94      0.90       109
         rec.motorcycles       0.96      0.92      0.94       132
      rec.sport.baseball       0.97      0.93      0.95       139
        rec.sport.hockey       0.95      0.96      0.96       126
               sci.crypt       0.86      0.96      0.90       123
         sci.electronics       0.86      0.74      0.80       116
                 sci.med       0.93      0.96      0.95       104
         

In [19]:
# Library accuracy on the held-out test rows.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8453380468404772

# Try Other Combinations

In [20]:
# Re-extract labels and raw text as string arrays for the experiments below.
y = df['target'].astype(str).values
X = df['content'].astype(str).values

In [21]:
# Same 80/20 split (fixed seed) used throughout, so scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1235
)

# Experiment: raw token counts + multinomial Naive Bayes.
clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])
y_pred = clf.fit(X_train, y_train).predict(X_test)

accuracy_score(y_true=y_test, y_pred=y_pred)

0.8453380468404772

In [23]:
# Same 80/20 split (fixed seed) used throughout, so scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1235
)

# Experiment: default TF-IDF weighting + multinomial Naive Bayes.
clf = Pipeline(steps=[
    ('vect', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])
y_pred = clf.fit(X_train, y_train).predict(X_test)

accuracy_score(y_true=y_test, y_pred=y_pred)

0.8435704816615113

In [24]:
# Same 80/20 split (fixed seed) used throughout, so scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1235
)

# Experiment: TF-IDF over unigrams and bigrams, ignoring terms that appear in
# fewer than 5 documents or in more than 80% of them, + multinomial Naive Bayes.
tfidf = TfidfVectorizer(min_df=5, max_df=0.8, ngram_range=(1, 2))
clf = Pipeline(steps=[
    ('vect', tfidf),
    ('clf', MultinomialNB()),
])
y_pred = clf.fit(X_train, y_train).predict(X_test)

accuracy_score(y_true=y_test, y_pred=y_pred)

0.861246133451171

# Try Other Classifiers

In [26]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

In [27]:
# Experiment: tuned TF-IDF features (unigrams + bigrams, min_df=5, max_df=0.8)
# with a random forest classifier.
clf = Pipeline([
    ('vect', TfidfVectorizer(min_df=5, max_df=0.8, ngram_range=(1, 2))),
    # Random forests bootstrap rows and sample features at random; without a
    # seed the accuracy below changes on every run. Reuse the notebook's seed.
    ('clf', RandomForestClassifier(random_state=1235))
])
# Same 80/20 split (fixed seed) as the other experiments, so scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y,  test_size=0.2, random_state=1235)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.7317719840919134

In [28]:
# Experiment: tuned TF-IDF features (unigrams + bigrams, min_df=5, max_df=0.8)
# with a linear model trained by stochastic gradient descent.
clf = Pipeline([
    ('vect', TfidfVectorizer(min_df=5, max_df=0.8, ngram_range=(1, 2))),
    # SGD shuffles the training data each epoch; without a seed the accuracy
    # below changes on every run. Reuse the notebook's seed.
    ('clf', SGDClassifier(random_state=1235))
])
# Same 80/20 split (fixed seed) as the other experiments, so scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y,  test_size=0.2, random_state=1235)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)



0.9186920017675652

In [32]:
# Same 80/20 split (fixed seed) used throughout, so scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1235
)

# Experiment: tuned TF-IDF features + k-nearest-neighbours with default settings.
tfidf = TfidfVectorizer(min_df=5, max_df=0.8, ngram_range=(1, 2))
clf = Pipeline(steps=[
    ('vect', tfidf),
    ('clf', KNeighborsClassifier()),
])
y_pred = clf.fit(X_train, y_train).predict(X_test)

accuracy_score(y_true=y_test, y_pred=y_pred)

0.7551922227132125

In [33]:
# What if the results we get are just due to a LUCKY or UNLUCKY train test split?

# Cross Validation

In [36]:
def evaluate_cross_validation(clf, X, y, K, random_state=1235):
    """Score ``clf`` with shuffled K-fold cross-validation and print the results.

    Parameters
    ----------
    clf : estimator
        Model (or pipeline) handed unchanged to ``cross_val_score``.
    X, y : array-like
        Samples and their labels.
    K : int
        Number of folds.
    random_state : int or None, default 1235
        Seed for the fold shuffling, so repeated runs evaluate on the same
        folds. The default matches the seed this notebook passes to
        ``train_test_split``. Pass ``None`` to get a fresh shuffle per call.

    Returns
    -------
    The per-fold scores exactly as returned by ``cross_val_score``. They are
    also printed, followed by their mean.
    """
    # Shuffle before splitting so folds do not depend on row order; the seed
    # makes that shuffle repeatable.
    cv = KFold(n_splits=K, shuffle=True, random_state=random_state)
    # n_jobs=-1 lets scikit-learn evaluate the folds in parallel on all cores.
    scores = cross_val_score(clf, X, y, cv=cv, n_jobs=-1)
    print(scores)
    print("Mean score: {0:.3f} ".format(np.mean(scores)))
    return scores

In [41]:
# Cross-validate the best single-split model (tuned TF-IDF + SGD) on the full
# data set with 4 folds, to check the score was not an artefact of one split.
clf = Pipeline([
    ('vect', TfidfVectorizer(min_df=5, max_df=0.8, ngram_range=(1, 2))),
    # Seed the classifier so its own shuffling does not add run-to-run noise
    # to the fold scores.
    ('clf', SGDClassifier(random_state=1235))
])
evaluate_cross_validation(clf, X, y, 4)

[0.91021562 0.91728526 0.92149929 0.92149929]
Mean score: 0.918 


# Summary

In [None]:
# Summary of what this notebook covered:
# - pipeline (mix and match)
# - multiple classifiers
# - multiple features
# - reports
# - accuracy
# - f1-score