In [89]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

<font size=6>1.1 Overview</font>


In [90]:
# Question 1.1. able to addresses all 20 categories.
# extract dataset
twenty_train = fetch_20newsgroups(data_home=None, subset='train', categories=None, shuffle=True, random_state=123)
twenty_test = fetch_20newsgroups(data_home=None, subset='test', categories=None, shuffle=True, random_state=321)


# printing overview
print('Number of files/documents: ', len(twenty_train.data))
print('Number of targets/categories: ', len(twenty_train.target_names))


Number of files/documents:  11314
Number of targets/categories:  20


<font size=6>1.2.1 Naïve Bayes with three types of feature (counts, tf, tfidf)</font>

In [131]:
# Question 1.2. using three classifiers/types of feature on this multi-class classification task with four distinct paramerters.

dict_results = dict()

def Naive_bayes(train, test, lowercase=True, stopwords=None, ngram=(1,1), analyzer='word', max_features=None, print_report=False):

    lst_results_train = []
    lst_results_test = []
    
    # (1) feature: counts
    NB_clf_counts = Pipeline([
        ("vect", CountVectorizer(lowercase=lowercase, stop_words=stopwords, ngram_range=ngram, analyzer=analyzer, max_features=max_features)), 
        ("clf", MultinomialNB())
        ])
    NB_clf_counts.fit(train.data, train.target)
    train_pred = NB_clf_counts.predict(train.data)
    test_pred = NB_clf_counts.predict(test.data)

    # get scores
    tr, tst = scores(train.target, train_pred, test.target, test_pred)
    lst_results_train.append(tr)
    lst_results_test.append(tst)
    
    # get results in detail
    if print_report == True:
        report(train, train_pred, 'train', 'counts')
        report(test, test_pred, 'test', 'counts')

    # (2) feature: tf
    NB_clf_tf = Pipeline([
        ("vect", CountVectorizer(lowercase=lowercase, stop_words=stopwords, ngram_range=ngram, analyzer=analyzer, max_features=max_features)), 
        ('tf',TfidfTransformer(use_idf=False)),
        ("clf", MultinomialNB())
        ])
    NB_clf_tf.fit(train.data, train.target)
    train_pred = NB_clf_tf.predict(train.data)
    test_pred = NB_clf_tf.predict(test.data)

    # get scores
    tr, tst = scores(train.target, train_pred, test.target, test_pred)
    lst_results_train.append(tr)
    lst_results_test.append(tst)
    
    # get results in detail
    if print_report == True:
        report(train, train_pred, 'train', 'tf')
        report(test, test_pred, 'test', 'tf')

    # (3) feature: tf-idf
    NB_clf_tfidf = Pipeline([
        ("vect", CountVectorizer(lowercase=lowercase, stop_words=stopwords, ngram_range=ngram, analyzer=analyzer, max_features=max_features)), 
        ('tf',TfidfTransformer(use_idf=True)),
        ("clf", MultinomialNB())
        ])
    NB_clf_tfidf.fit(train.data, train.target)
    train_pred = NB_clf_tfidf.predict(train.data)
    test_pred = NB_clf_tfidf.predict(test.data)

    # get scores
    tr, tst = scores(train.target, train_pred, test.target, test_pred)
    lst_results_train.append(tr)
    lst_results_test.append(tst)

    # get results in detail
    if print_report == True:
        report(train, train_pred, 'train', 'tfidf')
        report(test, test_pred, 'test', 'tfidf')
    
    dict_results['NB_train'] = lst_results_train
    dict_results['NB_test'] = lst_results_test

<font size=6>1.2.2 Support Vector Machine with three types of feature (counts, tf, tfidf) </font>

In [138]:
def SVM(train, test, print_report = False):
    
    lst_results_train = []
    lst_results_test = []

    # (1) feature: counts
    SVM_clf_counts = Pipeline([
        ("vect", CountVectorizer()), 
        ("clf", SGDClassifier(loss='hinge', penalty='l2',
                           alpha=1e-3, random_state=789,
                          max_iter=5, tol=None))
        ])
    SVM_clf_counts.fit(train.data, train.target)
    train_pred = SVM_clf_counts.predict(train.data)
    test_pred = SVM_clf_counts.predict(test.data)
    
    # get scores
    tr, tst = scores(train.target, train_pred, test.target, test_pred)
    lst_results_train.append(tr)
    lst_results_test.append(tst)
    
    if print_report == True:
        report(train, train_pred, 'train', 'counts')
        report(test, test_pred, 'test', 'counts')

    
    # (2) feature: tf
    SVM_clf_tf = Pipeline([
        ("vect", CountVectorizer()), 
        ('tf',TfidfTransformer(use_idf=False)),
        ("clf", SGDClassifier(loss='hinge', penalty='l2',
                           alpha=1e-3, random_state=789,
                          max_iter=5, tol=None))
        ])
    SVM_clf_tf.fit(train.data, train.target)
    train_pred = SVM_clf_tf.predict(train.data)
    test_pred = SVM_clf_tf.predict(test.data)
    
    # get scores
    tr, tst = scores(train.target, train_pred, test.target, test_pred)
    lst_results_train.append(tr)
    lst_results_test.append(tst)

    
    if print_report == True:
        report(train, train_pred, 'train', 'tf')
        report(test, test_pred, 'test', 'tf')

    # (3) feature: tf-idf
    SVM_clf_tfidf = Pipeline([
        ("vect", CountVectorizer()), 
        ('tf',TfidfTransformer(use_idf=True)),
        ("clf", SGDClassifier(loss='hinge', penalty='l2',
                           alpha=1e-3, random_state=789,
                          max_iter=5, tol=None))
        ])
    SVM_clf_tfidf.fit(train.data, train.target)
    train_pred = SVM_clf_tfidf.predict(train.data)
    test_pred = SVM_clf_tfidf.predict(test.data)
    
    # get scores
    tr, tst = scores(train.target, train_pred, test.target, test_pred)
    lst_results_train.append(tr)
    lst_results_test.append(tst)

    if print_report == True:
        report(train, train_pred, 'train', 'tfidf')
        report(test, test_pred, 'test', 'tfidf')
    
    dict_results['SVM_train'] = lst_results_train
    dict_results['SVM_test'] = lst_results_test

<font size=6>1.2.3 RandomForest with three types of feature (counts, tf, tfidf)</font>

In [140]:
def RandomForest(train, test, print_report=False):

    lst_results_train = []
    lst_results_test = []
    
    # (1) feature: counts
    RF_clf_counts = Pipeline([
        ("vect", CountVectorizer()), 
        ("clf",RandomForestClassifier())
        ])
    RF_clf_counts.fit(train.data, train.target)
    train_pred = RF_clf_counts.predict(train.data)
    test_pred = RF_clf_counts.predict(test.data)

    # get scores
    tr, tst = scores(train.target, train_pred, test.target, test_pred)
    lst_results_train.append(tr)
    lst_results_test.append(tst)

    if print_report == True:
            report(train, train_pred, 'train', 'counts')
            report(test, test_pred, 'test', 'counts')

    # (2) feature: tf
    RF_clf_tf = Pipeline([
        ("vect", CountVectorizer()), 
        ('tf',TfidfTransformer(use_idf=False)),
        ("clf",RandomForestClassifier())
        ])
    RF_clf_tf.fit(train.data, train.target)
    train_pred = RF_clf_tf.predict(train.data)
    test_pred = RF_clf_tf.predict(test.data)

    # get scores
    tr, tst = scores(train.target, train_pred, test.target, test_pred)
    lst_results_train.append(tr)
    lst_results_test.append(tst)

    if print_report == True:
        report(train, train_pred, 'train', 'tf')
        report(test, test_pred, 'test', 'tf')

    # (3) feature: tf-idf
    RF_clf_counts = Pipeline([
        ("vect", CountVectorizer()), 
        ('tf',TfidfTransformer(use_idf=True)),
        ("clf",RandomForestClassifier())
        ])
    RF_clf_counts.fit(train.data, train.target)
    train_pred = RF_clf_counts.predict(train.data)
    test_pred = RF_clf_counts.predict(test.data)
    
    # get scores
    tr, tst = scores(train.target, train_pred, test.target, test_pred)
    lst_results_train.append(tr)
    lst_results_test.append(tst)

    if print_report == True:
            report(train, train_pred, 'train', 'tfidf')
            report(test, test_pred, 'test', 'tfidf')


    dict_results['RF_train'] = lst_results_train
    dict_results['RF_test'] = lst_results_test

In [135]:
def scores(label_train, pred_train, label_test, pred_test):

    # train
    acc_tr = np.mean(pred_train == label_train)
    f1_tr = f1_score(label_train, pred_train, average='macro')
    pr_tr = precision_score(label_train, pred_train, average='macro')
    recall_tr = recall_score(label_train, pred_train,  average='macro')

    # test
    acc_tst = np.mean(pred_test == label_test)
    f1_tst = f1_score(label_test, pred_test, average='macro')
    pr_tst = precision_score(label_test, pred_test, average='macro')
    recall_tst = recall_score(label_test, pred_test,  average='macro')
    return [acc_tr, f1_tr, pr_tr, recall_tr], [acc_tst, f1_tst, pr_tst, recall_tst]

1.2 Score function (ACC, F1, RECALL, PRECISION)

In [136]:
# Question 1.2.4 score function
def report(dataset, pred, data_type, feature_type):
    # clf_results_report
    print(data_type, feature_type)
    print(metrics.classification_report(dataset.target, pred, target_names=dataset.target_names))
    # clf_results_confusion_matrix
    # plt.figure(figsize=(20,25))
    # cm = confusion_matrix(dataset.target, pred)
    # disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=dataset.target_names)
    # disp.plot()
    return metrics.classification_report(dataset.target, pred, target_names=dataset.target_names)

<font size=6> 1.2 Results_Table </font>

In [142]:
# acc (train)
Naive_bayes(twenty_train, twenty_test)
SVM(twenty_train, twenty_test)
RandomForest(twenty_train, twenty_test)

acc_nb = [item[0] for item in dict_results['NB_train']]
acc_svm = [item[0] for item in dict_results['SVM_train']]
acc_rf = [item[0] for item in dict_results['RF_train']]

df = pd.DataFrame({'Naive Bayes': acc_nb, 'SVM': acc_svm, 'RandomForest': acc_rf}, index=['counts', 'tf', 'tf-idf'])
print('>>>Accuracy')
print('Train:')
print(df)
print()

# acc (test)
acc_nb = [item[0] for item in dict_results['NB_test']]
acc_svm = [item[0] for item in dict_results['SVM_test']]
acc_rf = [item[0] for item in dict_results['RF_test']]

df = pd.DataFrame({'Naive Bayes': acc_nb, 'SVM': acc_svm, 'RandomForest': acc_rf}, index=['counts', 'tf', 'tf-idf'])
print('Test:')
print(df)
print()



>>>Accuracy
Train:
        Naive Bayes       SVM  RandomForest
counts     0.924518  0.982941      0.999912
tf         0.843910  0.916210      0.999912
tf-idf     0.932650  0.965353      0.999912

Test:
        Naive Bayes       SVM  RandomForest
counts     0.772836  0.756240      0.766729
tf         0.705258  0.774960      0.756771
tf-idf     0.773898  0.823686      0.757833



In [144]:
# f1-score (train)
Naive_bayes(twenty_train, twenty_test)
SVM(twenty_train, twenty_test)
RandomForest(twenty_train, twenty_test)

f1_nb = [item[1] for item in dict_results['NB_train']]
f1_svm = [item[1] for item in dict_results['SVM_train']]
f1_rf = [item[1] for item in dict_results['RF_train']]

df = pd.DataFrame({'Naive Bayes': f1_nb, 'SVM': f1_svm, 'RandomForest': f1_rf}, index=['counts', 'tf', 'tf-idf'])
print('>>>F1-score')
print('Train:')
print(df)
print()

# f1-score (test)
f1_nb = [item[1] for item in dict_results['NB_test']]
f1_svm = [item[1] for item in dict_results['SVM_test']]
f1_rf = [item[1] for item in dict_results['RF_test']]

df = pd.DataFrame({'Naive Bayes': f1_nb, 'SVM': f1_svm, 'RandomForest': f1_rf}, index=['counts', 'tf', 'tf-idf'])
print('Test:')
print(df)
print()

>>>F1-score
Train:
        Naive Bayes       SVM  RandomForest
counts     0.910492  0.981577      0.999916
tf         0.811418  0.913249      0.999916
tf-idf     0.919829  0.962300      0.999916

Test:
        Naive Bayes       SVM  RandomForest
counts     0.745098  0.744843      0.755293
tf         0.672783  0.762184      0.743052
tf-idf     0.755754  0.810701      0.750656



In [145]:
# precision (train)
Naive_bayes(twenty_train, twenty_test)
SVM(twenty_train, twenty_test)
RandomForest(twenty_train, twenty_test)

pr_nb = [item[2] for item in dict_results['NB_train']]
pr_svm = [item[2] for item in dict_results['SVM_train']]
pr_rf = [item[2] for item in dict_results['RF_train']]

df = pd.DataFrame({'Naive Bayes': pr_nb, 'SVM': pr_svm, 'RandomForest': pr_rf}, index=['counts', 'tf', 'tf-idf'])
print('>>>Precision')
print('Train:')
print(df)
print()

# precision (test)
pr_nb = [item[2] for item in dict_results['NB_test']]
pr_svm = [item[2] for item in dict_results['SVM_test']]
pr_rf = [item[2] for item in dict_results['RF_test']]

df = pd.DataFrame({'Naive Bayes': pr_nb, 'SVM': pr_svm, 'RandomForest': pr_rf}, index=['counts', 'tf', 'tf-idf'])
print('Test:')
print(df)
print()

>>>Precision
Train:
        Naive Bayes       SVM  RandomForest
counts     0.934817  0.982938      0.999916
tf         0.892994  0.921196      0.999916
tf-idf     0.945940  0.967068      0.999916

Test:
        Naive Bayes       SVM  RandomForest
counts     0.762163  0.758820      0.774657
tf         0.792431  0.774403      0.762383
tf-idf     0.825531  0.826490      0.766009



In [146]:
# recall (train)
Naive_bayes(twenty_train, twenty_test)
SVM(twenty_train, twenty_test)
RandomForest(twenty_train, twenty_test)

recall_nb = [item[3] for item in dict_results['NB_train']]
recall_svm = [item[3] for item in dict_results['SVM_train']]
recall_rf = [item[3] for item in dict_results['RF_train']]

df = pd.DataFrame({'Naive Bayes': recall_nb, 'SVM': recall_svm, 'RandomForest': recall_rf}, index=['counts', 'tf', 'tf-idf'])
print('>>>Recall')
print('Train:')
print(df)
print()

# recall (test)
recall_nb = [item[3] for item in dict_results['NB_test']]
recall_svm = [item[3] for item in dict_results['SVM_test']]
recall_rf = [item[3] for item in dict_results['RF_test']]

df = pd.DataFrame({'Naive Bayes': recall_nb, 'SVM': recall_svm, 'RandomForest': recall_rf}, index=['counts', 'tf', 'tf-idf'])
print('Test:')
print(df)
print()

>>>Recall
Train:
        Naive Bayes       SVM  RandomForest
counts     0.922883  0.980575      0.999916
tf         0.819943  0.910769      0.999916
tf-idf     0.919103  0.960484      0.999916

Test:
        Naive Bayes       SVM  RandomForest
counts     0.763646  0.745118      0.754594
tf         0.682195  0.763804      0.745149
tf-idf     0.756525  0.812133      0.745164



<font size=6>1.2 Classification report </font>

In [147]:
Naive_bayes(twenty_train, twenty_test, print_report=True) # classification_report

train counts
                          precision    recall  f1-score   support

             alt.atheism       0.93      0.98      0.95       480
           comp.graphics       0.85      0.97      0.90       584
 comp.os.ms-windows.misc       0.98      0.14      0.25       591
comp.sys.ibm.pc.hardware       0.70      0.96      0.81       590
   comp.sys.mac.hardware       0.95      0.98      0.97       578
          comp.windows.x       0.80      0.98      0.88       593
            misc.forsale       0.97      0.85      0.91       585
               rec.autos       0.96      0.98      0.97       594
         rec.motorcycles       0.99      0.98      0.99       598
      rec.sport.baseball       1.00      0.99      0.99       597
        rec.sport.hockey       0.98      0.99      0.98       600
               sci.crypt       0.92      0.99      0.95       595
         sci.electronics       0.95      0.96      0.96       591
                 sci.med       0.98      0.98      0.98       

In [148]:
SVM(twenty_train, twenty_test, print_report=True)

train counts
                          precision    recall  f1-score   support

             alt.atheism       0.99      0.94      0.96       480
           comp.graphics       0.96      0.99      0.97       584
 comp.os.ms-windows.misc       0.99      0.98      0.99       591
comp.sys.ibm.pc.hardware       0.99      0.97      0.98       590
   comp.sys.mac.hardware       0.99      0.99      0.99       578
          comp.windows.x       1.00      1.00      1.00       593
            misc.forsale       0.98      0.99      0.99       585
               rec.autos       0.99      1.00      0.99       594
         rec.motorcycles       0.99      0.99      0.99       598
      rec.sport.baseball       0.98      1.00      0.99       597
        rec.sport.hockey       0.99      1.00      1.00       600
               sci.crypt       0.99      1.00      0.99       595
         sci.electronics       0.95      0.99      0.97       591
                 sci.med       0.98      0.99      0.99       

In [84]:
RandomForest(twenty_train, twenty_test, print_report=True)

train counts
                          precision    recall  f1-score   support

             alt.atheism       1.00      1.00      1.00       480
           comp.graphics       1.00      1.00      1.00       584
 comp.os.ms-windows.misc       1.00      1.00      1.00       591
comp.sys.ibm.pc.hardware       1.00      1.00      1.00       590
   comp.sys.mac.hardware       1.00      1.00      1.00       578
          comp.windows.x       1.00      1.00      1.00       593
            misc.forsale       1.00      1.00      1.00       585
               rec.autos       1.00      1.00      1.00       594
         rec.motorcycles       1.00      1.00      1.00       598
      rec.sport.baseball       1.00      1.00      1.00       597
        rec.sport.hockey       1.00      1.00      1.00       600
               sci.crypt       1.00      1.00      1.00       595
         sci.electronics       1.00      1.00      1.00       591
                 sci.med       1.00      1.00      1.00       

<font size=6> 1.4 Different values for four distinct parameters (lowercase, stop_words, analyzer, max_features) on Naive Bayes</font>


In [64]:
# default
acc = Naive_bayes(twenty_train, twenty_test)

# print
pd.DataFrame(acc, index=['Counts', 'tf', 'tf-idf'])

Unnamed: 0,NB_train,NB_test
Counts,0.924518,0.772836
tf,0.84391,0.705258
tf-idf,0.93265,0.773898


In [151]:
# lowercase
Naive_bayes(twenty_train, twenty_test, lowercase=False)
acc = [item[0] for item in dict_results['NB_train']]
acc = [item[0] for item in dict_results['NB_test']]

# print
df = pd.DataFrame({'Naive Bayes on train': acc, 'Naive Bayes on test': acc}, index=['Counts', 'tf', 'tf-idf'])
print('lowercase (True -> False)')
print(df)


lowercase (True -> False)
        Naive Bayes on train  Naive Bayes on test
Counts              0.766994             0.766994
tf                  0.692512             0.692512
tf-idf              0.768986             0.768986


In [152]:
# stopwords
Naive_bayes(twenty_train, twenty_test, stopwords='english')
acc = [item[0] for item in dict_results['NB_train']]
acc = [item[0] for item in dict_results['NB_test']]

# print
df = pd.DataFrame({'Naive Bayes on train': acc, 'Naive Bayes on test': acc}, index=['Counts', 'tf', 'tf-idf'])
print('stopwords (None -> \'english\')')
print(df)

stopwords (None -> 'english')
        Naive Bayes on train  Naive Bayes on test
Counts              0.802310             0.802310
tf                  0.790361             0.790361
tf-idf              0.816914             0.816914


In [154]:
# analyzer (in combination with ngram_range)
Naive_bayes(twenty_train, twenty_test, analyzer='word', ngram=(1,2))
acc = [item[0] for item in dict_results['NB_train']]
acc = [item[0] for item in dict_results['NB_test']]

# print
df = pd.DataFrame({'Naive Bayes on train': acc, 'Naive Bayes on test': acc}, index=['Counts', 'tf', 'tf-idf'])
print('ngram ( (1,1) -> (1,2) )')
print(df)


ngram ( (1,1) -> (1,2) )
        Naive Bayes on train  Naive Bayes on test
Counts              0.739511             0.739511
tf                  0.692910             0.692910
tf-idf              0.765401             0.765401


In [157]:
# max_features
Naive_bayes(twenty_train, twenty_test, max_features=10000)

acc = [item[0] for item in dict_results['NB_train']]
acc = [item[0] for item in dict_results['NB_test']]

# print
df = pd.DataFrame({'Naive Bayes on train': acc, 'Naive Bayes on test': acc}, index=['Counts', 'tf', 'tf-idf'])
print('max_features ( None -> 10000)')
print(df)

max_features ( None -> 10000)
        Naive Bayes on train  Naive Bayes on test
Counts              0.772438             0.772438
tf                  0.724509             0.724509
tf-idf              0.793548             0.793548
