In [36]:
import numpy as np
import pandas as pd  
import nltk
import gensim 
from gensim.models import Word2Vec
from sklearn.datasets import fetch_20newsgroups 
from sklearn.model_selection import train_test_split
import sklearn

In [14]:
categories = ['talk.politics.guns','rec.sport.baseball'] # We focus on 2 news categories
def get_data():
    data = fetch_20newsgroups(subset='all',
                              shuffle=True,
                              categories=categories,
                              remove=('headers', 'footers', 'quotes'))
    return data

In [15]:
# get text data and their labels
dataset = get_data()

corpus, labels = dataset.data, dataset.target

# split training dataset and testing dataset
train_corpus, test_corpus, train_labels, test_labels = train_test_split(corpus,
                                                                        labels,
                                                                        test_size=0.3)

In [16]:
#bow features
from sklearn.feature_extraction.text import CountVectorizer #tokenizes and counts words

# build bag of words features' vectorizer and get features
bow_vectorizer=CountVectorizer(min_df=1, ngram_range=(1,1))
bow_train_features = bow_vectorizer.fit_transform(train_corpus)
bow_test_features = bow_vectorizer.transform(test_corpus) 

In [17]:
# tfidf features
from sklearn.feature_extraction.text import TfidfVectorizer #alternatively, use TfidfTransformer()

tfidf_vectorizer=TfidfVectorizer(min_df=1, 
                                 norm='l2',
                                 smooth_idf=True,
                                 use_idf=True,
                                 ngram_range=(1,1))
tfidf_train_features = tfidf_vectorizer.fit_transform(train_corpus)  
tfidf_test_features = tfidf_vectorizer.transform(test_corpus)   

In [39]:
# tokenize documents for word2vec
tokenized_train = [nltk.word_tokenize(text)
                   for text in train_corpus]
tokenized_test = [nltk.word_tokenize(text)
                   for text in test_corpus]  

# build word2vec model                   
wv_model = gensim.models.Word2Vec(tokenized_train,
                               vector_size=200, #set the size or dimension for the word vectors. Used to be size = 200
                               window=60,       #specify the length of the window of words taken as context
                               min_count=10)    #ignores all words with total frequency lower than 

def average_word_vectors(words, model, vocabulary, num_features):
    
    feature_vector = np.zeros((num_features,),dtype="float64")
    nwords = 0.
    
    for word in words:
        if word in vocabulary: 
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model.wv[word]) # Used to be model[word]
    
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
        
    return feature_vector 
   

def averaged_word_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index_to_key)
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                    for tokenized_sentence in corpus]
    return np.array(features)

# averaged word vector features from word2vec
avg_wv_train_features = averaged_word_vectorizer(corpus=tokenized_train,
                                                 model=wv_model,
                                                 num_features=200)                   
avg_wv_test_features = averaged_word_vectorizer(corpus=tokenized_test,
                                                model=wv_model,
                                                num_features=200) 

In [40]:
from sklearn import metrics

# define a function to evaluate our classification models based on four metrics
def get_metrics(true_labels, predicted_labels):
    
    print ('Accuracy:', np.round(                                                    
                        metrics.accuracy_score(true_labels, 
                                               predicted_labels),
                        2))
    print ('Precision:', np.round(
                        metrics.precision_score(true_labels, 
                                               predicted_labels),
                        2))
    print ('Recall:', np.round(
                        metrics.recall_score(true_labels, 
                                               predicted_labels),
                        2))
    print ('F1 Score:', np.round(
                        metrics.f1_score(true_labels, 
                                               predicted_labels),
                        2))
                        

In [41]:
# define a function that trains the model, performs predictions and evaluates the predictions
def train_predict_evaluate_model(classifier, 
                                 train_features, train_labels, 
                                 test_features, test_labels):
    # build model    
    classifier.fit(train_features, train_labels)
    # predict using model
    predictions = classifier.predict(test_features) 
    # evaluate model prediction performance   
    get_metrics(true_labels=test_labels, 
                predicted_labels=predictions)
    return predictions    

In [42]:
# train and evaluate {knn} with {bow} feature
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn_bow_predictions = train_predict_evaluate_model(classifier=knn,
                                           train_features=bow_train_features,
                                           train_labels=train_labels,
                                           test_features=bow_test_features,
                                           test_labels=test_labels)

Accuracy: 0.69
Precision: 0.72
Recall: 0.62
F1 Score: 0.66


In [43]:
# build confusion matrix for knn bow model
cm = metrics.confusion_matrix(test_labels, knn_bow_predictions)
pd.DataFrame(cm, index=range(0,2), columns=range(0,2))

Unnamed: 0,0,1
0,226,68
1,107,171


In [44]:
# train and evaluate {knn} with tfidf feature
knn_tfidf_predictions = train_predict_evaluate_model(classifier=knn,
                                           train_features=tfidf_train_features,
                                           train_labels=train_labels,
                                           test_features=tfidf_test_features,
                                           test_labels=test_labels)

Accuracy: 0.53
Precision: 0.53
Recall: 0.32
F1 Score: 0.4


In [45]:
# build confusion matrix for knn tfidf model
cm = metrics.confusion_matrix(test_labels, knn_tfidf_predictions)
pd.DataFrame(cm, index=range(0,2), columns=range(0,2))

Unnamed: 0,0,1
0,213,81
1,188,90


In [46]:
# train and evaluate {knn} with word2vec feature
knn_word2vec_predictions = train_predict_evaluate_model(classifier=knn,
                                           train_features=avg_wv_train_features,
                                           train_labels=train_labels,
                                           test_features=avg_wv_test_features,
                                           test_labels=test_labels)

Accuracy: 0.82
Precision: 0.82
Recall: 0.82
F1 Score: 0.82


In [47]:
# build confusion matrix for knn word2vec model
cm = metrics.confusion_matrix(test_labels, knn_word2vec_predictions)
pd.DataFrame(cm, index=range(0,2), columns=range(0,2))

Unnamed: 0,0,1
0,243,51
1,50,228


In [48]:
# Observe false positive output
class_names = dataset.target_names
print (class_names[0], '->', class_names[1])

rec.sport.baseball -> talk.politics.guns


In [49]:
# Look at some misclassified documents in detail
import re

num = 0
for document, label, predicted_label in zip(test_corpus, test_labels, knn_bow_predictions):
    if label == 0 and predicted_label == 1:
        print ('Actual Label:', class_names[label])
        print ('Predicted Label:', class_names[predicted_label])
        print ('Document:-')
        print (re.sub('\n', ' ', document))
        num += 1
        if num == 4:
            break

Actual Label: rec.sport.baseball
Predicted Label: talk.politics.guns
Document:-
   WHO THINKS THE ASTROS ARE GOING PLACES??? THEY'RE CURRENTLY FIRST PLACE. THEY'RE 5-4, 5-1 ON THE ROAD!  
Actual Label: rec.sport.baseball
Predicted Label: talk.politics.guns
Document:-
Has David Wells landed with a team yet?  I'd think the Tigers with their  anemic pitching would grab this guy pronto!
Actual Label: rec.sport.baseball
Predicted Label: talk.politics.guns
Document:-
     Ted, you're missing a vital point.  As Roger Lustig pointed out in a previous response, the reason why Schott was banned from baseball was because she had been known to call and think in a racially biased manner on a constant basis.  Such thoughts affected her hiring practices.  Bonilla, on the other hand, was found to have mentioned this one word a single time.  If he had been known to go around, criticizing homosexuals, it would be a different story.  Furthermore, he is merely an athlete.  He doesn't have to hire anyone a

In [53]:
sorted(sklearn.neighbors.VALID_METRICS_SPARSE['brute']) # Get the list of avaliable metric

['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan', 'precomputed']

In [61]:
# train and evaluate the improved knn with bow features
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3, weights='distance', metric='cosine') #Changed the metric to cosine
knn_tfidf_predictions = train_predict_evaluate_model(classifier=knn,
                                           train_features=tfidf_train_features,
                                           train_labels=train_labels,
                                           test_features=tfidf_test_features,
                                           test_labels=test_labels)

Accuracy: 0.88
Precision: 0.81
Recall: 0.97
F1 Score: 0.89


In [51]:
# build confusion matrix for knn bow model
cm = metrics.confusion_matrix(test_labels, knn_tfidf_predictions)
pd.DataFrame(cm, index=range(0,2), columns=range(0,2))

Unnamed: 0,0,1
0,232,62
1,8,270


In [65]:
# Other approaches
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

mnb = MultinomialNB()
svm = SGDClassifier(loss='hinge', max_iter=100)

# Multinomial Naive Bayes with bag of words features
print('mnb_bow_predictions')
mnb_bow_predictions = train_predict_evaluate_model(classifier=mnb,
                                           train_features=bow_train_features,
                                           train_labels=train_labels,
                                           test_features=bow_test_features,
                                           test_labels=test_labels)
#Support Vector Machine with bag of words features'
print('\nsvm_bow_predictions')
svm_bow_predictions = train_predict_evaluate_model(classifier=svm,
                                           train_features=bow_train_features,
                                           train_labels=train_labels,
                                           test_features=bow_test_features,
                                           test_labels=test_labels)

# Support Vector Machine with tfidf features
print('\nsvm_tfidf_predictions')
svm_tfidf_predictions = train_predict_evaluate_model(classifier=svm,
                                           train_features=tfidf_train_features,
                                           train_labels=train_labels,
                                           test_features=tfidf_test_features,
                                           test_labels=test_labels)

# Support Vector Machine with averaged word vector features
print('\nsvm_avgwv_predictions')
svm_avgwv_predictions = train_predict_evaluate_model(classifier=svm,
                                           train_features=avg_wv_train_features,
                                           train_labels=train_labels,
                                           test_features=avg_wv_test_features,
                                           test_labels=test_labels)

mnb_bow_predictions
Accuracy: 0.95
Precision: 0.95
Recall: 0.95
F1 Score: 0.95

svm_bow_predictions
Accuracy: 0.91
Precision: 0.91
Recall: 0.9
F1 Score: 0.9

svm_tfidf_predictions
Accuracy: 0.92
Precision: 0.95
Recall: 0.88
F1 Score: 0.92

svm_avgwv_predictions
Accuracy: 0.81
Precision: 0.76
Recall: 0.9
F1 Score: 0.83


In [66]:
# build confusion matrix for SVM TF-IDF-based model
cm = metrics.confusion_matrix(test_labels, svm_tfidf_predictions)
pd.DataFrame(cm, index=range(0,2), columns=range(0,2))  

# Observe false positive output
class_names = dataset.target_names
print (class_names[0], '->', class_names[1])

# Look at some misclassified documents in detail
import re

num = 0
for document, label, predicted_label in zip(test_corpus, test_labels, svm_tfidf_predictions):
    if label == 0 and predicted_label == 1:
        print ('Actual Label:', class_names[label])
        print ('Predicted Label:', class_names[predicted_label])
        print ('Document:-')
        print (re.sub('\n', ' ', document))
        num += 1
        if num == 4:
            break

rec.sport.baseball -> talk.politics.guns
Actual Label: rec.sport.baseball
Predicted Label: talk.politics.guns
Document:-
 I DID NOT WRITE THAT!  In fact, those statements were a rebuttal to an earlier posting that I made, and this was culled from my *strong* rebuttal to those statements.  PLEASE!  Slander.  Shame.     "after the game, it's no big deal" ????   After the employees leave the workplace, it doesn't matter what they say about the boss or the company?  Puhlease.     First, it's Ross Porter.  Second, I am really tired of seeing the kind of response that indicates that all I do is parrot what some media person says or writes.  I have a brain.  If I choose to characterize something in a certain fashion, it's because that is what I believe to be accurate.  It is not just because some unnamed "mediot" made the characterization.     I did *not* brand Darryl's response as petulant, because I never heard any response from Darryl.  I did call him a name.  I referred to him  as a prima