# Text Classification

In [95]:
import numpy as np
import pandas as pd  
import nltk
from nltk.corpus import stopwords
import gensim 
from sklearn.datasets import fetch_20newsgroups 
from sklearn.model_selection import train_test_split

# Get the data and take a look

In [96]:
# The two newsgroups this notebook classifies between.
categories = [
    'talk.politics.guns',
    'rec.sport.baseball',
]


def get_data():
    """Fetch the 20 Newsgroups posts for the module-level ``categories``.

    Requests the ``'all'`` subset, shuffled, and asks scikit-learn to strip
    headers, footers and quoted text from every post.

    Returns:
        The object returned by ``fetch_20newsgroups`` for those arguments.
    """
    return fetch_20newsgroups(
        subset='all',
        shuffle=True,
        categories=categories,
        remove=('headers', 'footers', 'quotes'),
    )

In [97]:
# get text data and their labels
dataset = get_data()

corpus, labels = dataset.data, dataset.target

# split training dataset and testing dataset (30% of the documents are held out).
# random_state is pinned so that Restart Kernel -> Run All yields the same
# split, and therefore the same metrics, on every run.
train_corpus, test_corpus, train_labels, test_labels = train_test_split(
    corpus,
    labels,
    test_size=0.3,
    random_state=42,
)

# Prepare features for ML 
### {bow, tfidf, word2vec}

In [98]:
# Bag-of-words features
from sklearn.feature_extraction.text import CountVectorizer  # tokenizes and counts words

# Unigram counts with English stop words removed. The vectorizer is fitted on
# the training split only and then reused, unchanged, to encode the test split.
bow_vectorizer = CountVectorizer(
    min_df=1,
    ngram_range=(1, 1),
    stop_words='english',
)
bow_train_features = bow_vectorizer.fit_transform(train_corpus)
bow_test_features = bow_vectorizer.transform(test_corpus)

In [99]:
# TF-IDF features
from sklearn.feature_extraction.text import TfidfVectorizer  # alternatively, use TfidfTransformer()

# Same unigram / English stop-word setup as the bag-of-words cell, but with
# smoothed IDF weighting and L2-normalised rows. Fitted on the training split
# only; the test split is encoded with the already-fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer(
    min_df=1,
    norm='l2',
    smooth_idf=True,
    use_idf=True,
    ngram_range=(1, 1),
    stop_words='english',
)
tfidf_train_features = tfidf_vectorizer.fit_transform(train_corpus)
tfidf_test_features = tfidf_vectorizer.transform(test_corpus)

In [100]:
# Split every document into a list of word tokens with NLTK; the token lists
# feed both Word2Vec training and the averaged word-vector features.
tokenized_train = list(map(nltk.word_tokenize, train_corpus))
tokenized_test = list(map(nltk.word_tokenize, test_corpus))

In [101]:
# Train the word2vec model on the tokenized training documents only.
wv_model = gensim.models.Word2Vec(
    tokenized_train,
    size=200,      # dimensionality of each word vector
    window=60,     # length of the window of words taken as context
    min_count=10,  # ignore all words whose total frequency is lower than this
)

In [102]:
def average_word_vectors(words, model, vocabulary, num_features):
    """Average the word vectors of the usable tokens in one document.

    A token is used only if it is purely alphabetic, its lowercase form is not
    in NLTK's English stop-word list, and its lowercase form is in
    ``vocabulary``.

    Args:
        words: Iterable of string tokens for a single document.
        model: Trained word-vector model. Vectors are looked up through
            ``model.wv`` when that attribute exists, otherwise by indexing
            ``model`` directly.
        vocabulary: Container of words for which ``model`` has a vector.
        num_features: Length of the returned vector.

    Returns:
        A float64 array holding the mean vector of the usable tokens, or an
        all-zero array of length ``num_features`` when no token is usable.
    """
    # Indexing the Word2Vec object itself (`model[word]`) is deprecated in
    # gensim and emits a warning on every lookup; the vectors live on `.wv`.
    # The fallback keeps callers that pass a plain word -> vector mapping working.
    vectors = getattr(model, 'wv', model)

    # A set gives constant-time membership tests; the raw NLTK list is scanned
    # linearly for every token.
    stop_words = set(stopwords.words('english'))

    feature_vector = np.zeros((num_features,), dtype="float64")
    nwords = 0

    for token in words:
        if not token.isalpha():
            continue
        word = token.lower()
        if word in stop_words or word not in vocabulary:
            continue
        feature_vector = np.add(feature_vector, vectors[word])
        nwords += 1

    # Leave the zero vector untouched when nothing matched (avoids 0/0).
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)

    return feature_vector
   

def averaged_word_vectorizer(corpus, model, num_features):
    """Turn tokenized documents into averaged word-vector features.

    Args:
        corpus: Iterable of tokenized documents (each an iterable of tokens).
        model: Trained model exposing ``model.wv.index2word``.
        num_features: Length of each document vector.

    Returns:
        ``np.array`` built from one ``average_word_vectors`` result per
        document, in corpus order.
    """
    # Build the vocabulary set once so each per-token membership test is cheap.
    known_words = set(model.wv.index2word)

    rows = []
    for tokens in corpus:
        rows.append(average_word_vectors(tokens, model, known_words, num_features))
    return np.array(rows)

In [103]:
# Averaged word-vector features from word2vec. num_features (200) matches the
# `size` the Word2Vec model was trained with.
avg_wv_train_features = averaged_word_vectorizer(
    tokenized_train,
    wv_model,
    200,
)
avg_wv_test_features = averaged_word_vectorizer(
    tokenized_test,
    wv_model,
    200,
)

  if sys.path[0] == '':


# Define metrics for evaluation

In [104]:
from sklearn import metrics

# define a function to evaluate our classification models based on four metrics
def get_metrics(true_labels, predicted_labels):
    """Print accuracy, precision, recall and F1 for a set of predictions.

    Each score comes from the matching ``sklearn.metrics`` function called with
    its default arguments, and is rounded to two decimals before printing.

    Args:
        true_labels: Ground-truth labels.
        predicted_labels: Labels predicted by a classifier, aligned with
            ``true_labels``.

    Returns:
        None. The scores are only printed.
    """
    scorers = (
        ('Accuracy:', metrics.accuracy_score),
        ('Precision:', metrics.precision_score),
        ('Recall:', metrics.recall_score),
        ('F1 Score:', metrics.f1_score),
    )
    for label, scorer in scorers:
        score = scorer(true_labels, predicted_labels)
        print(label, np.round(score, 2))

# Define how to train and evaluate classifier

In [105]:
# define a function that trains the model, performs predictions and evaluates the predictions
def train_predict_evaluate_model(classifier,
                                 train_features, train_labels,
                                 test_features, test_labels):
    """Fit a classifier, predict on the test features and print its metrics.

    Args:
        classifier: Object exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place, so the caller's instance is modified.
        train_features: Feature matrix used for fitting.
        train_labels: Labels aligned with ``train_features``.
        test_features: Feature matrix to predict on.
        test_labels: Ground-truth labels aligned with ``test_features``.

    Returns:
        The predictions returned by ``classifier.predict(test_features)``.
    """
    classifier.fit(train_features, train_labels)
    predicted = classifier.predict(test_features)

    # Report the metrics before handing the raw predictions back.
    get_metrics(test_labels, predicted)
    return predicted

# Train and evaluate Decision Tree with {BoW, tfidf, word2vec} features

In [106]:
# import and build model
from sklearn import tree

# random_state is pinned so the fitted tree, and the metrics it produces, are
# the same on every run of the notebook.
clf = tree.DecisionTreeClassifier(random_state=42)

### Decision Tree with BoW feature

In [107]:
# Fit the decision tree on bag-of-words counts and score it on the test split.
clf_bow_predictions = train_predict_evaluate_model(
    clf,
    bow_train_features, train_labels,
    bow_test_features, test_labels,
)

Accuracy: 0.87
Precision: 0.88
Recall: 0.8
F1 Score: 0.84


### Decision Tree with tfidf feature

In [108]:
# Refit the same `clf` instance on TF-IDF features and score it on the test split.
clf_tfidf_predictions = train_predict_evaluate_model(
    clf,
    tfidf_train_features, train_labels,
    tfidf_test_features, test_labels,
)

Accuracy: 0.86
Precision: 0.87
Recall: 0.79
F1 Score: 0.83


### Decision Tree with word2vec feature

In [113]:
# Refit the same `clf` instance on averaged word2vec features and score it on the test split.
clf_avgwv_predictions = train_predict_evaluate_model(
    clf,
    avg_wv_train_features, train_labels,
    avg_wv_test_features, test_labels,
)

Accuracy: 0.85
Precision: 0.83
Recall: 0.83
F1 Score: 0.83


# Train and evaluate MNB with {BoW, tfidf} features

In [114]:
from sklearn.naive_bayes import MultinomialNB

# One Multinomial Naive Bayes instance, constructed with default settings.
mnb = MultinomialNB()

# Multinomial Naive Bayes with bag of words features
mnb_bow_predictions = train_predict_evaluate_model(
    mnb,
    bow_train_features, train_labels,
    bow_test_features, test_labels,
)

Accuracy: 0.97
Precision: 0.96
Recall: 0.96
F1 Score: 0.96


In [115]:

# Multinomial Naive Bayes with TF-IDF features (refits the same `mnb` instance).
mnb_tfidf_predictions = train_predict_evaluate_model(
    mnb,
    tfidf_train_features, train_labels,
    tfidf_test_features, test_labels,
)

Accuracy: 0.96
Precision: 0.96
Recall: 0.95
F1 Score: 0.95


# Train and evaluate SVM with {BoW, tfidf, word2vec} features

### BoW

In [116]:
from sklearn.linear_model import SGDClassifier

# Linear SVM trained with stochastic gradient descent (hinge loss).
# random_state is pinned so the optimisation, and therefore the reported
# metrics, is the same on every run of the notebook.
svm = SGDClassifier(loss='hinge', max_iter=100, random_state=42)

# SVM with bag of words features
svm_bow_predictions = train_predict_evaluate_model(classifier=svm,
                                           train_features=bow_train_features,
                                           train_labels=train_labels,
                                           test_features=bow_test_features,
                                           test_labels=test_labels)

Accuracy: 0.92
Precision: 0.93
Recall: 0.87
F1 Score: 0.9


### tfidf

In [117]:
# SVM with TF-IDF features (refits the same `svm` instance).
svm_tfidf_predictions = train_predict_evaluate_model(
    svm,
    tfidf_train_features, train_labels,
    tfidf_test_features, test_labels,
)

Accuracy: 0.94
Precision: 0.96
Recall: 0.9
F1 Score: 0.93


### word2vec

In [118]:
# SVM with averaged word2vec features (refits the same `svm` instance).
svm_avgwv_predictions = train_predict_evaluate_model(
    svm,
    avg_wv_train_features, train_labels,
    avg_wv_test_features, test_labels,
)

Accuracy: 0.87
Precision: 0.82
Recall: 0.91
F1 Score: 0.86
