# Text Classification

In [1]:
import numpy as np
import pandas as pd  
import nltk
from nltk.corpus import stopwords
import gensim 
from sklearn.datasets import fetch_20newsgroups 
from sklearn.model_selection import train_test_split

# Get the data and take a look

In [2]:
# Restrict the corpus to two newsgroup categories
categories = ['talk.politics.guns', 'rec.sport.baseball']


def get_data():
    """Fetch the 20 Newsgroups posts belonging to ``categories``.

    Requests the whole dataset (``subset='all'``), shuffled, with headers,
    footers and quoted replies removed from every post.

    Returns:
        The object returned by ``fetch_20newsgroups``; the notebook reads
        its ``data`` and ``target`` attributes.
    """
    return fetch_20newsgroups(
        subset='all',
        shuffle=True,
        categories=categories,
        remove=('headers', 'footers', 'quotes'),
    )

In [3]:
# get text data and their labels
dataset = get_data()

corpus, labels = dataset.data, dataset.target

# split training dataset and testing dataset (30% held out for testing).
# random_state pins the shuffle so the same documents land in the same
# partition on every Restart Kernel -> Run All.
train_corpus, test_corpus, train_labels, test_labels = train_test_split(
    corpus,
    labels,
    test_size=0.3,
    random_state=42,
)

# Prepare features for ML 
### {word2vec}

In [4]:
# tokenize documents for word2vec: one list of NLTK tokens per document
tokenized_train = list(map(nltk.word_tokenize, train_corpus))
tokenized_test = list(map(nltk.word_tokenize, test_corpus))

In [5]:
# build word2vec model on the tokenized training documents
#   size      -> size (dimension) of the word vectors
#   window    -> length of the window of words taken as context
#   min_count -> ignore all words with total frequency lower than this
# NOTE(review): `size` looks like the pre-4.0 gensim keyword (renamed to
# `vector_size` in gensim 4) -- confirm the gensim version this notebook pins.
wv_model = gensim.models.Word2Vec(
    tokenized_train,
    size=200,
    window=60,
    min_count=10,
)

In [7]:
def average_word_vectors(words, model, vocabulary, num_features):
    """Average the word2vec vectors of the noun tokens of one document.

    The tokens are filtered and normalised in this order:

    1. keep purely alphabetic tokens that are not English stopwords,
       lowercased;
    2. Porter-stem every remaining token;
    3. POS-tag the stems and keep those tagged ``NN`` or ``NNP``.

    Every surviving stem that is found in ``vocabulary`` contributes its
    vector; the result is the arithmetic mean of those vectors.

    Args:
        words: Iterable of string tokens for a single document.
        model: Trained word2vec model. Vectors are read from ``model.wv``
            when that attribute exists, otherwise from ``model`` itself.
        vocabulary: Container of the words known to ``model``; only used
            for membership tests.
        num_features: Dimensionality of the word vectors.

    Returns:
        1-D ``float64`` array of length ``num_features``. It is all zeros
        when no token survives the filtering or none is in ``vocabulary``.
    """
    # A set gives constant-time membership tests; the plain NLTK list would
    # be scanned linearly for every single token.
    stop_words = set(stopwords.words('english'))
    kept = [w.lower() for w in words
            if w.isalpha() and w.lower() not in stop_words]

    # Use Porter Stemmer
    stemmer = nltk.PorterStemmer()
    stems = [stemmer.stem(w) for w in kept]

    # Use POS tags: keep nouns only (NN, NNP)
    nouns = [word for word, tag in nltk.pos_tag(stems)
             if tag in ('NN', 'NNP')]

    # NOTE(review): lookups use lowercased stems. If the model was trained on
    # raw, unstemmed tokens (as the tokenization cell of this notebook
    # suggests), stems that differ from the surface form are silently dropped
    # by the vocabulary check -- confirm this is intended.
    in_vocab = [word for word in nouns if word in vocabulary]

    feature_vector = np.zeros((num_features,), dtype="float64")
    if not in_vocab:
        return feature_vector

    # Index through the keyed vectors rather than the model object itself:
    # `model[word]` is deprecated in gensim 3.x and removed in gensim 4.
    vectors = getattr(model, 'wv', model)
    for word in in_vocab:
        feature_vector += vectors[word]

    return feature_vector / len(in_vocab)
   

def averaged_word_vectorizer(corpus, model, num_features):
    """Build one averaged word vector per tokenized document.

    Args:
        corpus: Iterable of tokenized documents (each an iterable of tokens).
        model: Trained word2vec model exposing ``model.wv.index2word``.
        num_features: Dimensionality of the word vectors.

    Returns:
        ``np.ndarray`` built from the per-document vectors returned by
        ``average_word_vectors``, in corpus order.
    """
    # NOTE(review): `index2word` looks like the pre-4.0 gensim attribute
    # name -- confirm against the pinned gensim version.
    known_words = set(model.wv.index2word)

    rows = []
    for tokens in corpus:
        rows.append(
            average_word_vectors(tokens, model, known_words, num_features))
    return np.array(rows)

In [8]:
# averaged word vector features from word2vec
# The feature width is read from the trained model instead of repeating the
# literal used at training time, so the two can never drift apart.
avg_wv_train_features = averaged_word_vectorizer(
    corpus=tokenized_train,
    model=wv_model,
    num_features=wv_model.vector_size,
)
avg_wv_test_features = averaged_word_vectorizer(
    corpus=tokenized_test,
    model=wv_model,
    num_features=wv_model.vector_size,
)



# Define metrics for evaluation

In [9]:
from sklearn import metrics

# define a function to evaluate our classification models based on four metrics
def get_metrics(true_labels, predicted_labels):
    """Print accuracy, precision, recall and F1 score for the predictions.

    Each score is computed with the matching ``sklearn.metrics`` function,
    rounded to two decimals and printed on its own line. Nothing is returned.

    Args:
        true_labels: Ground-truth labels.
        predicted_labels: Labels predicted by the classifier.
    """
    scorers = (
        ('Accuracy:', metrics.accuracy_score),
        ('Precision:', metrics.precision_score),
        ('Recall:', metrics.recall_score),
        ('F1 Score:', metrics.f1_score),
    )
    for caption, scorer in scorers:
        score = scorer(true_labels, predicted_labels)
        print(caption, np.round(score, 2))

# Define how to train and evaluate classifier

In [10]:
# define a function that trains the model, performs predictions and evaluates the predictions
def train_predict_evaluate_model(classifier,
                                 train_features, train_labels,
                                 test_features, test_labels):
    """Fit ``classifier``, predict the test set and print its metrics.

    Args:
        classifier: Estimator exposing ``fit`` and ``predict``.
        train_features: Feature matrix used for fitting.
        train_labels: Labels used for fitting.
        test_features: Feature matrix to predict on.
        test_labels: Ground-truth labels the predictions are scored against.

    Returns:
        The predictions produced by ``classifier.predict(test_features)``.
    """
    # train on the training partition
    classifier.fit(train_features, train_labels)

    # predict the held-out partition and report how well it went
    predicted = classifier.predict(test_features)
    get_metrics(test_labels, predicted)

    return predicted

# Decision Tree with {word2vec} features

In [16]:
# import and build model
from sklearn import tree

# random_state makes the fitted tree reproducible: without it the estimator
# draws its own randomness, so repeated runs can report different scores.
clf = tree.DecisionTreeClassifier(random_state=42)

In [17]:
# fit the decision tree on the averaged word2vec features and print test metrics
clf_avgwv_predictions = train_predict_evaluate_model(
    classifier=clf,
    train_features=avg_wv_train_features,
    train_labels=train_labels,
    test_features=avg_wv_test_features,
    test_labels=test_labels,
)

Accuracy: 0.85
Precision: 0.84
Recall: 0.81
F1 Score: 0.82


# SVM with {word2vec} features

In [15]:
from sklearn.linear_model import SGDClassifier

# Linear SVM (hinge loss) trained with stochastic gradient descent.
# random_state pins the estimator's randomness so the printed metrics are
# reproducible from one run to the next.
svm = SGDClassifier(loss='hinge', max_iter=100, random_state=42)

# fit on the averaged word2vec features and print test metrics
svm_avgwv_predictions = train_predict_evaluate_model(
    classifier=svm,
    train_features=avg_wv_train_features,
    train_labels=train_labels,
    test_features=avg_wv_test_features,
    test_labels=test_labels,
)

Accuracy: 0.81
Precision: 0.74
Recall: 0.91
F1 Score: 0.81
