# Data Augmentation with Back-Translation and Thesaurus

## Experiments on the 4 categories of the 20Newsgroups Dataset

## NB and SVM - with BOW, TFIDF, averaged word2vec, TFIDF weighted averaged word2vec

Define the extraction functions:

In [44]:
from sklearn.feature_extraction.text import CountVectorizer
#
def bow_extractor(corpus, ngram_range=(1,1)):
    """Fit a bag-of-words model on `corpus`.

    Args:
        corpus: iterable of raw text documents.
        ngram_range: (min_n, max_n) passed through to CountVectorizer.

    Returns:
        A tuple (fitted CountVectorizer, count matrix computed for `corpus`).
    """
    bow_model = CountVectorizer(ngram_range=ngram_range, min_df=1)
    counts = bow_model.fit_transform(corpus)
    return bow_model, counts
    
    
from sklearn.feature_extraction.text import TfidfTransformer
#
def tfidf_transformer(bow_matrix):
    """Re-weight an existing bag-of-words count matrix with TF-IDF.

    Args:
        bow_matrix: term-count matrix, e.g. the output of `bow_extractor`.

    Returns:
        A tuple (fitted TfidfTransformer, TF-IDF matrix for `bow_matrix`).
    """
    options = {"norm": "l2", "smooth_idf": True, "use_idf": True}
    weighting = TfidfTransformer(**options)
    weighted = weighting.fit_transform(bow_matrix)
    return weighting, weighted
    
    
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_extractor(corpus, ngram_range=(1,1)):
    """Fit a TF-IDF model directly on raw text documents.

    Args:
        corpus: iterable of raw text documents.
        ngram_range: (min_n, max_n) passed through to TfidfVectorizer.

    Returns:
        A tuple (fitted TfidfVectorizer, TF-IDF matrix computed for `corpus`).
    """
    options = {
        "min_df": 1,
        "norm": "l2",
        "smooth_idf": True,
        "use_idf": True,
        "ngram_range": ngram_range,
    }
    tfidf_model = TfidfVectorizer(**options)
    weighted = tfidf_model.fit_transform(corpus)
    return tfidf_model, weighted
    

import numpy as np    
    
def average_word_vectors(words, model, vocabulary, num_features):
    """Average the embeddings of the in-vocabulary tokens of one document.

    Args:
        words: iterable of tokens making up the document.
        model: a gensim Word2Vec model (its `wv` attribute is used when
            present) or any mapping-like object supporting `model[word]`,
            such as KeyedVectors or a plain dict of arrays.
        vocabulary: container of the tokens that have a vector; tokens not
            in it are skipped.
        num_features: dimensionality of the word vectors.

    Returns:
        A float64 array of shape (num_features,) holding the mean vector,
        or all zeros when no token of `words` is in `vocabulary`.
    """
    # Indexing a Word2Vec model directly (`model[word]`) is deprecated in
    # gensim 3.x and removed in 4.x; the vectors live on `model.wv`.
    # Objects without a `wv` attribute are indexed directly, as before.
    vectors = getattr(model, "wv", model)

    feature_vector = np.zeros((num_features,), dtype="float64")
    nwords = 0

    for word in words:
        if word in vocabulary:
            nwords += 1
            feature_vector = np.add(feature_vector, vectors[word])

    # Guard against division by zero for documents with no known token.
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)

    return feature_vector
    
#   
def averaged_word_vectorizer(corpus, model, num_features):
    """Turn each tokenized document into the mean of its word vectors.

    Args:
        corpus: iterable of token lists, one per document.
        model: a gensim Word2Vec model, or a KeyedVectors-like object that
            exposes its vocabulary as `index_to_key` or `index2word`.
        num_features: dimensionality of the word vectors.

    Returns:
        A 2-D array with one row per document (num_features columns).
    """
    keyed_vectors = getattr(model, "wv", model)
    # gensim 4 renamed `index2word` to `index_to_key`; accept either so the
    # function works across gensim versions.
    known_words = getattr(keyed_vectors, "index_to_key", None)
    if known_words is None:
        known_words = keyed_vectors.index2word
    vocabulary = set(known_words)

    # Pass the keyed vectors (not the outer model) so the per-document helper
    # can index them directly regardless of gensim version.
    features = [average_word_vectors(tokenized_sentence, keyed_vectors,
                                     vocabulary, num_features)
                for tokenized_sentence in corpus]
    return np.array(features)
    
    
def tfidf_wtd_avg_word_vectors(words, tfidf_vector, tfidf_vocabulary, model, num_features):
    """Compute the TF-IDF weighted average word vector of one document.

    Args:
        words: iterable of tokens making up the document.
        tfidf_vector: single-row 2-D TF-IDF structure for this document,
            indexable as `tfidf_vector[0, column]`.
        tfidf_vocabulary: mapping token -> TF-IDF column index.
        model: a gensim Word2Vec model, or a KeyedVectors-like object that
            exposes its vocabulary as `index_to_key` or `index2word`.
        num_features: dimensionality of the word vectors.

    Returns:
        A float64 array of shape (num_features,): the sum of
        `tfidf * vector` over tokens known to the embedding model, divided
        by the sum of those TF-IDF weights. All zeros when the total weight
        is zero.
    """
    # NOTE(review): vocabulary lookups are case-sensitive. If
    # `tfidf_vocabulary` comes from a lowercasing vectorizer while `words`
    # keep their original case, capitalized tokens get weight 0 - confirm
    # this is intended.
    keyed_vectors = getattr(model, "wv", model)
    # gensim 4 renamed `index2word` to `index_to_key`; accept either.
    known_words = getattr(keyed_vectors, "index_to_key", None)
    if known_words is None:
        known_words = keyed_vectors.index2word
    vocabulary = set(known_words)

    feature_vector = np.zeros((num_features,), dtype="float64")
    wts = 0.
    for word in words:
        if word not in vocabulary:
            continue
        column = tfidf_vocabulary.get(word)
        # Compare against None rather than relying on truthiness: column
        # index 0 is a valid entry and must not be treated as "missing".
        weight = tfidf_vector[0, column] if column is not None else 0
        feature_vector = np.add(feature_vector, weight * keyed_vectors[word])
        wts = wts + weight

    # Guard against division by zero when no token carried any weight.
    if wts:
        feature_vector = np.divide(feature_vector, wts)

    return feature_vector

#
def tfidf_weighted_averaged_word_vectorizer(corpus, tfidf_vectors, 
                                   tfidf_vocabulary, model, num_features):
    """Build TF-IDF weighted average word vectors for a whole corpus.

    Documents in `corpus` are paired positionally with the entries obtained
    by iterating `tfidf_vectors`; each pair is handed to
    `tfidf_wtd_avg_word_vectors`.

    Returns:
        An array with one feature vector per paired document.
    """
    rows = []
    for tokens, tfidf_row in zip(corpus, tfidf_vectors):
        rows.append(
            tfidf_wtd_avg_word_vectors(tokens, tfidf_row, tfidf_vocabulary,
                                       model, num_features)
        )
    return np.array(rows)

## Experiments on the original (not augmented) Dataset:

100 original documents

In [45]:
from sklearn.datasets import fetch_20newsgroups
import nltk
import gensim

# The four newsgroups used in this experiment.
categories = ['alt.atheism', 'comp.graphics','sci.med','talk.religion.misc']

# Training set: the first 100 documents of the shuffled train split.
twenty_train = fetch_20newsgroups(subset='train', categories=categories,
                                  shuffle=True, random_state=42)
twenty_train.data = twenty_train.data[:100]
twenty_train.target = twenty_train.target[:100]

# Test set: the complete test split for the same categories.
twenty_test = fetch_20newsgroups(subset='test', categories=categories,
                                 shuffle=True, random_state=42)

# bag-of-words features (vectorizer fitted on the training documents only)
bow_vectorizer, bow_train_features = bow_extractor(twenty_train.data)
bow_test_features = bow_vectorizer.transform(twenty_test.data)

# tfidf features (vectorizer fitted on the training documents only)
tfidf_vectorizer, tfidf_train_features = tfidf_extractor(twenty_train.data)
tfidf_test_features = tfidf_vectorizer.transform(twenty_test.data)

# tokenize documents
tokenized_train = list(map(nltk.word_tokenize, twenty_train.data))
tokenized_test = list(map(nltk.word_tokenize, twenty_test.data))

# build word2vec model on the training tokens
# NOTE(review): `size` was renamed `vector_size` in gensim 4 - confirm the
# installed gensim version before re-running.
model = gensim.models.Word2Vec(tokenized_train, size=500, window=100,
                               min_count=30, sample=1e-3)

# averaged word vector features
avg_wv_train_features = averaged_word_vectorizer(tokenized_train, model, 500)
avg_wv_test_features = averaged_word_vectorizer(tokenized_test, model, 500)

# tfidf weighted averaged word vector features
vocab = tfidf_vectorizer.vocabulary_
tfidf_wv_train_features = tfidf_weighted_averaged_word_vectorizer(
    tokenized_train, tfidf_train_features, vocab, model, 500)
tfidf_wv_test_features = tfidf_weighted_averaged_word_vectorizer(
    tokenized_test, tfidf_test_features, vocab, model, 500)


# metrics

def train_predict_evaluate_model(classifier, train_features, train_labels, 
                                 test_features, test_labels):
    """Fit `classifier`, predict on the test features and print the accuracy.

    Args:
        classifier: object exposing `fit(X, y)` and `predict(X)`.
        train_features, train_labels: data used for fitting.
        test_features, test_labels: data used for evaluation.

    Returns:
        The predictions made for `test_features`.
    """
    classifier.fit(train_features, train_labels)
    predicted = classifier.predict(test_features)
    # Fraction of test items whose predicted label equals the true label.
    hits = predicted == test_labels
    print(np.mean(hits))
    return predicted






In [46]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

mnb = MultinomialNB()
# NOTE(review): `n_iter` was replaced by `max_iter` in later scikit-learn
# releases - confirm the installed version before re-running.
svm = SGDClassifier(loss='hinge', n_iter=100)

train_labels = twenty_train.target
test_labels = twenty_test.target

# Each run below prints a title; train_predict_evaluate_model then fits the
# classifier on the given training features and prints the test accuracy.

# Multinomial Naive Bayes with bag of words features
print("Multinomial Naive Bayes with bag of words features:")
mnb_bow_predictions = train_predict_evaluate_model(
    mnb, bow_train_features, train_labels, bow_test_features, test_labels)

# Support Vector Machine with bag of words features
print("Support Vector Machine with bag of words features:")
svm_bow_predictions = train_predict_evaluate_model(
    svm, bow_train_features, train_labels, bow_test_features, test_labels)

# Multinomial Naive Bayes with tfidf features
print("Multinomial Naive Bayes with tfidf features :")
mnb_tfidf_predictions = train_predict_evaluate_model(
    mnb, tfidf_train_features, train_labels, tfidf_test_features, test_labels)

# Support Vector Machine with tfidf features
print("Support Vector Machine with tfidf features:")
svm_tfidf_predictions = train_predict_evaluate_model(
    svm, tfidf_train_features, train_labels, tfidf_test_features, test_labels)

# Support Vector Machine with averaged word vector features
print("Support Vector Machine with averaged word vector features:")
svm_avgwv_predictions = train_predict_evaluate_model(
    svm, avg_wv_train_features, train_labels, avg_wv_test_features, test_labels)

# Support Vector Machine with tfidf weighted averaged word vector features
print("Support Vector Machine with tfidf weighted averaged word vector features:")
svm_tfidfwv_predictions = train_predict_evaluate_model(
    svm, tfidf_wv_train_features, train_labels, tfidf_wv_test_features,
    test_labels)
print("DONE")


Multinomial Naive Bayes with bag of words features:
0.720295202952
Support Vector Machine with bag of words features:
0.583763837638
Multinomial Naive Bayes with tfidf features :
0.431734317343
Support Vector Machine with tfidf features:
0.693726937269
Support Vector Machine with averaged word vector features:
0.329151291513
Support Vector Machine with tfidf weighted averaged word vector features:
0.285608856089
DONE




200 original documents:

In [48]:
from sklearn.datasets import fetch_20newsgroups
import nltk
import gensim

# The four newsgroups used in this experiment.
categories = ['alt.atheism', 'comp.graphics','sci.med','talk.religion.misc']

# Training set: the first 200 documents of the shuffled train split.
twenty_train = fetch_20newsgroups(subset='train', categories=categories,
                                  shuffle=True, random_state=42)
twenty_train.data = twenty_train.data[:200]
twenty_train.target = twenty_train.target[:200]

# Test set: the complete test split for the same categories.
twenty_test = fetch_20newsgroups(subset='test', categories=categories,
                                 shuffle=True, random_state=42)

# bag-of-words features (vectorizer fitted on the training documents only)
bow_vectorizer, bow_train_features = bow_extractor(twenty_train.data)
bow_test_features = bow_vectorizer.transform(twenty_test.data)

# tfidf features (vectorizer fitted on the training documents only)
tfidf_vectorizer, tfidf_train_features = tfidf_extractor(twenty_train.data)
tfidf_test_features = tfidf_vectorizer.transform(twenty_test.data)

# tokenize documents
tokenized_train = list(map(nltk.word_tokenize, twenty_train.data))
tokenized_test = list(map(nltk.word_tokenize, twenty_test.data))

# build word2vec model on the training tokens
# NOTE(review): `size` was renamed `vector_size` in gensim 4 - confirm the
# installed gensim version before re-running.
model = gensim.models.Word2Vec(tokenized_train, size=500, window=100,
                               min_count=30, sample=1e-3)

# averaged word vector features
avg_wv_train_features = averaged_word_vectorizer(tokenized_train, model, 500)
avg_wv_test_features = averaged_word_vectorizer(tokenized_test, model, 500)

# tfidf weighted averaged word vector features
vocab = tfidf_vectorizer.vocabulary_
tfidf_wv_train_features = tfidf_weighted_averaged_word_vectorizer(
    tokenized_train, tfidf_train_features, vocab, model, 500)
tfidf_wv_test_features = tfidf_weighted_averaged_word_vectorizer(
    tokenized_test, tfidf_test_features, vocab, model, 500)
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

mnb = MultinomialNB()
# NOTE(review): `n_iter` was replaced by `max_iter` in later scikit-learn
# releases - confirm the installed version before re-running.
svm = SGDClassifier(loss='hinge', n_iter=100)

train_labels = twenty_train.target
test_labels = twenty_test.target

# Each run below prints a title; train_predict_evaluate_model then fits the
# classifier on the given training features and prints the test accuracy.

# Multinomial Naive Bayes with bag of words features
print("Multinomial Naive Bayes with bag of words features:")
mnb_bow_predictions = train_predict_evaluate_model(
    mnb, bow_train_features, train_labels, bow_test_features, test_labels)

# Support Vector Machine with bag of words features
print("Support Vector Machine with bag of words features:")
svm_bow_predictions = train_predict_evaluate_model(
    svm, bow_train_features, train_labels, bow_test_features, test_labels)

# Multinomial Naive Bayes with tfidf features
print("Multinomial Naive Bayes with tfidf features :")
mnb_tfidf_predictions = train_predict_evaluate_model(
    mnb, tfidf_train_features, train_labels, tfidf_test_features, test_labels)

# Support Vector Machine with tfidf features
print("Support Vector Machine with tfidf features:")
svm_tfidf_predictions = train_predict_evaluate_model(
    svm, tfidf_train_features, train_labels, tfidf_test_features, test_labels)

# Support Vector Machine with averaged word vector features
print("Support Vector Machine with averaged word vector features:")
svm_avgwv_predictions = train_predict_evaluate_model(
    svm, avg_wv_train_features, train_labels, avg_wv_test_features, test_labels)

# Support Vector Machine with tfidf weighted averaged word vector features
print("Support Vector Machine with tfidf weighted averaged word vector features:")
svm_tfidfwv_predictions = train_predict_evaluate_model(
    svm, tfidf_wv_train_features, train_labels, tfidf_wv_test_features,
    test_labels)
print("DONE")





Multinomial Naive Bayes with bag of words features:
0.731365313653
Support Vector Machine with bag of words features:
0.589667896679
Multinomial Naive Bayes with tfidf features :
0.577121771218
Support Vector Machine with tfidf features:
0.730627306273
Support Vector Machine with averaged word vector features:
0.372693726937
Support Vector Machine with tfidf weighted averaged word vector features:




0.340221402214
DONE


300 original documents:

In [49]:
from sklearn.datasets import fetch_20newsgroups
import nltk
import gensim

# The four newsgroups used in this experiment.
categories = ['alt.atheism', 'comp.graphics','sci.med','talk.religion.misc']

# Training set: the first 300 documents of the shuffled train split.
twenty_train = fetch_20newsgroups(subset='train', categories=categories,
                                  shuffle=True, random_state=42)
twenty_train.data = twenty_train.data[:300]
twenty_train.target = twenty_train.target[:300]

# Test set: the complete test split for the same categories.
twenty_test = fetch_20newsgroups(subset='test', categories=categories,
                                 shuffle=True, random_state=42)

# bag-of-words features (vectorizer fitted on the training documents only)
bow_vectorizer, bow_train_features = bow_extractor(twenty_train.data)
bow_test_features = bow_vectorizer.transform(twenty_test.data)

# tfidf features (vectorizer fitted on the training documents only)
tfidf_vectorizer, tfidf_train_features = tfidf_extractor(twenty_train.data)
tfidf_test_features = tfidf_vectorizer.transform(twenty_test.data)

# tokenize documents
tokenized_train = list(map(nltk.word_tokenize, twenty_train.data))
tokenized_test = list(map(nltk.word_tokenize, twenty_test.data))

# build word2vec model on the training tokens
# NOTE(review): `size` was renamed `vector_size` in gensim 4 - confirm the
# installed gensim version before re-running.
model = gensim.models.Word2Vec(tokenized_train, size=500, window=100,
                               min_count=30, sample=1e-3)

# averaged word vector features
avg_wv_train_features = averaged_word_vectorizer(tokenized_train, model, 500)
avg_wv_test_features = averaged_word_vectorizer(tokenized_test, model, 500)

# tfidf weighted averaged word vector features
vocab = tfidf_vectorizer.vocabulary_
tfidf_wv_train_features = tfidf_weighted_averaged_word_vectorizer(
    tokenized_train, tfidf_train_features, vocab, model, 500)
tfidf_wv_test_features = tfidf_weighted_averaged_word_vectorizer(
    tokenized_test, tfidf_test_features, vocab, model, 500)
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

mnb = MultinomialNB()
# NOTE(review): `n_iter` was replaced by `max_iter` in later scikit-learn
# releases - confirm the installed version before re-running.
svm = SGDClassifier(loss='hinge', n_iter=100)

train_labels = twenty_train.target
test_labels = twenty_test.target

# Each run below prints a title; train_predict_evaluate_model then fits the
# classifier on the given training features and prints the test accuracy.

# Multinomial Naive Bayes with bag of words features
print("Multinomial Naive Bayes with bag of words features:")
mnb_bow_predictions = train_predict_evaluate_model(
    mnb, bow_train_features, train_labels, bow_test_features, test_labels)

# Support Vector Machine with bag of words features
print("Support Vector Machine with bag of words features:")
svm_bow_predictions = train_predict_evaluate_model(
    svm, bow_train_features, train_labels, bow_test_features, test_labels)

# Multinomial Naive Bayes with tfidf features
print("Multinomial Naive Bayes with tfidf features :")
mnb_tfidf_predictions = train_predict_evaluate_model(
    mnb, tfidf_train_features, train_labels, tfidf_test_features, test_labels)

# Support Vector Machine with tfidf features
print("Support Vector Machine with tfidf features:")
svm_tfidf_predictions = train_predict_evaluate_model(
    svm, tfidf_train_features, train_labels, tfidf_test_features, test_labels)

# Support Vector Machine with averaged word vector features
print("Support Vector Machine with averaged word vector features:")
svm_avgwv_predictions = train_predict_evaluate_model(
    svm, avg_wv_train_features, train_labels, avg_wv_test_features, test_labels)

# Support Vector Machine with tfidf weighted averaged word vector features
print("Support Vector Machine with tfidf weighted averaged word vector features:")
svm_tfidfwv_predictions = train_predict_evaluate_model(
    svm, tfidf_wv_train_features, train_labels, tfidf_wv_test_features,
    test_labels)
print("DONE")



Multinomial Naive Bayes with bag of words features:
0.781549815498
Support Vector Machine with bag of words features:
0.654612546125
Multinomial Naive Bayes with tfidf features :
0.729889298893
Support Vector Machine with tfidf features:
0.783763837638
Support Vector Machine with averaged word vector features:
0.314391143911
Support Vector Machine with tfidf weighted averaged word vector features:




0.450184501845
DONE


2000 original documents:


In [50]:
from sklearn.datasets import fetch_20newsgroups
import nltk
import gensim

# The four newsgroups used in this experiment.
categories = ['alt.atheism', 'comp.graphics','sci.med','talk.religion.misc']

# Training set: at most the first 2000 documents of the shuffled train split.
twenty_train = fetch_20newsgroups(subset='train', categories=categories,
                                  shuffle=True, random_state=42)
twenty_train.data = twenty_train.data[:2000]
twenty_train.target = twenty_train.target[:2000]

# Test set: the complete test split for the same categories.
twenty_test = fetch_20newsgroups(subset='test', categories=categories,
                                 shuffle=True, random_state=42)

# bag-of-words features (vectorizer fitted on the training documents only)
bow_vectorizer, bow_train_features = bow_extractor(twenty_train.data)
bow_test_features = bow_vectorizer.transform(twenty_test.data)

# tfidf features (vectorizer fitted on the training documents only)
tfidf_vectorizer, tfidf_train_features = tfidf_extractor(twenty_train.data)
tfidf_test_features = tfidf_vectorizer.transform(twenty_test.data)

# tokenize documents
tokenized_train = list(map(nltk.word_tokenize, twenty_train.data))
tokenized_test = list(map(nltk.word_tokenize, twenty_test.data))

# build word2vec model on the training tokens
# NOTE(review): `size` was renamed `vector_size` in gensim 4 - confirm the
# installed gensim version before re-running.
model = gensim.models.Word2Vec(tokenized_train, size=500, window=100,
                               min_count=30, sample=1e-3)

# averaged word vector features
avg_wv_train_features = averaged_word_vectorizer(tokenized_train, model, 500)
avg_wv_test_features = averaged_word_vectorizer(tokenized_test, model, 500)

# tfidf weighted averaged word vector features
vocab = tfidf_vectorizer.vocabulary_
tfidf_wv_train_features = tfidf_weighted_averaged_word_vectorizer(
    tokenized_train, tfidf_train_features, vocab, model, 500)
tfidf_wv_test_features = tfidf_weighted_averaged_word_vectorizer(
    tokenized_test, tfidf_test_features, vocab, model, 500)
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

mnb = MultinomialNB()
# NOTE(review): `n_iter` was replaced by `max_iter` in later scikit-learn
# releases - confirm the installed version before re-running.
svm = SGDClassifier(loss='hinge', n_iter=100)

train_labels = twenty_train.target
test_labels = twenty_test.target

# Each run below prints a title; train_predict_evaluate_model then fits the
# classifier on the given training features and prints the test accuracy.

# Multinomial Naive Bayes with bag of words features
print("Multinomial Naive Bayes with bag of words features:")
mnb_bow_predictions = train_predict_evaluate_model(
    mnb, bow_train_features, train_labels, bow_test_features, test_labels)

# Support Vector Machine with bag of words features
print("Support Vector Machine with bag of words features:")
svm_bow_predictions = train_predict_evaluate_model(
    svm, bow_train_features, train_labels, bow_test_features, test_labels)

# Multinomial Naive Bayes with tfidf features
print("Multinomial Naive Bayes with tfidf features :")
mnb_tfidf_predictions = train_predict_evaluate_model(
    mnb, tfidf_train_features, train_labels, tfidf_test_features, test_labels)

# Support Vector Machine with tfidf features
print("Support Vector Machine with tfidf features:")
svm_tfidf_predictions = train_predict_evaluate_model(
    svm, tfidf_train_features, train_labels, tfidf_test_features, test_labels)

# Support Vector Machine with averaged word vector features
print("Support Vector Machine with averaged word vector features:")
svm_avgwv_predictions = train_predict_evaluate_model(
    svm, avg_wv_train_features, train_labels, avg_wv_test_features, test_labels)

# Support Vector Machine with tfidf weighted averaged word vector features
print("Support Vector Machine with tfidf weighted averaged word vector features:")
svm_tfidfwv_predictions = train_predict_evaluate_model(
    svm, tfidf_wv_train_features, train_labels, tfidf_wv_test_features,
    test_labels)
print("DONE")



Multinomial Naive Bayes with bag of words features:
0.891512915129
Support Vector Machine with bag of words features:




0.815498154982
Multinomial Naive Bayes with tfidf features :
0.828782287823
Support Vector Machine with tfidf features:




0.873800738007
Support Vector Machine with averaged word vector features:




0.657564575646
Support Vector Machine with tfidf weighted averaged word vector features:




0.661254612546
DONE


## Experiments on the augmented Dataset:

## With Back-Translation

Use 100 original documents and generate 100 new ones with back-translation (= 200 documents in total)

In [53]:
from mtranslate import translate
from sklearn.datasets import fetch_20newsgroups
import nltk
import gensim

categories = ['alt.atheism', 'comp.graphics','sci.med','talk.religion.misc']
#categories = ['alt.atheism', 'comp.graphics']

twenty_train = fetch_20newsgroups(subset='train',categories=categories, shuffle=True, random_state=42)
twenty_train.data = twenty_train.data[:100]
twenty_train.target = twenty_train.target[:100]

twenty_test = fetch_20newsgroups(subset='test',
categories=categories, shuffle=True, random_state=42)

# Back-translation augmentation: every original training document is
# translated to German and back to English, and the result is added to the
# training set with the label of its source document.
# Iterate over a snapshot of the originals so the list being extended is
# never the one being walked, and extend the labels once at the end instead
# of calling np.append (a full array copy) on every iteration.
original_docs = list(twenty_train.data)
back_translated = []
for i, original_text in enumerate(original_docs):
    print(i)  # progress indicator
    ger_transl = translate(original_text, "de")
    new = translate(ger_transl, "en")
    back_translated.append(new)
twenty_train.data.extend(back_translated)
# The i-th new document gets the label of the i-th original document.
twenty_train.target = np.concatenate(
    [twenty_train.target, twenty_train.target[:len(original_docs)]])
print(len(twenty_train.data))    

bow_vectorizer, bow_train_features = bow_extractor(twenty_train.data)  
bow_test_features = bow_vectorizer.transform(twenty_test.data) 

# tfidf features 
tfidf_vectorizer, tfidf_train_features = tfidf_extractor(twenty_train.data)  
tfidf_test_features = tfidf_vectorizer.transform(twenty_test.data)

# tokenize documents
tokenized_train = [nltk.word_tokenize(text)
                   for text in twenty_train.data]
tokenized_test = [nltk.word_tokenize(text)
                   for text in twenty_test.data]  
# build word2vec model                   
model = gensim.models.Word2Vec(tokenized_train,
                               size=500,
                               window=100,
                               min_count=30,
                               sample=1e-3)                  
                   
# averaged word vector features
avg_wv_train_features = averaged_word_vectorizer(corpus=tokenized_train,
                                                 model=model,
                                                 num_features=500)                   
avg_wv_test_features = averaged_word_vectorizer(corpus=tokenized_test,
                                                model=model,
                                                num_features=500)                                                 
                   


# tfidf weighted averaged word vector features
vocab = tfidf_vectorizer.vocabulary_
tfidf_wv_train_features = tfidf_weighted_averaged_word_vectorizer(corpus=tokenized_train, 
                                                                  tfidf_vectors=tfidf_train_features, 
                                                                  tfidf_vocabulary=vocab, 
                                                                  model=model, 
                                                                  num_features=500)
tfidf_wv_test_features = tfidf_weighted_averaged_word_vectorizer(corpus=tokenized_test, 
                                                                 tfidf_vectors=tfidf_test_features, 
                                                                 tfidf_vocabulary=vocab, 
                                                                 model=model, 
                                                                 num_features=500)
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

mnb = MultinomialNB()
# Linear SVM trained with SGD (hinge loss).
# NOTE(review): newer scikit-learn releases replace n_iter with max_iter;
# confirm against the installed version before changing it.
svm = SGDClassifier(loss='hinge', n_iter=100)

train_labels = twenty_train.target
test_labels = twenty_test.target


def _run_experiment(title, classifier, train_features, test_features):
    """Print the experiment title, then train and evaluate one classifier."""
    print(title)
    return train_predict_evaluate_model(classifier=classifier,
                                        train_features=train_features,
                                        train_labels=train_labels,
                                        test_features=test_features,
                                        test_labels=test_labels)


# Bag of words
mnb_bow_predictions = _run_experiment(
    "Multinomial Naive Bayes with bag of words features:",
    mnb, bow_train_features, bow_test_features)
svm_bow_predictions = _run_experiment(
    "Support Vector Machine with bag of words features:",
    svm, bow_train_features, bow_test_features)

# TF-IDF
mnb_tfidf_predictions = _run_experiment(
    "Multinomial Naive Bayes with tfidf features :",
    mnb, tfidf_train_features, tfidf_test_features)
svm_tfidf_predictions = _run_experiment(
    "Support Vector Machine with tfidf features:",
    svm, tfidf_train_features, tfidf_test_features)

# Word-vector features (SVM only)
svm_avgwv_predictions = _run_experiment(
    "Support Vector Machine with averaged word vector features:",
    svm, avg_wv_train_features, avg_wv_test_features)
svm_tfidfwv_predictions = _run_experiment(
    "Support Vector Machine with tfidf weighted averaged word vector features:",
    svm, tfidf_wv_train_features, tfidf_wv_test_features)
print("DONE")



0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
200




Multinomial Naive Bayes with bag of words features:
0.735055350554
Support Vector Machine with bag of words features:
0.578597785978
Multinomial Naive Bayes with tfidf features :
0.513653136531
Support Vector Machine with tfidf features:
0.691512915129
Support Vector Machine with averaged word vector features:
0.365313653137
Support Vector Machine with tfidf weighted averaged word vector features:
0.332103321033
DONE




In [None]:
BT thes

Use 100 original documents and generate 200 new ones with back translation, using German and French as pivot languages (=300 documents in total)

In [54]:
from mtranslate import translate
from sklearn.datasets import fetch_20newsgroups
import nltk
import gensim

# Four 20 Newsgroups categories used for this experiment.
# Two-class variant: categories = ['alt.atheism', 'comp.graphics']
categories = ['alt.atheism', 'comp.graphics', 'sci.med', 'talk.religion.misc']

# Keep only the first 100 shuffled training posts as the seed set.
n_seed = 100
twenty_train = fetch_20newsgroups(subset='train', categories=categories,
                                  shuffle=True, random_state=42)
twenty_train.data = twenty_train.data[:n_seed]
twenty_train.target = twenty_train.target[:n_seed]

twenty_test = fetch_20newsgroups(subset='test', categories=categories,
                                 shuffle=True, random_state=42)

# Snapshot the seed documents so appending below cannot affect the iteration.
seed_docs = list(twenty_train.data)
for i, document in enumerate(seed_docs):
    print(i)
    # English -> pivot language -> English; German first, then French.
    for pivot in ("de", "fr"):
        twenty_train.data.append(translate(translate(document, pivot), "en"))

# Every seed document produced two consecutive new documents, so each seed
# label is repeated twice in the same order.
twenty_train.target = np.concatenate(
    [twenty_train.target, np.repeat(twenty_train.target, 2)])
print(len(twenty_train.data))

# Bag-of-words counts; the vectorizer is fitted on the training documents
# and only applied to the test documents.
bow_vectorizer, bow_train_features = bow_extractor(twenty_train.data)
bow_test_features = bow_vectorizer.transform(twenty_test.data)

# TF-IDF features, fitted the same way.
tfidf_vectorizer, tfidf_train_features = tfidf_extractor(twenty_train.data)
tfidf_test_features = tfidf_vectorizer.transform(twenty_test.data)

# Word-level tokens for the word2vec based features.
tokenized_train = list(map(nltk.word_tokenize, twenty_train.data))
tokenized_test = list(map(nltk.word_tokenize, twenty_test.data))

# Train word2vec on the training tokens only; W2V_SIZE is shared with the
# vectorizers below so the feature length always matches the model.
W2V_SIZE = 500
model = gensim.models.Word2Vec(tokenized_train, size=W2V_SIZE, window=100,
                               min_count=30, sample=1e-3)

# Plain average of the word vectors of each document.
avg_wv_train_features = averaged_word_vectorizer(
    corpus=tokenized_train, model=model, num_features=W2V_SIZE)
avg_wv_test_features = averaged_word_vectorizer(
    corpus=tokenized_test, model=model, num_features=W2V_SIZE)

# TF-IDF weighted averaged word vector features, using the vocabulary of the
# TF-IDF vectorizer fitted above.
vocab = tfidf_vectorizer.vocabulary_
tfidf_wv_train_features = tfidf_weighted_averaged_word_vectorizer(
    corpus=tokenized_train, tfidf_vectors=tfidf_train_features,
    tfidf_vocabulary=vocab, model=model, num_features=W2V_SIZE)
tfidf_wv_test_features = tfidf_weighted_averaged_word_vectorizer(
    corpus=tokenized_test, tfidf_vectors=tfidf_test_features,
    tfidf_vocabulary=vocab, model=model, num_features=W2V_SIZE)
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

mnb = MultinomialNB()
# Linear SVM trained with SGD (hinge loss).
# NOTE(review): newer scikit-learn releases replace n_iter with max_iter;
# confirm against the installed version before changing it.
svm = SGDClassifier(loss='hinge', n_iter=100)

train_labels = twenty_train.target
test_labels = twenty_test.target


def _run_experiment(title, classifier, train_features, test_features):
    """Print the experiment title, then train and evaluate one classifier."""
    print(title)
    return train_predict_evaluate_model(classifier=classifier,
                                        train_features=train_features,
                                        train_labels=train_labels,
                                        test_features=test_features,
                                        test_labels=test_labels)


# Bag of words
mnb_bow_predictions = _run_experiment(
    "Multinomial Naive Bayes with bag of words features:",
    mnb, bow_train_features, bow_test_features)
svm_bow_predictions = _run_experiment(
    "Support Vector Machine with bag of words features:",
    svm, bow_train_features, bow_test_features)

# TF-IDF
mnb_tfidf_predictions = _run_experiment(
    "Multinomial Naive Bayes with tfidf features :",
    mnb, tfidf_train_features, tfidf_test_features)
svm_tfidf_predictions = _run_experiment(
    "Support Vector Machine with tfidf features:",
    svm, tfidf_train_features, tfidf_test_features)

# Word-vector features (SVM only)
svm_avgwv_predictions = _run_experiment(
    "Support Vector Machine with averaged word vector features:",
    svm, avg_wv_train_features, avg_wv_test_features)
svm_tfidfwv_predictions = _run_experiment(
    "Support Vector Machine with tfidf weighted averaged word vector features:",
    svm, tfidf_wv_train_features, tfidf_wv_test_features)
print("DONE")



0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
300




Multinomial Naive Bayes with bag of words features:
0.742435424354
Support Vector Machine with bag of words features:
0.59557195572
Multinomial Naive Bayes with tfidf features :
0.561623616236
Support Vector Machine with tfidf features:
0.693726937269
Support Vector Machine with averaged word vector features:
0.371955719557
Support Vector Machine with tfidf weighted averaged word vector features:




0.40221402214
DONE


## With Thesaurus

In [55]:
import nltk
from nltk.corpus import wordnet as wn

our_input = "I have seen quite a few data augmentation techniques for image data. However, there's not been much work found online on data augmentation techniques for text data. Any suggestions in this regard will be appreciated."

# Single characters stripped from every token before the WordNet lookup.
_PUNCTUATION = frozenset(".,?!:;\"'()[]{}")


def thesarius(our_input):
    """Return a copy of *our_input* with words replaced by WordNet synonyms.

    The text is split on single spaces. Each token has the characters in
    ``_PUNCTUATION`` removed and is then looked up with ``wn.synsets``. A
    token known to WordNet is replaced by the first lemma name of its first
    synset (underscores in multi-word lemmas become spaces); any other token
    is kept in its cleaned form.

    Parameters
    ----------
    our_input : str
        Text to augment.

    Returns
    -------
    str
        The rewritten tokens joined by single spaces. A string is always
        returned, even when nothing could be replaced, so the result can be
        appended to a training corpus without a ``None`` check.
    """
    # NOTE(review): tokens separated only by newlines or tabs stay glued
    # together (the split is on " " only) and are therefore never replaced.
    replaced = []
    for word in our_input.split(" "):
        clean_word = "".join(ch for ch in word if ch not in _PUNCTUATION)
        # An empty token cannot have synonyms, so skip the WordNet lookup.
        synsets = wn.synsets(clean_word) if clean_word else []
        if synsets:
            # Only the first lemma of the first synset is ever used.
            replaced.append(synsets[0].lemma_names()[0].replace("_", " "))
        else:
            replaced.append(clean_word)
    return " ".join(replaced)


if __name__ == "__main__":
    # Small demo: show the sample text next to its augmented version.
    augmented = thesarius(our_input)
    print("- Our data:\n", our_input)
    print("\n- New data:\n", augmented)


Use 100 original documents and generate 100 new ones with the thesaurus (=200 documents)

In [56]:

from sklearn.datasets import fetch_20newsgroups
import nltk
import gensim

# Four 20 Newsgroups categories used for this experiment.
# Two-class variant: categories = ['alt.atheism', 'comp.graphics']
categories = ['alt.atheism', 'comp.graphics', 'sci.med', 'talk.religion.misc']

# Keep only the first 100 shuffled training posts as the seed set.
n_seed = 100
twenty_train = fetch_20newsgroups(subset='train', categories=categories,
                                  shuffle=True, random_state=42)
twenty_train.data = twenty_train.data[:n_seed]
twenty_train.target = twenty_train.target[:n_seed]

twenty_test = fetch_20newsgroups(subset='test', categories=categories,
                                 shuffle=True, random_state=42)

# Snapshot the seed documents so appending below cannot affect the iteration.
seed_docs = list(twenty_train.data)
for i, document in enumerate(seed_docs):
    print(i)
    twenty_train.data.append(thesarius(document))

# One new document per seed document, in the same order: duplicate the labels.
twenty_train.target = np.concatenate(
    [twenty_train.target, twenty_train.target])
print(len(twenty_train.data))

# Bag-of-words counts; the vectorizer is fitted on the training documents
# and only applied to the test documents.
bow_vectorizer, bow_train_features = bow_extractor(twenty_train.data)
bow_test_features = bow_vectorizer.transform(twenty_test.data)

# TF-IDF features, fitted the same way.
tfidf_vectorizer, tfidf_train_features = tfidf_extractor(twenty_train.data)
tfidf_test_features = tfidf_vectorizer.transform(twenty_test.data)

# Word-level tokens for the word2vec based features.
tokenized_train = list(map(nltk.word_tokenize, twenty_train.data))
tokenized_test = list(map(nltk.word_tokenize, twenty_test.data))

# Train word2vec on the training tokens only; W2V_SIZE is shared with the
# vectorizers below so the feature length always matches the model.
W2V_SIZE = 500
model = gensim.models.Word2Vec(tokenized_train, size=W2V_SIZE, window=100,
                               min_count=30, sample=1e-3)

# Plain average of the word vectors of each document.
avg_wv_train_features = averaged_word_vectorizer(
    corpus=tokenized_train, model=model, num_features=W2V_SIZE)
avg_wv_test_features = averaged_word_vectorizer(
    corpus=tokenized_test, model=model, num_features=W2V_SIZE)

# TF-IDF weighted averaged word vector features, using the vocabulary of the
# TF-IDF vectorizer fitted above.
vocab = tfidf_vectorizer.vocabulary_
tfidf_wv_train_features = tfidf_weighted_averaged_word_vectorizer(
    corpus=tokenized_train, tfidf_vectors=tfidf_train_features,
    tfidf_vocabulary=vocab, model=model, num_features=W2V_SIZE)
tfidf_wv_test_features = tfidf_weighted_averaged_word_vectorizer(
    corpus=tokenized_test, tfidf_vectors=tfidf_test_features,
    tfidf_vocabulary=vocab, model=model, num_features=W2V_SIZE)
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

mnb = MultinomialNB()
# Linear SVM trained with SGD (hinge loss).
# NOTE(review): newer scikit-learn releases replace n_iter with max_iter;
# confirm against the installed version before changing it.
svm = SGDClassifier(loss='hinge', n_iter=100)

train_labels = twenty_train.target
test_labels = twenty_test.target


def _run_experiment(title, classifier, train_features, test_features):
    """Print the experiment title, then train and evaluate one classifier."""
    print(title)
    return train_predict_evaluate_model(classifier=classifier,
                                        train_features=train_features,
                                        train_labels=train_labels,
                                        test_features=test_features,
                                        test_labels=test_labels)


# Bag of words
mnb_bow_predictions = _run_experiment(
    "Multinomial Naive Bayes with bag of words features:",
    mnb, bow_train_features, bow_test_features)
svm_bow_predictions = _run_experiment(
    "Support Vector Machine with bag of words features:",
    svm, bow_train_features, bow_test_features)

# TF-IDF
mnb_tfidf_predictions = _run_experiment(
    "Multinomial Naive Bayes with tfidf features :",
    mnb, tfidf_train_features, tfidf_test_features)
svm_tfidf_predictions = _run_experiment(
    "Support Vector Machine with tfidf features:",
    svm, tfidf_train_features, tfidf_test_features)

# Word-vector features (SVM only)
svm_avgwv_predictions = _run_experiment(
    "Support Vector Machine with averaged word vector features:",
    svm, avg_wv_train_features, avg_wv_test_features)
svm_tfidfwv_predictions = _run_experiment(
    "Support Vector Machine with tfidf weighted averaged word vector features:",
    svm, tfidf_wv_train_features, tfidf_wv_test_features)
print("DONE")



0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
200




Multinomial Naive Bayes with bag of words features:
0.723985239852
Support Vector Machine with bag of words features:
0.611070110701
Multinomial Naive Bayes with tfidf features :
0.532103321033
Support Vector Machine with tfidf features:
0.692250922509
Support Vector Machine with averaged word vector features:
0.287822878229
Support Vector Machine with tfidf weighted averaged word vector features:




0.315867158672
DONE


Use 1000 original documents and generate 1000 new ones with the thesaurus (=2000 documents)

In [57]:
from sklearn.datasets import fetch_20newsgroups
import nltk
import gensim

# Four 20 Newsgroups categories used for this experiment.
# Two-class variant: categories = ['alt.atheism', 'comp.graphics']
categories = ['alt.atheism', 'comp.graphics', 'sci.med', 'talk.religion.misc']

# Keep only the first 1000 shuffled training posts as the seed set.
n_seed = 1000
twenty_train = fetch_20newsgroups(subset='train', categories=categories,
                                  shuffle=True, random_state=42)
twenty_train.data = twenty_train.data[:n_seed]
twenty_train.target = twenty_train.target[:n_seed]

twenty_test = fetch_20newsgroups(subset='test', categories=categories,
                                 shuffle=True, random_state=42)

# Snapshot the seed documents so appending below cannot affect the iteration.
seed_docs = list(twenty_train.data)
for i, document in enumerate(seed_docs):
    print(i)
    twenty_train.data.append(thesarius(document))

# One new document per seed document, in the same order: duplicate the labels.
twenty_train.target = np.concatenate(
    [twenty_train.target, twenty_train.target])
print(len(twenty_train.data))

# Bag-of-words counts; the vectorizer is fitted on the training documents
# and only applied to the test documents.
bow_vectorizer, bow_train_features = bow_extractor(twenty_train.data)
bow_test_features = bow_vectorizer.transform(twenty_test.data)

# TF-IDF features, fitted the same way.
tfidf_vectorizer, tfidf_train_features = tfidf_extractor(twenty_train.data)
tfidf_test_features = tfidf_vectorizer.transform(twenty_test.data)

# Word-level tokens for the word2vec based features.
tokenized_train = list(map(nltk.word_tokenize, twenty_train.data))
tokenized_test = list(map(nltk.word_tokenize, twenty_test.data))

# Train word2vec on the training tokens only; W2V_SIZE is shared with the
# vectorizers below so the feature length always matches the model.
W2V_SIZE = 500
model = gensim.models.Word2Vec(tokenized_train, size=W2V_SIZE, window=100,
                               min_count=30, sample=1e-3)

# Plain average of the word vectors of each document.
avg_wv_train_features = averaged_word_vectorizer(
    corpus=tokenized_train, model=model, num_features=W2V_SIZE)
avg_wv_test_features = averaged_word_vectorizer(
    corpus=tokenized_test, model=model, num_features=W2V_SIZE)

# TF-IDF weighted averaged word vector features, using the vocabulary of the
# TF-IDF vectorizer fitted above.
vocab = tfidf_vectorizer.vocabulary_
tfidf_wv_train_features = tfidf_weighted_averaged_word_vectorizer(
    corpus=tokenized_train, tfidf_vectors=tfidf_train_features,
    tfidf_vocabulary=vocab, model=model, num_features=W2V_SIZE)
tfidf_wv_test_features = tfidf_weighted_averaged_word_vectorizer(
    corpus=tokenized_test, tfidf_vectors=tfidf_test_features,
    tfidf_vocabulary=vocab, model=model, num_features=W2V_SIZE)
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

mnb = MultinomialNB()
# Linear SVM trained with SGD (hinge loss).
# NOTE(review): newer scikit-learn releases replace n_iter with max_iter;
# confirm against the installed version before changing it.
svm = SGDClassifier(loss='hinge', n_iter=100)

train_labels = twenty_train.target
test_labels = twenty_test.target


def _run_experiment(title, classifier, train_features, test_features):
    """Print the experiment title, then train and evaluate one classifier."""
    print(title)
    return train_predict_evaluate_model(classifier=classifier,
                                        train_features=train_features,
                                        train_labels=train_labels,
                                        test_features=test_features,
                                        test_labels=test_labels)


# Bag of words
mnb_bow_predictions = _run_experiment(
    "Multinomial Naive Bayes with bag of words features:",
    mnb, bow_train_features, bow_test_features)
svm_bow_predictions = _run_experiment(
    "Support Vector Machine with bag of words features:",
    svm, bow_train_features, bow_test_features)

# TF-IDF
mnb_tfidf_predictions = _run_experiment(
    "Multinomial Naive Bayes with tfidf features :",
    mnb, tfidf_train_features, tfidf_test_features)
svm_tfidf_predictions = _run_experiment(
    "Support Vector Machine with tfidf features:",
    svm, tfidf_train_features, tfidf_test_features)

# Word-vector features (SVM only)
svm_avgwv_predictions = _run_experiment(
    "Support Vector Machine with averaged word vector features:",
    svm, avg_wv_train_features, avg_wv_test_features)
svm_tfidfwv_predictions = _run_experiment(
    "Support Vector Machine with tfidf weighted averaged word vector features:",
    svm, tfidf_wv_train_features, tfidf_wv_test_features)
print("DONE")

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27



Multinomial Naive Bayes with bag of words features:
0.872324723247
Support Vector Machine with bag of words features:




0.787453874539
Multinomial Naive Bayes with tfidf features :
0.838376383764
Support Vector Machine with tfidf features:




0.851660516605
Support Vector Machine with averaged word vector features:




0.653136531365
Support Vector Machine with tfidf weighted averaged word vector features:




0.652398523985
DONE


## Back Translation with Thesaurus:

Use 100 original documents and generate 200 new ones: 100 with back translation (German as pivot language) and 100 with the thesaurus (=300 documents)

In [58]:
from mtranslate import translate
from sklearn.datasets import fetch_20newsgroups
import nltk
import gensim

# Four 20 Newsgroups categories used for this experiment.
# Two-class variant: categories = ['alt.atheism', 'comp.graphics']
categories = ['alt.atheism', 'comp.graphics', 'sci.med', 'talk.religion.misc']

# Keep only the first 100 shuffled training posts as the seed set.
n_seed = 100
twenty_train = fetch_20newsgroups(subset='train', categories=categories,
                                  shuffle=True, random_state=42)
twenty_train.data = twenty_train.data[:n_seed]
twenty_train.target = twenty_train.target[:n_seed]

twenty_test = fetch_20newsgroups(subset='test', categories=categories,
                                 shuffle=True, random_state=42)

# Snapshot the seed documents so appending below cannot affect the iteration.
seed_docs = list(twenty_train.data)
for i, document in enumerate(seed_docs):
    print(i)
    # First new document: English -> German -> English round trip.
    twenty_train.data.append(translate(translate(document, "de"), "en"))
    # Second new document: synonym replacement of the original text.
    twenty_train.data.append(thesarius(document))

# Every seed document produced two consecutive new documents, so each seed
# label is repeated twice in the same order.
twenty_train.target = np.concatenate(
    [twenty_train.target, np.repeat(twenty_train.target, 2)])
print(len(twenty_train.data))

# Bag-of-words counts; the vectorizer is fitted on the training documents
# and only applied to the test documents.
bow_vectorizer, bow_train_features = bow_extractor(twenty_train.data)
bow_test_features = bow_vectorizer.transform(twenty_test.data)

# TF-IDF features, fitted the same way.
tfidf_vectorizer, tfidf_train_features = tfidf_extractor(twenty_train.data)
tfidf_test_features = tfidf_vectorizer.transform(twenty_test.data)

# Word-level tokens for the word2vec based features.
tokenized_train = list(map(nltk.word_tokenize, twenty_train.data))
tokenized_test = list(map(nltk.word_tokenize, twenty_test.data))

# Train word2vec on the training tokens only; W2V_SIZE is shared with the
# vectorizers below so the feature length always matches the model.
W2V_SIZE = 500
model = gensim.models.Word2Vec(tokenized_train, size=W2V_SIZE, window=100,
                               min_count=30, sample=1e-3)

# Plain average of the word vectors of each document.
avg_wv_train_features = averaged_word_vectorizer(
    corpus=tokenized_train, model=model, num_features=W2V_SIZE)
avg_wv_test_features = averaged_word_vectorizer(
    corpus=tokenized_test, model=model, num_features=W2V_SIZE)

# TF-IDF weighted averaged word vector features, using the vocabulary of the
# TF-IDF vectorizer fitted above.
vocab = tfidf_vectorizer.vocabulary_
tfidf_wv_train_features = tfidf_weighted_averaged_word_vectorizer(
    corpus=tokenized_train, tfidf_vectors=tfidf_train_features,
    tfidf_vocabulary=vocab, model=model, num_features=W2V_SIZE)
tfidf_wv_test_features = tfidf_weighted_averaged_word_vectorizer(
    corpus=tokenized_test, tfidf_vectors=tfidf_test_features,
    tfidf_vocabulary=vocab, model=model, num_features=W2V_SIZE)
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

# Classifiers shared by all experiments below; each call to
# train_predict_evaluate_model receives one of these two instances.
mnb = MultinomialNB()

# Linear SVM trained with SGD (hinge loss) for a fixed 100 epochs.
# `n_iter` was deprecated in scikit-learn 0.19 and removed in 0.21, so the
# modern spelling is tried first: `max_iter=100` with `tol=None` disables
# early stopping and therefore keeps the original "exactly 100 epochs"
# behaviour. Releases that predate `max_iter` reject the keyword with a
# TypeError, in which case the legacy argument is used instead.
try:
    svm = SGDClassifier(loss='hinge', max_iter=100, tol=None)
except TypeError:
    svm = SGDClassifier(loss='hinge', n_iter=100)

# Ground-truth class ids for the train and test splits.
train_labels = twenty_train.target
test_labels = twenty_test.target

# --- Bag-of-words features -------------------------------------------------
print("Multinomial Naive Bayes with bag of words features:")
mnb_bow_predictions = train_predict_evaluate_model(
    classifier=mnb,
    train_features=bow_train_features,
    train_labels=train_labels,
    test_features=bow_test_features,
    test_labels=test_labels)

print("Support Vector Machine with bag of words features:")
svm_bow_predictions = train_predict_evaluate_model(
    classifier=svm,
    train_features=bow_train_features,
    train_labels=train_labels,
    test_features=bow_test_features,
    test_labels=test_labels)

# --- TF-IDF features -------------------------------------------------------
print("Multinomial Naive Bayes with tfidf features :")
mnb_tfidf_predictions = train_predict_evaluate_model(
    classifier=mnb,
    train_features=tfidf_train_features,
    train_labels=train_labels,
    test_features=tfidf_test_features,
    test_labels=test_labels)

print("Support Vector Machine with tfidf features:")
svm_tfidf_predictions = train_predict_evaluate_model(
    classifier=svm,
    train_features=tfidf_train_features,
    train_labels=train_labels,
    test_features=tfidf_test_features,
    test_labels=test_labels)

# --- Averaged word2vec features (SVM only) ---------------------------------
print("Support Vector Machine with averaged word vector features:")
svm_avgwv_predictions = train_predict_evaluate_model(
    classifier=svm,
    train_features=avg_wv_train_features,
    train_labels=train_labels,
    test_features=avg_wv_test_features,
    test_labels=test_labels)

# --- TF-IDF weighted averaged word2vec features (SVM only) -----------------
print("Support Vector Machine with tfidf weighted averaged word vector features:")
svm_tfidfwv_predictions = train_predict_evaluate_model(
    classifier=svm,
    train_features=tfidf_wv_train_features,
    train_labels=train_labels,
    test_features=tfidf_wv_test_features,
    test_labels=test_labels)
print("DONE")



0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
300




Multinomial Naive Bayes with bag of words features:
0.739483394834
Support Vector Machine with bag of words features:
0.608118081181
Multinomial Naive Bayes with tfidf features :
0.557933579336
Support Vector Machine with tfidf features:
0.69594095941
Support Vector Machine with averaged word vector features:
0.323247232472
Support Vector Machine with tfidf weighted averaged word vector features:




0.328413284133
DONE
