In [1]:
from gensim.models import word2vec
from gensim.models import KeyedVectors
from glove import Corpus, Glove
import pandas as pd
import re
from nltk.tokenize import sent_tokenize
import nltk
from sklearn.svm import SVC
import numpy as np

from collections import Counter, defaultdict
from tabulate import tabulate
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier



ImportError: No module named 'glove'

In [None]:
def create_glove(data, LEARNING_RATE=0.05, EPOCHS=5, NO_THREADS=4, EMBEDDING_DIM=100):
    """Train a GloVe model on ``data`` and return it.

    A ``Corpus`` is fitted over ``data`` with a context window of 10, a
    ``Glove`` model with ``EMBEDDING_DIM`` components is trained on the
    resulting ``corpus.matrix``, and the corpus dictionary is attached to
    the model before it is returned.

    Parameters
    ----------
    data : iterable passed straight to ``Corpus.fit`` (the notebook passes
        a list of tokenised sentences).
    LEARNING_RATE : learning rate handed to ``Glove``.
    EPOCHS : number of training epochs.
    NO_THREADS : number of threads used while fitting.
    EMBEDDING_DIM : number of components of each word vector.
    """
    cooccurrence = Corpus()
    cooccurrence.fit(data, window=10)

    glove_model = Glove(no_components=EMBEDDING_DIM, learning_rate=LEARNING_RATE)
    glove_model.fit(
        cooccurrence.matrix,
        epochs=EPOCHS,
        no_threads=NO_THREADS,
        verbose=True,
    )
    # Attach the vocabulary so vectors can be looked up by word afterwards.
    glove_model.add_dictionary(cooccurrence.dictionary)
    return glove_model

def create_word2vec(data,EMBEDDING_DIM=100):
    """Train and return a gensim ``Word2Vec`` model on ``data``.

    ``data`` is forwarded unchanged to ``word2vec.Word2Vec`` (the notebook
    passes a list of tokenised sentences); ``EMBEDDING_DIM`` is given as the
    ``size`` of the learned vectors.
    """
    return word2vec.Word2Vec(data, size=EMBEDDING_DIM)

class MeanEmbeddingVectorizer(object):
    """Represent each tokenised document by the mean of its word vectors.

    Parameters
    ----------
    word2vec : mapping from word to vector. It must be non-empty, because
        the vector dimensionality is read from one of its values.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(next(iter(word2vec.values())))

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is nothing to learn.

        ``y`` is optional so the vectorizer can be fitted without labels,
        e.g. ``vectorizer.fit(X)``.
        """
        return self

    def transform(self, X):
        """Return one row per document in ``X``.

        Each row is the element-wise mean of the vectors of the document's
        words that are present in ``self.word2vec``. A document with no
        known words (including an empty one) yields a zero vector of length
        ``self.dim``.
        """
        return np.array([
            # `or` swaps in a single zero vector when no word was found,
            # so np.mean never sees an empty list.
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

    
class TfidfEmbeddingVectorizer(object):
    """Represent each tokenised document by an IDF-weighted mean of word vectors.

    Parameters
    ----------
    word2vec : mapping from word to vector. It must be non-empty, because
        the vector dimensionality is read from one of its values.

    ``fit`` must be called before ``transform``: it populates
    ``self.word2weight``, which is ``None`` until then.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        self.dim = len(next(iter(word2vec.values())))

    def fit(self, X, y=None):
        """Learn one IDF weight per word from the tokenised documents ``X``.

        ``y`` is ignored; it is optional so the vectorizer can be fitted
        without labels. Returns ``self``.
        """
        # Documents are already token lists, so the analyzer is the identity.
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        """Return one row per document in ``X``.

        Each known word's vector is multiplied by its weight from
        ``self.word2weight`` and the products are averaged element-wise.
        A document with no known words yields a zero vector of length
        ``self.dim``.
        """
        return np.array([
                # `or` swaps in a single zero vector when no word was found,
                # so np.mean never sees an empty list.
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])
    
def preprocess_text(posts):
    """Build one normalised text string from a post's title and body.

    Parameters
    ----------
    posts : mapping-like object exposing ``'post_title'`` and ``'post_text'``
        (the notebook passes one DataFrame row at a time).

    Returns
    -------
    str
        ``"<title>. <text>"`` after the following steps, in order:

        1. "tl;dr" / "tldr" markers are removed (case-insensitive).
        2. Bracketed or space-delimited age/gender tags such as ``(25M)``
           or ``[F 30s]`` are replaced by a single space.
        3. Remaining digit runs become ``NUM``.
        4. ``u/<name>`` mentions become ``AT_USER``.
        5. ``www.*`` and ``http(s)://*`` links become ``URL``.
        6. Everything from the first edit/update trailer that starts a new
           line onwards is dropped.
        7. Carriage returns are removed.
    """
    text = str(posts['post_title'])+'. '+ str(posts['post_text'])
    text = re.sub(r'tl[;]?dr', '', text, flags=re.IGNORECASE)

    # Age/gender tags: digits followed by f/m, then f/m followed by digits.
    gender_age_patterns = (
        r'[ \(\[]+[0-9]+[s]?[ /\(,)]*f[ \]\)]+',
        r'[ \(\[]+[0-9]+[s]?[ /\(,)]*m[ \]\)]+',
        r'[ \(\[]+f[ /\(,)]*[0-9]+[s]?[ \]\)]+',
        r'[ \(\[]+m[ /\(,)]*[0-9]+[s]?[ \]\)]+',
    )
    for tag_pattern in gender_age_patterns:
        text = re.sub(tag_pattern, ' ', text, flags=re.IGNORECASE)

    text = re.sub(r'[0-9]+', 'NUM', text, flags=re.IGNORECASE)
    text = re.sub(r'u/[^\s]+', 'AT_USER', text, flags=re.IGNORECASE)
    # Convert www.* or https?://* to URL
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', text, flags=re.IGNORECASE)

    # These separators are regular expressions, so they must go through
    # re.split: str.split would look for the pattern text literally and
    # never find it, leaving every edit/update trailer in place.
    trailer_patterns = (
        r'[.]?\n[\* \[\(/]*[eE]dit',
        r'[.]?\n[\* \[\(/]*EDIT',
        r'[.]?\n[\* \[\(/]*big edit',
        r'[.]?\n[\* \[\(/]*important edit',
        r'[.]?\n[\* \[\(/]*[uU]pdate',
        r'[.]?\n[\* \[\(/]*UPDATE',
        r'[.]?\n[\* \[\(/]*big update',
        r'[.]?\n[\* \[\(/]*important update',
        r'[.]?\nfor an update',
    )
    for trailer in trailer_patterns:
        # Keep only what precedes the first occurrence of the trailer.
        text = re.split(trailer, text, maxsplit=1)[0]

    text = text.replace('\r', '')
    return text

In [None]:
#prepare sentences
# Both collections are reduced to the same columns, so the list is kept in
# one place instead of being repeated per collection.
POST_COLUMNS = ['post_created_utc', 'full_link', 'post_id', 'post_num_comments',
                'post_score', 'subreddit', 'post_title', 'post_text']

c_train = pd.read_csv('data/c_train2.csv')
c_test = pd.read_csv('data/c_test2.csv')
c_data = pd.concat([c_train,c_test],sort=False)
c_data = c_data.loc[:,POST_COLUMNS]

nc_train = pd.read_csv('data/nc_train2.csv')
nc_test = pd.read_csv('data/nc_test2.csv')
nc_data = pd.concat([nc_train,nc_test],sort=False)
nc_data = nc_data.loc[:,POST_COLUMNS]

full_data = pd.concat([c_data,nc_data],sort=False)
full_data = full_data.sample(len(full_data))  # sampling every row shuffles the frame
posts = full_data.apply(preprocess_text,axis=1)

# One token list per sentence, accumulated over all posts.
data_sentences = []
for post in posts:
    sent_tokenize_list = sent_tokenize(post)
    data = [nltk.word_tokenize(sentence) for sentence in sent_tokenize_list]
    # extend() appends in place; `data_sentences + data` would copy the
    # whole accumulated list again for every post.
    data_sentences.extend(data)
len(data_sentences)

In [None]:
# Build the labelled training set: 'c' for rows of c_data, 'nc' for rows of nc_data.
c_data['class'] = 'c'
nc_data['class'] = 'nc'
# Sampling as many rows as the frame holds shuffles it.
train = pd.concat([c_data, nc_data]).pipe(lambda frame: frame.sample(len(frame)))
posts = train.apply(preprocess_text, axis=1)
# One token list per post (no sentence splitting here).
X = [list(nltk.word_tokenize(post)) for post in posts]
y = train['class']

In [None]:
# Keep the documents as a 1-D object array holding one token list per post.
# np.array(X) is not used here: posts have different token counts, and
# recent NumPy refuses to build an array from such ragged input, while
# equal-length posts would silently become a 2-D array of strings.
tokenised_posts = list(X)
X = np.empty(len(tokenised_posts), dtype=object)
for i, tokens in enumerate(tokenised_posts):
    X[i] = tokens
y = np.array(y)
print(len(X),len(y))

In [2]:
print('start training...')
gloveModel = create_glove(data_sentences)
w2vModel = create_word2vec(data_sentences)
#is ok to train the model with the full dataset as we are not providing labels.
# Plain word -> vector dicts, the form the embedding vectorizers expect.
w2v = dict(zip(w2vModel.wv.index2word, w2vModel.wv.syn0))
# gloveModel.dictionary maps each word to an id; use that id to pick the
# row of word_vectors rather than relying on the dict's iteration order
# lining up with the row order.
glove = {w: gloveModel.word_vectors[i] for w, i in gloveModel.dictionary.items()}

start training...


NameError: name 'create_glove' is not defined

In [45]:
def _extra_trees_on(vectorizer):
    """Return a pipeline feeding ``vectorizer`` output into 200 extra trees."""
    return Pipeline([
        ("word2vec vectorizer", vectorizer),
        ("extra trees", ExtraTreesClassifier(n_estimators=200)),
    ])


# word2vec embeddings: plain mean and tf-idf weighted mean
etree_w2v = _extra_trees_on(MeanEmbeddingVectorizer(w2v))
etree_w2v_tfidf = _extra_trees_on(TfidfEmbeddingVectorizer(w2v))

# GloVe embeddings: plain mean and tf-idf weighted mean
etree_glove = _extra_trees_on(MeanEmbeddingVectorizer(glove))
etree_glove_tfidf = _extra_trees_on(TfidfEmbeddingVectorizer(glove))

In [46]:
def _tokens_as_is(tokens):
    """Identity analyzer: the documents are already lists of tokens."""
    return tokens


# start with the classics - naive bayes of the multinomial and bernoulli varieties
# with either pure counts or tfidf features
mult_nb = Pipeline([
    ("count_vectorizer", CountVectorizer(analyzer=_tokens_as_is)),
    ("multinomial nb", MultinomialNB()),
])
bern_nb = Pipeline([
    ("count_vectorizer", CountVectorizer(analyzer=_tokens_as_is)),
    ("bernoulli nb", BernoulliNB()),
])
mult_nb_tfidf = Pipeline([
    ("tfidf_vectorizer", TfidfVectorizer(analyzer=_tokens_as_is)),
    ("multinomial nb", MultinomialNB()),
])
bern_nb_tfidf = Pipeline([
    ("tfidf_vectorizer", TfidfVectorizer(analyzer=_tokens_as_is)),
    ("bernoulli nb", BernoulliNB()),
])

# SVM - which is supposed to be more or less state of the art
# http://www.cs.cornell.edu/people/tj/publications/joachims_98a.pdf
svc = Pipeline([
    ("count_vectorizer", CountVectorizer(analyzer=_tokens_as_is)),
    ("linear svc", SVC(kernel="linear")),
])
svc_tfidf = Pipeline([
    ("tfidf_vectorizer", TfidfVectorizer(analyzer=_tokens_as_is)),
    ("linear svc", SVC(kernel="linear")),
])

In [53]:
#evaluation
# Previously computed scores are reused; on a first run (no scores.csv yet)
# start from an empty table with the expected columns.
try:
    scores = pd.read_csv('scores.csv')
except FileNotFoundError:
    scores = pd.DataFrame(columns=['model', 'score', 'kfold'])

all_models = [
    ("w2v", etree_w2v),
    ("w2v_tfidf", etree_w2v_tfidf),
    ("mult_nb", mult_nb),
    ("mult_nb_tfidf", mult_nb_tfidf),
    # the two SVC pipelines are currently excluded from the run
#     ("svc", svc),
#     ("svc_tfidf", svc_tfidf),
    ("bern_nb", bern_nb),
    ("bern_nb_tfidf", bern_nb_tfidf),
    ("glove", etree_glove),
    ("glove_tfidf", etree_glove_tfidf)
]

kfold = 10
for name,model in all_models:
    # Skip any model that already has a score for this number of folds.
    already_scored = ((scores['model']==name) & (scores['kfold']==kfold)).any()
    if already_scored:
        continue
    result = cross_val_score(model,X,y,cv=kfold).mean()
    # DataFrame.append no longer exists in current pandas; concat a
    # one-row frame instead.
    new_row = pd.DataFrame([{'model':name,'score':result,'kfold':kfold}])
    scores = pd.concat([scores,new_row],ignore_index=True,sort=False)
    # Written after every model so finished results survive an interrupted run.
    scores.to_csv('scores.csv',encoding='utf-8',index=False)


In [56]:
#clustering w2v
# Query similarity through the model's keyed vectors (.wv), the same
# attribute the embedding dict is built from; the method on the model
# object itself is deprecated in gensim.
print(w2vModel.wv.similarity('this', 'is'))

-0.2149282432880231


  
