In [1]:
"""topic modeling of the tweets using Latent Dirichlet Allocation (LDA) and
   Non-negative matrix factorization (NMF). Run this with the rigt flag to get topic features for train/test""" 

# Thumbs up to https://goo.gl/EGEDw5 for on-point demo

import numpy as np
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from ekphrasis.classes.preprocessor import TextPreProcessor 
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

# reading data
from load import parse_dataset

In [2]:
TRAIN = True # set this flag to True to get train features, and False to get test features  

In [3]:
if TRAIN:
    dataset='../datasets/train/SemEval2018-T3-train-taskB_emoji.txt'
    corpus, _ = parse_dataset(dataset)
else:
    dataset='../datasets/test_TaskB/SemEval2018-T3_input_test_taskB_emoji.txt'
    corpus = parse_dataset(dataset)


In [4]:
text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'url', 'date', 'number'],
    # terms that will be annotated
    # hashtag anotation is not shown (segmented content of the hashtag is still included), as it might confuse topic model
    annotate={ "allcaps", "elongated", "repeated",
        'emphasis', 'censored'},     fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=True,  # spell correction is set to True to help topic model 
    all_caps_tag="every",
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
    )

Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


In [5]:
X = [" ".join(text_processor.pre_process_doc(x)) for x in corpus]
X = [x.split() for x in corpus]

In [6]:
def chunkIt(seq, n):
    """splits the list into n approximately equal sub-lists. source: goo.gl/VrHKeR"""
    avg = len(seq) / float(n)
    out = []
    last = 0.0
    while last < len(seq):
        out.append(seq[int(last):int(last + avg)])
        last += avg
    return out

X_chunks = [chunkIt(x, n=2) for x in X] 
X_flattened = [s for sublist in X_chunks for s in sublist]
X_flattened = [' '.join(t) for t in X_flattened]

In [7]:
no_features = 1000

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(X_flattened)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# LDA can only use raw term counts because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(X_flattened)
tf_feature_names = tf_vectorizer.get_feature_names()

In [8]:
no_topics = 10 # this number should be set beforehand, and is one of the hyperparameters 
# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)
# Run LDA
lda = LatentDirichletAllocation(n_topics=no_topics, max_iter=5, learning_method='online', learning_offset=50., random_state=0).fit(tf)

In [9]:
# Topics are not labeled by the algorithms a numeric index is assigned.

# this script is to check the top words in each topic cluster
# def display_topics(model, feature_names, no_top_words):
#     for topic_idx, topic in enumerate(model.components_):
#         print("Topic {}:".format(topic_idx))
#         print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

# no_top_words = 10
# display_topics(nmf, tfidf_feature_names, no_top_words)
# display_topics(lda, tf_feature_names, no_top_words)

In [10]:
nmf = nmf.transform(tfidf)
lda = lda.transform(tf)

In [11]:
# def create_topic_feats():
# feats = []

lda_topics = [lda[i].argmax() for i in range(lda.shape[0])]
nmf_topics = [nmf[i].argmax() for i in range(nmf.shape[0])]

In [12]:
topics_feats = list(zip(nmf_topics, lda_topics))
topics_feats = [np.array(t)for t in topics_feats]

In [13]:
len(topics_feats)

7668

In [14]:
topics_feats[0]

array([2, 0])

In [15]:
topics_feats = [np.concatenate((i[0], i[1])) for i in list(zip(topics_feats[0::2], topics_feats[1::2]))]

In [16]:
len(topics_feats)

3834

In [17]:
if TRAIN:
    np.save('topic_feats_10_2chunks.npy', topics_feats)
else:
    np.save('test_topic_feats_10_2chunks.npy', topics_feats)