In [1]:
from gensim.models.word2vec import Word2Vec
from nltk.tokenize import TweetTokenizer
import rake
from keywords import TweetKeywords

from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

import time
from random import shuffle
from itertools import chain
import numpy as np
import scipy




In [2]:
conf = (SparkConf().setMaster("local").setAppName("twitter-app").set("spark.kryoserializer.buffer.max", "1g"))
sc = SparkContext(conf=conf)
sql_context = SQLContext(sc)

In [7]:
start = time.time()
model = Word2Vec.load_word2vec_format('../../word2vec_twitter_model/word2vec_twitter_model.bin',binary=True, encoding='latin-1')
print time.time() - start

71.337321043


In [8]:
# averages words vectors for a tweet
def average_word_vecs(words):
    vecs = []
    for word in words:
        word = word.replace('\n', '')
        try:
            vecs.append(model[word]) #.reshape((1,size_dimension))
        except KeyError:
            continue
    if len(vecs) == 0:
        return None
    else:
        vecs = np.average(vecs,axis=0)
        return np.array(vecs, dtype='float') #TSNE expects float type values

In [9]:
# vecs: an array of real vectors
def cosine_cluster(vecs, min_similarity):
    cluster_vec = []         # tracks sum of vectors in a cluster
    cluster_idx_master = []         # array of index arrays. e.g. [[1, 3, 5], [2, 4, 6]]
    n_cluster = 0
    N = len(vecs)
    
    for i in range(1, N):
        max_similarity = -np.inf
        max_idx = 0
        v = vecs[i]
        if not all(t==0 for t in v):
            cluster_sims = [1-scipy.spatial.distance.cosine(v, cluster_vec[j]) for j in range(n_cluster)]

            if len(cluster_sims)==0: cluster_sims.append(max_similarity)
            max_similarity = max(cluster_sims)
            #print cluster_sims
            if max_similarity < min_similarity:
                # create new cluster
                cluster_vec.append(v)
                cluster_idx_master.append([i])
                n_cluster += 1
            else:
                test = [idx for idx, sim in enumerate(cluster_sims) if sim == max_similarity]
                cluster_idx = [idx for idx, sim in enumerate(cluster_sims) if sim == max_similarity][0]
                cluster_vec[cluster_idx] = np.add(cluster_vec[cluster_idx], v)
                cluster_idx_master[cluster_idx].append(i)
    return cluster_idx_master

In [10]:
tknzr = TweetTokenizer(preserve_case=True,reduce_len=True,strip_handles=True)
keyword_extractor = TweetKeywords(tknzr)

In [11]:
corpus = sql_context.read.json('../../lowes.json')

In [12]:
corpus.select(['text']).take(2)

[Row(text=u'@Caljammr home remodeling project! Buying 6k floors from Lowes. Chase ink plus to earn 5x points on gift cards at staples!'),
 Row(text=u'RT @SCStocks: $ECOB Cheap stock at .0002 #LotteryTicket ready to lift finally. Lockout in place #Sales #HomeDepot #Lowes #RT\nhttps://t.co/K\u2026')]

In [13]:
clean_tokenize_udf = udf(keyword_extractor.tweet_tokenizer,returnType=ArrayType(StringType(),False))

In [14]:
corpus = corpus.select('*',clean_tokenize_udf('text').alias('clean_text')).select('clean_text')

In [15]:
corpus.take(2)

[Row(clean_text=[u'home', u'remodeling', u'project', u'Buying', u'6k', u'floors', u'from', u'Lowes', u'Chase', u'ink', u'plus', u'to', u'earn', u'5x', u'points', u'on', u'gift', u'cards', u'at', u'staples']),
 Row(clean_text=[u'ECOB', u'Cheap', u'stock', u'at', u'0002', u'LotteryTicket', u'ready', u'to', u'lift', u'finally', u'Lockout', u'in', u'place', u'Sales', u'HomeDepot', u'Lowes', u'RT'])]

In [16]:
corpus.count()

5806

In [17]:
# remove retweets and duplicates
corpus.drop_duplicates().count()

4923

In [18]:
corpus = corpus.drop_duplicates()

In [19]:
start = time.time()
all_docs = corpus.toPandas()['clean_text'].values.tolist()#.collect()
print "seconds:", time.time() - start

seconds: 2.95780515671


In [20]:
shuffle(all_docs)

In [21]:
start = time.time()
vec_list = []
tweet = []
for doc in all_docs:
    docvec = average_word_vecs(doc)
    if docvec == None:
        continue
    else:
        vec_list.append(docvec)
        tweet.append(doc)
print time.time() - start

0.347715854645




In [22]:
len(vec_list)

4914

In [23]:
len(tweet)

4914

In [24]:
start = time.time()
cluster_results = cosine_cluster(vec_list, .7)
print time.time() - start

36.5452730656


In [25]:
clusters = []
for index, clus in enumerate(cluster_results):
    clusters.append((index, len(clus)))

In [26]:
clusters.sort(cmp=None,key=lambda x: x[1],reverse=True)

In [27]:
# cluster numbers and lengths
clusters[:10]

[(1, 2102),
 (12, 457),
 (20, 423),
 (10, 318),
 (19, 251),
 (5, 230),
 (22, 129),
 (3, 121),
 (25, 107),
 (141, 94)]

In [28]:
num_characters = 3
max_phrase = 3
remove_repeats = True
for tup in clusters[:10]:
    tweet_list = []
    for index in cluster_results[tup[0]]:
        tweet_list.append(tweet[index])
    print 'Cluster number:{}'.format(tup[0]), "Cluster size:{}".format(tup[1])
    print "keywords:", keyword_extractor.keywords_from_tweet_list(tweet_list,num_characters, max_phrase,remove_repeats)[:10], "\n"

Cluster number:1 Cluster size:2102
keywords: [(u'tension relieving force', 9.0), (u'indoor bug protection', 8.666666666666666), (u'mplusplaces download today', 7.766666666666666), (u'latest baby news', 7.619047619047619), (u'major trump supporter', 7.583333333333334), (u'innovation end cap', 7.111111111111111), (u'lowes gift cards', 6.984452122408687), (u'lowes hires man', 6.734452122408687), (u'lowes parking lot', 6.688997576954142), (u'local hardware store', 6.2606958762886595)] 

Cluster number:12 Cluster size:457
keywords: [(u'franco morbidelli completed', 9.0), (u'replace bradley smith', 9.0), (u'armour hat cap', 9.0), (u'making motogp debu', 8.5), (u'complete motogp rider', 8.5), (u'armour nascar race', 8.166666666666666), (u'kalex moto mac', 8.0), (u'donington alex lowes', 7.9), (u'motogp racing motorsports', 7.5), (u'zarco lowes 2nd', 6.9)] 

Cluster number:20 Cluster size:423
keywords: [(u'favorite sh big', 9.0), (u'donthecon triggers call', 9.0), (u'peak lab rescues', 9.0), (