In [1]:
import re, json, string, datetime, random, itertools
from collections import OrderedDict, defaultdict

import random
import datetime
from gensim import corpora  #https://radimrehurek.com/gensim/
import pandas as pd  #http://pandas.pydata.org/
import numpy as np  #http://www.numpy.org/
import matplotlib.pyplot as plt  #https://matplotlib.org/
import FastLDA
from pSSLDA import infer

In [2]:
with open("Data/depression_lexicon.json") as f:
    seed_terms = json.load(f)

In [39]:
tweets_df=pd.read_csv('Data/tweets_cleaned.csv')
tweets_df['created_at']=pd.to_datetime(tweets_df['created_at'])

In [40]:
tweets_df_50=tweets_df[tweets_df._50==1]
tweets_df_70=tweets_df[tweets_df._70==1]
tweets_df_100=tweets_df[tweets_df._100==1]

In [41]:
users=set(tweets_df['username'].to_list())
users_50=set(tweets_df_50['username'].to_list())
users_70=set(tweets_df_70['username'].to_list())
users_100=set(tweets_df_100['username'].to_list())

In [42]:
tweets_df=tweets_df[["tweet_id", "created_at", "text", "cleaned_text", "polarity_raw",'username']]
tweets_df.columns=["tweet_ID", "created_at", "raw_text", "cleaned_text", "sentiment",'username']
tweets_df_50=tweets_df_50[["tweet_id", "created_at", "text", "cleaned_text", "polarity_raw",'username']]
tweets_df_50.columns=["tweet_ID", "created_at", "raw_text", "cleaned_text", "sentiment",'username']
tweets_df_70=tweets_df_70[["tweet_id", "created_at", "text", "cleaned_text", "polarity_raw",'username']]
tweets_df_70.columns=["tweet_ID", "created_at", "raw_text", "cleaned_text", "sentiment",'username']
tweets_df_100=tweets_df_100[["tweet_id", "created_at", "text", "cleaned_text", "polarity_raw",'username']]
tweets_df_100.columns=["tweet_ID", "created_at", "raw_text", "cleaned_text", "sentiment",'username']

In [43]:
user_sample = random.sample(users_100, 5)

***
##### To emulate PHQ-9 questionare, we bucket tweets based on their creation time with a sliding window of 14 days. Each bucket will then be treated as a document when we run LDA.

In [44]:
def chunks(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

def create_time_buckets(tweets):
    bucketed_tweets = defaultdict(list)
    counter=0
    for dates in list(chunks(sorted(set(tweets.created_at.dt.date.tolist())), 14)):
        mask = (tweets['created_at'].dt.date >= dates[0]) & (tweets['created_at'].dt.date <= dates[-1])
        df=tweets.loc[mask]
        df=df.drop('username', axis=1)
        for index, tweet in df.iterrows():
            bucketed_tweets[counter].append(tweet)
    return bucketed_tweets

In [45]:
user_bucketed_tweets={}
for user in user_sample:
    user_bucketed_tweets[user] = create_time_buckets(tweets_df[tweets_df['username']==user])

***
##### Prepare the data for LDA from the bucketed tweets

In [46]:
def data_prep(bucketed_tweets):

    texts = list()

    # each bucket is hashed on the start and end date
    for bucket in bucketed_tweets:

        all_bucket_tweets = ""

        for tweet in bucketed_tweets[bucket]:

            try:
                all_bucket_tweets += tweet.cleaned_text + " "
            except:
                pass

        texts.append(all_bucket_tweets.strip().replace("\n", "").split(" "))

    # assign each word a unique ID
    dictionary = corpora.Dictionary(texts)

    # remove gaps in id sequence after words that were removed
    dictionary.compactify()

    voc_size = len(list(dictionary.keys()))

    # replace token ids with the token text in each doc and return similar arry of tokens and docs
    text_as_ids = list()

    # to later be the docvec
    doc_as_ids = list()

    # number of docs here is the number of buckets
    number_of_docs = len(bucketed_tweets)

    for x in range(number_of_docs):

        doc = texts[x]

        for token in doc:
            text_as_ids.append(dictionary.token2id[token])
            doc_as_ids.append(x)

    return text_as_ids, doc_as_ids, voc_size, dictionary.token2id, number_of_docs, bucketed_tweets

In [47]:
# docs for us will be here multiple tweets
user_LDA_inputs={}
for user in user_sample:
    user_LDA_inputs[user] = data_prep(user_bucketed_tweets[user])

***
##### Run LDA 

In [48]:
# NOTE: topics and signals are used in interchangebly, they both mean the same thing.

# calculated the average sentiment of a token based on its occurence in a given set of tweets terms sentiment is therefore taken from the tweet sentiment not targeted sentiment
def get_avg_sentiment(bucketed_tweets, token):

    term_tweets_sent_scores = get_tweets_by_term(bucketed_tweets, token)

    score = 0.0
    count = 0

    for sent_score in term_tweets_sent_scores:
        score += float(sent_score)
        count += 1

    return score / count


def get_tweets_by_term(bucketed_tweets, term):

    term_tweets_sent_scores = list()

    for bucket in bucketed_tweets:
        for tweet in bucketed_tweets[bucket]:
            try:
                if term in tweet.cleaned_text:
                    term_tweets_sent_scores.append(tweet.sentiment)
            except:
                pass

    return term_tweets_sent_scores


def get_topics_terms(tup):

    estphi = tup[0]
    W = tup[1]
    T = tup[2]
    id2token = tup[3]

    # This will contain the mappings of each term to each of our topics
    topics_dict = defaultdict(defaultdict)

    # find the topic where each term is part of W: vocabulary size
    for index in range(W):
        # projects one column of the matrix which contains the weight of the term in all of the topics
        term_weights = estphi[:, index]

        # will contain the largest weight which ->  topic it was assigned to
        largest_weight = 0

        for weight in term_weights:
            if weight > largest_weight:
                largest_weight = weight

        # this will get the index of the topic with largest weight
        term_topic = np.argwhere(term_weights == largest_weight)[0][0]

        topics_dict[term_topic][id2token[index]] = largest_weight
        
    return topics_dict


def get_all_terms_sentiments(id2token, w, bucketed_tweets):

    seed_term_sentiment = defaultdict(float)

    unique_w = list(set(w))

    for wi in unique_w:
        token = id2token[wi]

        if token in seed_terms['signal_1']:
            seed_term_sentiment[token] = get_avg_sentiment(
                bucketed_tweets, token)

        elif token in seed_terms['signal_2']:
            seed_term_sentiment[token] = get_avg_sentiment(
                bucketed_tweets, token)

        elif token in seed_terms['signal_3']:
            seed_term_sentiment[token] = get_avg_sentiment(
                bucketed_tweets, token)

        elif token in seed_terms['signal_4']:
            seed_term_sentiment[token] = get_avg_sentiment(
                bucketed_tweets, token)

        elif token in seed_terms['signal_5']:
            seed_term_sentiment[token] = get_avg_sentiment(
                bucketed_tweets, token)

        elif token in seed_terms['signal_6']:
            seed_term_sentiment[token] = get_avg_sentiment(
                bucketed_tweets, token)

        elif token in seed_terms['signal_7']:
            seed_term_sentiment[token] = get_avg_sentiment(
                bucketed_tweets, token)

        elif token in seed_terms['signal_8']:
            seed_term_sentiment[token] = get_avg_sentiment(
                bucketed_tweets, token)

        elif token in seed_terms['signal_9']:
            seed_term_sentiment[token] = get_avg_sentiment(
                bucketed_tweets, token)

        elif token in seed_terms['signal_10']:
            seed_term_sentiment[token] = get_avg_sentiment(
                bucketed_tweets, token)

    return seed_term_sentiment

In [49]:
def run_LDA(LDA_input, parameters):
    

    token2id = LDA_input[3]

    # number of topics
    T = parameters["topics_count"]

    (wordvec, docvec, zvec) = ([], [], [])

    # vector of words per tweet
    wordvec = LDA_input[0]
    docvec = LDA_input[1]

    # W = vocabulary size
    W = LDA_input[2]

    (w, d) = (np.array(wordvec, dtype = np.int), np.array(docvec, dtype = np.int))

    # Create parameters
    alpha = np.ones((1,T)) * 1
    beta = np.ones((T,W)) * 0.01

    # How many parallel samplers do we wish to use?
    P = 1

    # Random number seed
    randseed =  random.randint(999,999999)

    # Number of samples to take
    numsamp = 50

    # Do parallel inference
    final_z = infer(w, d, alpha, beta, numsamp, randseed, P)

    # number of documents = tweets
    D = LDA_input[4]

    # Estimate phi and theta
    (nw, nd) = FastLDA.countMatrices(w, W, d, D, final_z, T)
    (estphi,esttheta) = FastLDA.estPhiTheta(nw, nd, alpha, beta)

    # swap keys with values in the token2id => id2token
    id2token = dict((v,k) for k,v in token2id.iteritems())

    seed_term_sentiment = get_all_terms_sentiments(id2token, w, LDA_input[5])

    # Now, we add z-labels to *force* words into separate topics
    
    labelweight = 5.0

    label0 = np.zeros((T,), dtype=np.float)
    label0[0] = labelweight

    label1 = np.zeros((T,), dtype=np.float)
    label1[1] = labelweight

    label2 = np.zeros((T,), dtype=np.float)
    label2[2] = labelweight

    label3 = np.zeros((T,), dtype=np.float)
    label3[3] = labelweight

    label4 = np.zeros((T,), dtype=np.float)
    label4[4] = labelweight

    label5 = np.zeros((T,), dtype=np.float)
    label5[5] = labelweight

    label6 = np.zeros((T,), dtype=np.float)
    label6[6] = labelweight

    label7 = np.zeros((T,), dtype=np.float)
    label7[7] = labelweight

    label8 = np.zeros((T,), dtype=np.float)
    label8[8] = labelweight

    label9 = np.zeros((T,), dtype=np.float)
    label9[9] = labelweight

    label10 = np.zeros((T,), dtype=np.float)
    label10[10] = labelweight

    label11 = np.zeros((T,), dtype=np.float)
    label11[11] = labelweight

    # signals ids
    corpus_signals = [0,1,2,3,4,5,6,7,8,9]
   
    seed_terms_per_signal = defaultdict(lambda: defaultdict(int))

    z_labels = []
    for wi in w:

        token = id2token[wi]

        # if the word appears in topic 0
        if token in seed_terms['signal_1'] and  seed_term_sentiment[token] <= 0:

            z_labels.append(label0)

            seed_terms_per_signal['signal_1'][token]+=1

            if 0 in corpus_signals:
                corpus_signals.remove(0)


        elif token in seed_terms['signal_2'] and  seed_term_sentiment[token] <= 0:

            z_labels.append(label1)

            seed_terms_per_signal['signal_2'][token]+=1

            if 1 in corpus_signals:
                corpus_signals.remove(1)


        elif token in seed_terms['signal_3'] and seed_term_sentiment[token] <= 0:

            z_labels.append(label2)

            seed_terms_per_signal['signal_3'][token]+=1

            if 2 in corpus_signals:
                corpus_signals.remove(2)


        elif token in seed_terms['signal_4'] and seed_term_sentiment[token] <= 0:

            z_labels.append(label3)
            seed_terms_per_signal['signal_4'][token]+=1

            if 3 in corpus_signals:
                corpus_signals.remove(3)


        elif token in seed_terms['signal_5'] and seed_term_sentiment[token] <= 0:

            z_labels.append(label4)

            seed_terms_per_signal['signal_5'][token]+=1

            if 4 in corpus_signals:
                corpus_signals.remove(4)

        elif token in seed_terms['signal_6'] and  seed_term_sentiment[token] <= 0:

            z_labels.append(label5)

            seed_terms_per_signal['signal_6'][token]+=1

            if 5 in corpus_signals:
                corpus_signals.remove(5)

        elif token in seed_terms['signal_7'] and  seed_term_sentiment[token] <= 0:

            z_labels.append(label6)

            seed_terms_per_signal['signal_7'][token]+=1

            if 6 in corpus_signals:
                corpus_signals.remove(6)

        elif token in seed_terms['signal_8'] and  seed_term_sentiment[token] <= 0:

            z_labels.append(label7)

            seed_terms_per_signal['signal_8'][token]+=1

            if 7 in corpus_signals:
                corpus_signals.remove(7)

        elif token in seed_terms['signal_9'] and  seed_term_sentiment[token] <= 0:

            z_labels.append(label8)

            seed_terms_per_signal['signal_9'][token]+=1

            if 8 in corpus_signals:
                corpus_signals.remove(8)

        elif token in seed_terms['signal_10'] and  seed_term_sentiment[token] <= 0:

            z_labels.append(label9)

            seed_terms_per_signal['signal_10'][token]+=1

            if 9 in corpus_signals:
                corpus_signals.remove(9)

        else:
            z_labels.append(None)

    # Now inference will find topics with 0 and 1 in separate topics
    final_z = infer(w, d, alpha, beta, numsamp, randseed, P, zlabels = z_labels)

    # Re-estimate phi and theta
    (nw, nd) = FastLDA.countMatrices(w, W, d, D, final_z, T)
    (estphi,esttheta) = FastLDA.estPhiTheta(nw, nd, alpha, beta)

    # Find the sentiment of each topic cluster based on the tweets where each seed term appered in

    tup = (estphi, W, T, id2token)
    topics_terms = get_topics_terms(tup)
    
    sent_scores = defaultdict(list)

    counter = 0
    for topic in topics_terms:

        topic_sent_scores = list()

        for term in topics_terms[topic]:
            term_tweets_sent_scores = get_tweets_by_term(LDA_input[5], term)

            for sent_score in term_tweets_sent_scores:
                 topic_sent_scores.append(float(sent_score))

        avg = sum(topic_sent_scores) / float(len(topic_sent_scores))

        sent_scores[topic] = (topic_sent_scores, avg)
        
        counter+=1

    # post processing of topics. If the bucket has less than 30 tweets then discard the probabilities of that bucket

    len_buckets = []
    for bucket in LDA_input[5]:
        len_b = len(LDA_input[5][bucket])
        len_buckets.append(len_b)

   
    # threshold #1: if number of tweets in that bucket is less than x, then discard that bucket.
    min_number_of_tweets_per_bucket = parameters["min_tweets_per_bucket"]
    
    for x in range(len(len_buckets)):
        if len_buckets[x] <= min_number_of_tweets_per_bucket:
            esttheta[x, :] = 0

    # this will replace zero to the probabilities of the topic by ID if no seed terms were found in the corpus
    for topic_id in corpus_signals:
        esttheta[:, topic_id] = 0

    all_topics_seeds = list()
    for signal in seed_terms_per_signal:
        all_topics_seeds += seed_terms_per_signal[signal]

    # topics to keep
    seeds_in_top_k = defaultdict(int)

    # number of seed terms that should be in the top topic terms
    seeds_threshold = parameters["seeds_threshold"]
    
    # The number of terms in the topic that we will look into to search for seed terms
    top_topic_terms = parameters["top_topic_terms"]

    for topic in topics_terms:
        for x in range(len(topics_terms[topic])):
            term = list(topics_terms[topic])[x]
            if x < top_topic_terms:
                if term in all_topics_seeds:
                    seeds_in_top_k[topic] += 1

    # this will replace zero to the probabilities of the topic by ID if no seed terms were found in the corpus
    for x in range(len(esttheta[0])):
        if x in seeds_in_top_k.keys():
            if seeds_in_top_k[x] < seeds_threshold:
                esttheta[:, x] = 0
        else:
            esttheta[:, x] = 0


    return (estphi, W, T, id2token), esttheta, topics_terms, seed_terms_per_signal

In [50]:
''' 
    topics_count: number of topics/signals to construct using LDA
    min_tweets_per_bucket: minimum number of tweets per bucket to constructs and accept a topic from it
    seeds_threshold: number of seed terms in the top topic terms
    top_topic_terms: the number of terms to consider when searching for seed terms
'''

parameters = {"topics_count": 15, "min_tweets_per_bucket": 20, "seeds_threshold": 2, "top_topic_terms": 25}

user_LDA_outputs={}
for user in user_sample:
    user_LDA_outputs[user] = run_LDA(user_LDA_inputs[user], parameters)

Online z initialization
Assigning documents to partitions
Getting indices associated with each partition
Create re-numbered doc vectors for each partition
Initializing count matrices
Launching Sampler processes
Computing global nw count matrix
Sample 0 of 50
Sample 1 of 50
Sample 2 of 50
Sample 3 of 50
Sample 4 of 50
Sample 5 of 50
Sample 6 of 50
Sample 7 of 50
Sample 8 of 50
Sample 9 of 50
Sample 10 of 50
Sample 11 of 50
Sample 12 of 50
Sample 13 of 50
Sample 14 of 50
Sample 15 of 50
Sample 16 of 50
Sample 17 of 50
Sample 18 of 50
Sample 19 of 50
Sample 20 of 50
Sample 21 of 50
Sample 22 of 50
Sample 23 of 50
Sample 24 of 50
Sample 25 of 50
Sample 26 of 50
Sample 27 of 50
Sample 28 of 50
Sample 29 of 50
Sample 30 of 50
Sample 31 of 50
Sample 32 of 50
Sample 33 of 50
Sample 34 of 50
Sample 35 of 50
Sample 36 of 50
Sample 37 of 50
Sample 38 of 50
Sample 39 of 50
Sample 40 of 50
Sample 41 of 50
Sample 42 of 50
Sample 43 of 50
Sample 44 of 50
Sample 45 of 50
Sample 46 of 50
Sample 47 of 5



Computing global nw count matrix
Sample 0 of 50
Sample 1 of 50
Sample 2 of 50
Sample 3 of 50
Sample 4 of 50
Sample 5 of 50
Sample 6 of 50
Sample 7 of 50
Sample 8 of 50
Sample 9 of 50
Sample 10 of 50
Sample 11 of 50
Sample 12 of 50
Sample 13 of 50
Sample 14 of 50
Sample 15 of 50
Sample 16 of 50
Sample 17 of 50
Sample 18 of 50
Sample 19 of 50
Sample 20 of 50
Sample 21 of 50
Sample 22 of 50
Sample 23 of 50
Sample 24 of 50
Sample 25 of 50
Sample 26 of 50
Sample 27 of 50
Sample 28 of 50
Sample 29 of 50
Sample 30 of 50
Sample 31 of 50
Sample 32 of 50
Sample 33 of 50
Sample 34 of 50
Sample 35 of 50
Sample 36 of 50
Sample 37 of 50
Sample 38 of 50
Sample 39 of 50
Sample 40 of 50
Sample 41 of 50
Sample 42 of 50
Sample 43 of 50
Sample 44 of 50
Sample 45 of 50
Sample 46 of 50
Sample 47 of 50
Sample 48 of 50
Sample 49 of 50




Online z initialization
Assigning documents to partitions
Getting indices associated with each partition
Create re-numbered doc vectors for each partition
Initializing count matrices
Launching Sampler processes
Computing global nw count matrix
Sample 0 of 50
Sample 1 of 50
Sample 2 of 50
Sample 3 of 50
Sample 4 of 50
Sample 5 of 50
Sample 6 of 50
Sample 7 of 50
Sample 8 of 50
Sample 9 of 50
Sample 10 of 50
Sample 11 of 50
Sample 12 of 50
Sample 13 of 50
Sample 14 of 50
Sample 15 of 50
Sample 16 of 50
Sample 17 of 50
Sample 18 of 50
Sample 19 of 50
Sample 20 of 50
Sample 21 of 50
Sample 22 of 50
Sample 23 of 50
Sample 24 of 50
Sample 25 of 50
Sample 26 of 50
Sample 27 of 50
Sample 28 of 50
Sample 29 of 50
Sample 30 of 50
Sample 31 of 50
Sample 32 of 50
Sample 33 of 50
Sample 34 of 50
Sample 35 of 50
Sample 36 of 50
Sample 37 of 50
Sample 38 of 50
Sample 39 of 50
Sample 40 of 50
Sample 41 of 50
Sample 42 of 50
Sample 43 of 50
Sample 44 of 50
Sample 45 of 50
Sample 46 of 50
Sample 47 of 5

Sample 48 of 50
Sample 49 of 50


In [55]:
def detect_depression(LDA_output, user):
    
    try:

        esttheta = LDA_output[1]
        
        #print ("Topics Probabilties Over Time")
                
        headers = ["Time Period", "Signal-1", "Signal-2", "Signal-3", "Signal-4", "Signal-5",
                                  "Signal-6", "Signal-7", "Signal-8", "Signal-9", "Signal-10"]
        
        rows = list()
        
        counter = 0
        for key in user_bucketed_tweets[user].keys():
            
            df = pd.DataFrame(user_bucketed_tweets[user][key])
                        
            bucket_date = str(df.created_at.min().strftime("%d/%m/%Y")) + " To " + \
                          str(df.created_at.max().strftime("%d/%m/%Y"))
            
            row = [bucket_date] + [esttheta[counter][x] for x in range(len(esttheta[counter])) if x < 10]
            
            rows.append(row)

            # increment counter to get element from the result matrix
            counter+=1

        topics_probabilities = pd.DataFrame(rows, columns=headers)
        
        #print (topics_probabilities)
        
        #topics_probabilities.plot(kind='line')
        #plt.show()
                
        #print ("Topics Terms")
         

        
        headers = ["Topic Number", "Topic Terms"]
        rows = list()
        
        for topic in LDA_output[2]:

            topic_nbr = topic+1
            
            rows.append([topic_nbr, ", ".join(LDA_output[2][topic])])

        topics_terms = pd.DataFrame(rows, columns=headers)
                    
        #print (topics_terms)

        #print ("Seeded Terms Per Topic")

        headers = ["Topic Number", "Seed Terms:Count"]
        rows = list()
        
        # seed_terms_per_signal
        for topic in LDA_output[3]:
            
            seedTerms = [str(seedTerm)+":"+str(LDA_output[3][topic][seedTerm]) 
                                         for seedTerm in LDA_output[3][topic]]
            
            rows.append([topic, ", ".join(seedTerms)])
        
        topics_seeds = pd.DataFrame(rows, columns=headers)
                    
        #print topics_seeds
        return topics_probabilities, topics_terms, topics_seeds
    except AssertionError:
        print ("number of tweets is insufficents for depression detection!")

In [56]:
user_depression_outputs={}
for user in user_sample:
    user_depression_outputs[user]=detect_depression(user_LDA_outputs[user],user)

In [57]:
user_sample

['amysav83', 'Kikirowr', 'MiDesfileNegro', 'thisgoeshere', 'nuttychris']

In [58]:
user_depression_outputs['amysav83'][0]

Unnamed: 0,Time Period,Signal-1,Signal-2,Signal-3,Signal-4,Signal-5,Signal-6,Signal-7,Signal-8,Signal-9,Signal-10
0,07/04/2009 To 25/06/2009,0.0,0.0,0.040733,0.0611,0.0,0.0,0.0,0.0,0.13442,0.0


In [59]:
user_depression_outputs['amysav83'][1]

Unnamed: 0,Topic Number,Topic Terms
0,1,"quack, mum, ready, code, goto, shorts, food, g..."
1,2,"moz, aaawww, missin, nite, disappointed, twitt..."
2,3,"go_to_bed, deserved, thunder, scared, ages, ye..."
3,4,"aaww, ghost, rice, hows, doin, tired, need_to_..."
4,5,"pouring, evening, yas, sore, pic, wk, plane, m..."
5,6,"kids, england, hoping, turn, feel, shift, prin..."
6,7,"week, thankies, waaa, yeeey, ohh, oohh, fair, ..."
7,8,"weather, afta, appreciating, shit, luv, angry,..."
8,9,"nurses, wil, dead, num, funny, nights, working..."
9,10,"pills, claim, sooo, service, shame, aww, haha,..."


In [60]:
user_depression_outputs['amysav83'][2]

Unnamed: 0,Topic Number,Seed Terms:Count
0,signal_9,"dead:1, hurt:1"
1,signal_8,angry:1
2,signal_3,"awake:1, need_to_sleep:1, go_to_bed:1"
3,signal_2,disappointed:1
4,signal_4,tired:3
5,signal_10,pills:1
