---
##### Import all the necessary libraries

In [1]:
import re
import json
import string
import datetime
import itertools
from collections import defaultdict

from wordsegment import load, segment
from nltk import TweetTokenizer
from nltk.corpus import stopwords
from textblob import TextBlob
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from datetime import datetime
import tqdm

load()

---
##### Load the depression lexicon to seed the LDA topics

In [2]:
# Read the depression PHQ-9 lexicon: a JSON object whose values are lists of seed terms.
# JSON is UTF-8 by specification, so decode it explicitly instead of relying on
# the platform's default encoding.
with open("depression_lexicon.json", encoding="utf-8") as f:
    seed_terms = json.load(f)

# Flatten the per-signal term lists into one list, replacing the underscores
# in each seed with spaces.
seed_terms_col = [
    seed.replace("_", " ")
    for seed in itertools.chain.from_iterable(seed_terms.values())
]

---
##### Prepare other lexicons and resources required to filter and pre-process the tweets

In [3]:
# Other lexicons and resources

# Text emoticons that are filtered out of tokenised tweets.
# Every backslash is written as "\\": the previous "\o" spellings were invalid
# escape sequences, and "*\0/*" silently contained a NUL character because
# "\0" is an octal escape.
emojies = [":‑)", ":)", ":D", ":o)", ":]", ":3", ":c)", ":>", "=]", "8)", "=)", ":}", ":^)", ":っ)",
           ":‑D", "8‑D", "8D", "x‑D", "xD", "X‑D", "XD", "=‑D", "=D", "=‑3", "=3", "B^D", ":-))", ">:[",
           ":‑(", ":(", ":‑c", ":c", ":‑<", ":っC", ":<", ":‑[", ":[", ":{", ";(", ":-||", ":@", ">:(",
           ":'‑(", ":'(", ":'‑)", ":')", "D:<", "D:", "D8", "D;", "D=", "DX", "v.v", "D‑':", ">:O", ":‑O",
           ":O", ":‑o", ":o", "8‑0", "O_O", "o‑o", "O_o", "o_O", "o_o", "O-O", ":*", ":-*", ":^*", "(", "}{'",
           ")", ";‑)", ";)", "*-)", "*)", ";‑]", ";]", ";D", ";^)", ":‑,", ">:P", ":‑P", ":P", "X‑P", "x‑p",
           "xp", "XP", ":‑p", ":p", "=p", ":‑Þ", ":Þ", ":þ", ":‑þ", ":‑b", ":b", "d:", ">:\\", ">:/", ":‑/",
           ":‑.", ":/", ":\\", "=/", "=\\", ":L", "=L", ":S", ">.<", ":|", ":‑|", ":$", ":‑X", ":X", ":‑#",
           ":#", "O:‑)", "0:‑3", "0:3", "0:‑)", "0:)", "0;^)", ">:)", ">;)", ">:‑)", "}:‑)", "}:)", "3:‑)",
           "3:)", "o/\\o", "^5", ">_>^", "^<_<", "|;‑)", "|‑O", ":‑J", ":‑&", ":&", "#‑)", "%‑)", "%)",
           ":‑###..", ":###..", "<:‑|", "<*)))‑{", "><(((*>", "><>", "\\o/", "*\\0/*", "@}‑;‑'‑‑‑", "@>‑‑>‑‑",
           "~(_8^(I)", "5:‑)", "~:‑\\", "//0‑0\\\\", "*<|:‑)", "=:o]", "7:^]", ",:‑)", "</3", "<3"]


# Shared tokenizer used by preprocess_text(). The flags ask NLTK's TweetTokenizer to
# keep the original casing, shorten exaggerated character runs (reduce_len) and
# drop @handles (strip_handles).
tweet_token = TweetTokenizer(
    preserve_case=True, reduce_len=True, strip_handles=True)

# The set of printable ASCII characters.
printable = {ch for ch in string.printable}

# Every ASCII punctuation mark except "-" and "_", in string.punctuation order.
punctuation = [mark for mark in string.punctuation if mark not in ("-", "_")]

# Hand-maintained stop words (lower-case) that supplement NLTK's English list; the
# statement following this literal merges in the punctuation marks and the NLTK words.
# NOTE(review): the list contains words such as "alone", "help", "hard", "never",
# "nothing", "serious" and "sorry" — confirm that dropping them is intended for a
# depression-related analysis.
stop_words_extended = [
    "a's", "abaft", "able", "aboard", "above", "abst", "accordance", "according", "accordingly", "across", "act", "actually",
    "added", "adj", "affected", "affecting", "affects", "afore", "aforesaid", "afterwards", "against", "agin", "ago", "ah",
    "ain't", "aint", "albeit", "allow", "allows", "almost", "alone", "along", "alongside", "already", "also", "although",
    "always", "american", "amid", "amidst", "among", "amongst", "and", "anent", "announce", "another", "anybody", "anyhow",
    "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "apart", "apparently", "appear", "appreciate",
    "appropriate", "approximately", "aren", "arent", "arise", "around", "aside", "ask", "asking", "aslant", "associated",
    "astride", "athwart", "auth", "available", "away", "awfully", "b", "back", "bar", "barring", "became", "become",
    "becomes", "becoming", "before", "beforehand", "begin", "beginning", "beginnings", "begins", "behind", "believe",
    "beneath", "beside", "besides", "best", "better", "betwixt", "beyond", "biol", "brief", "briefly", "by", "c", "c'mon",
    "c's", "ca", "came", "can't", "cannot", "cant", "cause", "causes", "certain", "certainly", "changes", "circa", "clearly",
    "close", "co", "com", "come", "comes", "concerning", "consequently", "consider", "considering", "contain", "containing",
    "contains", "corresponding", "cos", "could", "couldn't", "couldnt", "couldst", "course", "currently", "dare", "dared",
    "daren", "dares", "daring", "date", "definitely", "described", "despite", "didn", "different", "directly", "does",
    "doesn't", "don", "done", "dost", "doth", "downwards", "due", "durst", "e", "early", "ed", "edu", "effect", "eg",
    "eight", "eighty", "either", "else", "elsewhere", "em", "end", "ending", "english", "enough", "entirely", "er",
    "ere", "especially", "et", "et-al", "etc", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere",
    "ex", "exactly", "example", "except", "excepting", "f", "failing", "far", "ff", "fifth", "first", "five", "fix",
    "followed", "following", "follows", "former", "formerly", "forth", "found", "four", "further", "furthermore", "g",
    "gave", "get", "gets", "getting", "give", "given", "gives", "giving", "go", "goes", "going", "gone", "gonna", "got",
    "gotta", "gotten", "greetings", "h", "hadn", "happens", "hard", "hardly", "hasn", "hast", "hath", "haven", "having",
    "he'd", "he'll", "he's", "hed", "hello", "help", "hence", "here", "here's", "hereafter", "hereby", "herein", "heres",
    "hereupon", "herself", "hes", "hi", "hid", "high", "himself", "hither", "home", "hopefully", "how's", "howbeit",
    "however", "hundred", "i'd", "i'll", "i'm", "i've", "id", "ie", "ignored", "ill", "im", "immediate", "immediately",
    "importance", "important", "inasmuch", "inc", "indeed", "index", "indicate", "indicated", "indicates", "information",
    "inner", "inside", "insofar", "instantly", "instead", "invention", "inward", "isn", "it", "it'd", "it'll", "itd",
    "itself", "j", "k", "keep", "keeps", "kept", "kg", "km", "know", "known", "knows", "l", "large", "largely", "last",
    "lately", "later", "latter", "latterly", "least", "left", "less", "lest", "let", "let's", "lets", "like", "liked",
    "likely", "likewise", "line", "little", "living", "long", "look", "looking", "looks", "ltd", "made", "mainly", "make",
    "makes", "many", "may", "maybe", "mayn", "mean", "means", "meantime", "meanwhile", "merely", "mg", "mid", "midst",
    "might", "million", "mine", "minus", "miss", "ml", "moreover", "mostly", "mr", "mrs", "much", "mug", "must", "mustn't",
    "myself", "n", "na", "name", "namely", "nay", "nd", "near", "nearly", "neath", "necessarily", "necessary", "need",
    "needed", "needing", "needs", "neither", "never", "nevertheless", "new", "next", "nigh", "nigher", "nighest", "nine",
    "ninety", "nisi", "nobody", "non", "none", "nonetheless", "noone", "normally", "nos", "noted", "nothing",
    "notwithstanding", "novel", "nowhere", "obtain", "obtained", "obviously", "off", "often", "oh", "ok", "okay", "old",
    "omitted", "once", "one", "ones", "oneself", "onto", "open", "ord", "others", "otherwise", "ought", "oughtn", "ours",
    "out", "outside", "overall", "owing", "p", "page", "pages", "part", "particular", "particularly", "past", "pending",
    "per", "perhaps", "placed", "please", "plus", "poorly", "possible", "possibly", "potentially", "pp", "predominantly",
    "present", "presumably", "previously", "primarily", "probably", "promptly", "proud", "provided", "provides", "providing",
    "public", "put", "q", "qua", "que", "quickly", "quite", "qv", "r", "ran", "rather", "rd", "readily", "real", "really",
    "reasonably", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively",
    "research", "respecting", "respectively", "resulted", "resulting", "results", "right", "round", "run", "said", "sans",
    "save", "saving", "saw", "say", "saying", "says", "sec", "second", "secondly", "section", "see", "seeing", "seem",
    "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several",
    "shall", "shalt", "shan't", "she'd", "she'll", "shed", "shell", "shes", "short", "shouldn", "show", "showed", "shown",
    "showns", "shows", "significant", "significantly", "similar", "similarly", "since", "six", "slightly", "small", "some",
    "somebody", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon",
    "sorry", "special", "specifically", "specified", "specify", "specifying", "still", "stop", "strongly", "sub",
    "substantially", "successfully", "sufficiently", "suggest", "summat", "sup", "supposing", "sure", "t's", "take",
    "taken", "taking", "tell", "tends", "th", "thank", "thanks", "thanx", "that'll", "that's", "that've", "thats", "thee",
    "theirs", "themselves", "thence", "there'll", "there's", "there've", "thereafter", "thereby", "thered", "therefore",
    "therein", "thereof", "therere", "theres", "thereto", "thereupon", "they", "they'd", "they'll", "they're", "they've",
    "theyd", "theyre", "thine", "think", "third", "tho", "thorough", "thoroughly", "thou", "though", "thoughh", "thousand",
    "three", "thro", "throug", "throughout", "thru", "thus", "thyself", "til", "till", "tip", "today", "together", "took",
    "touching", "toward", "towards", "tried", "tries", "true", "truly", "try", "trying", "ts", "twas", "tween", "twere",
    "twice", "twill", "twixt", "two", "twould", "u", "un", "underneath", "unfortunately", "unless", "unlike", "unlikely",
    "unto", "upon", "ups", "us", "use", "used", "useful", "usefully", "usefulness", "uses", "using", "usually", "v", "value",
    "various", "versus", "via", "vice", "vis-a-vis", "viz", "vol", "vols", "vs", "w", "wanna", "want", "wanting", "wants",
    "wasn", "wasnt", "way", "we'd", "we'll", "we're", "we've", "wed", "welcome", "well", "went", "weren", "werent", "wert",
    "what'll", "what's", "whatever", "whats", "when's", "whence", "whencesoever", "whenever", "where's", "whereafter",
    "whereas", "whereby", "wherein", "wheres", "whereupon", "wherever", "whether", "whichever", "whichsoever", "whilst",
    "whim", "whither", "who'll", "who's", "whod", "whoever", "whole", "whomever", "whore", "whos", "whose", "whoso",
    "whosoever", "why's", "widely", "willing", "wish", "within", "without", "wonder", "wont", "words", "world", "would",
    "wouldn't", "wouldnt", "wouldst", "www", "x", "ye", "yes", "yet", "you'd", "you're", "youd", "youre", "yourself", "z", "zero"
]
# Merge the hand-written words, the punctuation marks and NLTK's English stop words,
# de-duplicating through a set (so the order of the resulting list is arbitrary).
stop_words_extended = list({
    *stop_words_extended,
    *punctuation,
    *stopwords.words('english'),
})

---

##### Load and clean the 1.6M-tweet dataset

In [4]:
# Load the raw tweets. The six column names are supplied explicitly through `names`,
# and the file is decoded as ISO-8859-1.
tweets_df = pd.read_csv('Data/tweets.csv', encoding="ISO-8859-1",
                        names=["sentiment", "tweet_id", "created_at", "query", "username", "text"])

In [5]:
def convert_date(date):
    """Parse a tweet timestamp such as ``"Mon Apr 06 22:19:45 PDT 2009"``.

    The timezone abbreviation between the time and the year is discarded (any
    abbreviation, not only ``PDT``), so the result is a naive ``datetime``.

    Args:
        date: Timestamp string in ``%a %b %d %H:%M:%S [TZ] %Y`` form, or a value
            that is already a ``datetime`` (returned unchanged, which keeps the
            conversion cell safe to re-run).

    Returns:
        The parsed naive ``datetime``.

    Raises:
        ValueError: If the string does not match the expected layout.
    """
    if isinstance(date, datetime):
        return date

    # The format string below has no timezone directive, so remove the
    # alphabetic token that sits directly before the trailing four-digit year.
    without_tz = re.sub(r"\s+[A-Za-z]{2,5}(?=\s+\d{4}\s*$)", "", date.strip())
    return datetime.strptime(without_tz, "%a %b %d %H:%M:%S %Y")

In [6]:
# Parse the timestamp strings, then order the rows by user and, within each user,
# by tweet time, renumbering the index from zero.
tweets_df = (
    tweets_df
    .assign(created_at=tweets_df['created_at'].apply(convert_date))
    .sort_values(["username", "created_at"])
    .reset_index(drop=True)
)

In [7]:
# One row per username holding the number of non-null tweet_id / created_at values.
user_tweet_counts = (
    tweets_df
    .groupby('username')[['tweet_id', 'created_at']]
    .count()
    .reset_index()
)

In [8]:
# Usernames with at least N tweets, one list per activity threshold.
users_50 = list(user_tweet_counts['username'][user_tweet_counts['tweet_id'] >= 50])
users_70 = list(user_tweet_counts['username'][user_tweet_counts['tweet_id'] >= 70])
users_100 = list(user_tweet_counts['username'][user_tweet_counts['tweet_id'] >= 100])
# user_tweets_180() reads `users_180`, which was never defined and would raise a
# NameError when called; build it the same way as the other thresholds.
users_180 = list(user_tweet_counts['username'][user_tweet_counts['tweet_id'] >= 180])

In [9]:
def user_tweets_50(username):
    """Return 1 when ``username`` is listed in ``users_50``, otherwise 0."""
    return 1 if username in users_50 else 0

def user_tweets_70(username):
    """Return 1 when ``username`` is listed in ``users_70``, otherwise 0."""
    return 1 if username in users_70 else 0

def user_tweets_100(username):
    """Return 1 when ``username`` is listed in ``users_100``, otherwise 0."""
    return 1 if username in users_100 else 0

def user_tweets_180(username):
    """Return 1 when ``username`` is listed in ``users_180``, otherwise 0.

    Relies on a module-level ``users_180`` list being defined before the call.
    """
    return 1 if username in users_180 else 0

In [10]:
# Flag every tweet with 1/0 depending on whether its author reaches the 50-, 70- and
# 100-tweet thresholds. Series.isin performs the membership test in one vectorised
# pass instead of scanning the Python list once per row through .apply; the result
# is the same 0/1 int64 column.
tweets_df['_50'] = tweets_df['username'].isin(users_50).astype('int64')
tweets_df['_70'] = tweets_df['username'].isin(users_70).astype('int64')
tweets_df['_100'] = tweets_df['username'].isin(users_100).astype('int64')

In [11]:
# Drop rows that are exact duplicates across every column (the index is not reset).
# NOTE(review): this runs after the per-user tweet counts were computed, so those
# counts — and the _50/_70/_100 flags derived from them — still include any
# duplicate rows. Confirm that is intended.
tweets_df=tweets_df.drop_duplicates()

***
##### Pre-process the tweets: clean the text and record sentiment scores for each tweet

In [12]:
# Single shared VADER analyzer instance; preprocess() uses it to score every tweet.
analyzer = SentimentIntensityAnalyzer()


# Code-point ranges and single characters removed by deEmojify(); they are spliced
# into one regex character class. Several ranges are very broad (for example
# U+24C2-U+1F251 and U+10000-U+10FFFF), so far more than emoji is matched.
_EMOJI_RANGES = (
    u"\U0001F600-\U0001F64F",
    u"\U0001F300-\U0001F5FF",
    u"\U0001F680-\U0001F6FF",
    u"\U0001F1E0-\U0001F1FF",
    u"\U00002500-\U00002BEF",
    u"\U00002702-\U000027B0",
    u"\U00002702-\U000027B0",
    u"\U000024C2-\U0001F251",
    u"\U0001f926-\U0001f937",
    u"\U00010000-\U0010ffff",
    u"\u2640-\u2642",
    u"\u2600-\u2B55",
    u"\u200d",
    u"\u23cf",
    u"\u23e9",
    u"\u231a",
    u"\ufe0f",
    u"\u3030",
)
_EMOJI_PATTERN = re.compile("[" + "".join(_EMOJI_RANGES) + "]+", flags=re.UNICODE)


def deEmojify(text):
    """Return ``text`` with every run of characters in the emoji ranges deleted.

    Args:
        text: The string to filter.

    Returns:
        The string with all matched characters removed (nothing is inserted in
        their place).
    """
    return _EMOJI_PATTERN.sub(r'', text)


# Token -> replacement table used by de_abbreviate().
_ABBREVIATIONS = {
    'u': 'you',
    'r': 'are',
    'some1': 'someone',
    'yrs': 'years',
    'hrs': 'hours',
    'mins': 'minutes',
    'secs': 'seconds',
    'pls': 'please',
    'plz': 'please',
    '2morow': 'tomorrow',
    '2moro': 'tomorrow',
    '2day': 'today',
    '4got': 'forget',
    '4gotten': 'forget',
    'hahah': 'haha',
    'hahaha': 'haha',
    'hahahaha': 'haha',
    "mother's": "mother",
    "mom's": "mom",
    "dad's": "dad",
    'bday': 'birthday',
    'b-day': 'birthday',
    'lmao': 'lol',
    'lolz': 'lol',
    'rofl': 'lol',
    '<3': 'love',
    'thanx': 'thanks',
    'thnx': 'thanks',
    'goood': 'good',
    # HTML-entity leftovers and ellipsis fragments collapse to a single space.
    'amp': ' ',
    'quot': ' ',
    'lt': ' ',
    'gt': ' ',
    '½25': ' ',
    '..': ' ',
    '. .': ' ',
    '. . .': ' ',
    '...': ' ',
}

# Contractions that simply lose their apostrophe.
_APOSTROPHE_CONTRACTIONS = frozenset([
    "i'm", "don't", "can't", "couldn't", "aren't", "wouldn't", "isn't", "didn't", "hadn't",
    "doesn't", "won't", "haven't", "wasn't", "hasn't", "shouldn't", "ain't", "they've",
])


def de_abbreviate(token):
    """Expand a single chat abbreviation or normalise a contraction.

    The comparison is exact and case-sensitive.

    Args:
        token: One token from a tweet.

    Returns:
        The mapped replacement, the token without its apostrophe for the listed
        contractions, or the token unchanged when nothing matches.
    """
    if token in _APOSTROPHE_CONTRACTIONS:
        return token.replace("'", "")
    return _ABBREVIATIONS.get(token, token)


# (short form, expansion) pairs applied in this order by de_slang(). Order matters:
# "she's" has to be expanded before "he's", which is a substring of it.
_SLANG_EXPANSIONS = (
    ("idk", "i dont know"),
    ("i'll", "i will"),
    ("you'll", "you will"),
    ("we'll", "we will"),
    ("it'll", "it will"),
    ("it's", "it is"),
    ("i've", "i have"),
    ("you've", "you have"),
    ("we've", "we have"),
    ("they've", "they have"),
    ("you're", "you are"),
    ("we're", "we are"),
    ("they're", "they are"),
    ("let's", "let us"),
    ("she's", "she is"),
    ("he's", "he is"),
    ("that's", "that is"),
    ("i'd", "i would"),
    ("you'd", "you would"),
    ("there's", "there is"),
    ("what's", "what is"),
    ("how's", "how is"),
    ("who's", "who is"),
    ("y'all", "you all"),
    ("ya'll", "you all"),
)


def de_slang(tweet):
    """Expand common contractions and slang in ``tweet``.

    Each pair is applied as a plain, case-sensitive substring replacement, so
    capitalised forms such as "I'll" are left untouched.

    Args:
        tweet: The tweet text.

    Returns:
        The text with every listed short form replaced by its expansion.
    """
    for short_form, expansion in _SLANG_EXPANSIONS:
        tweet = tweet.replace(short_form, expansion)
    return tweet


_URL_RE = re.compile(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*')
_MENTION_RE = re.compile(r"@\w+")
_HASHTAG_RE = re.compile(r"#\w+")
# A number, optionally continued by hyphen-joined numbers ("555-1234"). A hyphen
# between letters is deliberately NOT matched, so "well-being" stays intact.
_NUMBER_RE = re.compile(r"\d+(?:-\d+)*")


def _segment_hashtag(match):
    """Turn one ``#hashtag`` regex match into its space-separated word segments."""
    token = match.group(0)[1:].translate(str.maketrans('', '', string.punctuation))
    return ' '.join(segment(token))


def preprocess_text(tweet):
    """Clean one raw tweet into a lower-cased, space-separated token string.

    Steps, in order: join multi-word seed terms with underscores, strip a leading
    retweet prefix, remove URLs, expand slang, drop non-printable characters, remove
    @mentions, split hashtags into words, replace punctuation (all of
    ``punctuation``, i.e. everything except "-" and "_") with spaces, replace
    numbers with a ``NUM`` placeholder, strip emoji, then tokenise and drop stop
    words, listed emoticons and one-character tokens.

    Args:
        tweet: The raw tweet text.

    Returns:
        The cleaned tweet as a single string.
    """
    # replace seeds (as phrases) to unigrams.
    for seed in seed_terms_col:
        if " " in seed and seed in tweet:
            tweet = tweet.replace(seed, seed.replace(" ", "_"))

    # remove retweet handler ("RT @user: ..."). A tweet that starts with "RT" but
    # has no colon is left as is instead of raising ValueError.
    if tweet[:2] == "RT":
        colon = tweet.find(":")
        if colon != -1:
            tweet = tweet[colon + 2:]

    # remove url from tweet
    tweet = _URL_RE.sub('', tweet)

    # remove short notations
    tweet = de_slang(tweet)

    # remove non-ascii characters
    tweet = ''.join(ch for ch in tweet if ch in printable)

    # additional preprocessing
    tweet = tweet.replace("\n", " ").replace(" https", "").replace("http", "")

    # remove all mentions. One regex pass avoids the str.replace pitfall where
    # removing "@al" first also mangled "@alice".
    tweet = _MENTION_RE.sub("", tweet)

    # split every hashtag into its words; each occurrence is handled on its own so
    # "#sad" can no longer corrupt "#sadness".
    tweet = _HASHTAG_RE.sub(_segment_hashtag, tweet)

    # remove all punctuations. The marks are escaped so that "\" and "]" are real
    # members of the character class (unescaped, "\]" swallowed the backslash).
    tweet = re.sub("[" + re.escape("".join(punctuation)) + "]+", " ", tweet)

    # remove trailing spaces
    tweet = tweet.strip()

    # replace numbers with a space-padded NUM placeholder in a single pass, so
    # literal "NUM" text inside a word is not split apart.
    tweet = _NUMBER_RE.sub(' NUM ', tweet)

    # remove emoticons
    tweet = deEmojify(tweet)

    # remove all stop words or emojis
    tweet = " ".join(
        de_abbreviate(word.lower())
        for word in tweet_token.tokenize(tweet)
        if word.lower() not in stop_words_extended
        and word.lower() not in emojies
        and len(word) > 1
    )

    # remove multiple spaces
    tweet = re.sub(' +', ' ', tweet)

    return tweet


def preprocess(tweets):
    """Clean every tweet and score its sentiment before and after cleaning.

    Args:
        tweets: DataFrame with ``tweet_id``, ``created_at`` and ``text`` columns.

    Returns:
        A new DataFrame with one row per input row: the three input columns, the
        cleaned text, and TextBlob polarity plus VADER compound/pos/neu/neg scores
        for both the raw (``*_raw``) and the cleaned (``*_cleaned``) text.
    """
    processed_tweets = []

    # total= lets tqdm show a percentage and ETA instead of a bare counter.
    for _, tweet in tqdm.tqdm(tweets.iterrows(), total=len(tweets)):
        raw_text = tweet['text']
        cleaned_text = preprocess_text(raw_text)

        # polarity_scores() returns all four VADER scores in one dict, so analyse
        # each text once instead of once per score.
        raw_vader = analyzer.polarity_scores(raw_text)
        cleaned_vader = analyzer.polarity_scores(cleaned_text)

        processed_tweets.append([
            tweet['tweet_id'], tweet['created_at'], raw_text, cleaned_text,
            TextBlob(raw_text).sentiment.polarity,
            raw_vader['compound'], raw_vader['pos'], raw_vader['neu'], raw_vader['neg'],
            TextBlob(cleaned_text).sentiment.polarity,
            cleaned_vader['compound'], cleaned_vader['pos'], cleaned_vader['neu'], cleaned_vader['neg'],
        ])

    return pd.DataFrame(processed_tweets, columns=[
        'tweet_id', 'created_at', 'text', 'cleaned_text',
        'polarity_raw', 'vader_compound_raw', 'vader_pos_raw', 'vader_neu_raw', 'vader_neg_raw',
        'polarity_cleaned', 'vader_compound_cleaned', 'vader_pos_cleaned', 'vader_neu_cleaned', 'vader_neg_cleaned',
    ])

In [13]:
# Clean and sentiment-score every tweet; only the three columns that preprocess()
# reads are passed in.
preprocessed_tweets = preprocess(tweets_df[["tweet_id", "created_at", "text"]])

6838it [00:14, 456.47it/s]


***
##### Merge the usernames and tweet-count flags back onto the preprocessed tweets

In [14]:
# Re-attach each tweet's username and tweet-count flags (matched on tweet_id and
# created_at), drop the duplicate rows the join can produce, and order the result
# by user and time.
preprocessed_tweets = (
    preprocessed_tweets
    .merge(
        tweets_df[["tweet_id","created_at","username","_50","_70", "_100"]],
        on=["tweet_id",'created_at'],
    )
    .drop_duplicates()
    .sort_values(["username", "created_at"])
    .reset_index(drop=True)
)

In [15]:
# Persist the cleaned, scored tweets as CSV: header row written, index omitted.
preprocessed_tweets.to_csv('Data/tweets_cleaned.csv', header=True, index=False)