In [1]:
import pandas as pd, numpy as np
import json
from nltk.tokenize import TweetTokenizer

Let's first read in the data. Our data is borrowed from https://github.com/AshwanthRamji/Depression-Sentiment-Analysis-with-Twitter-Data and consists of parsed tweets, stored as one JSON object per line. We keep only the tweets labeled with negative sentiment (a sentiment value of -1).

In [2]:
# Sentiment labels indexed by tweet id; only tweets labeled -1 are kept below.
sentiment = pd.read_csv("data/sentiment.csv").set_index("id")["sentiment"]
tweets = []
# A context manager guarantees the file handle is closed, even if a line fails
# to parse. JSON text is UTF-8, so decode it explicitly instead of relying on
# the platform's default encoding.
with open("data/tweetdata.txt", encoding="utf-8") as f:
    # Stream the file line by line rather than loading it all via readlines().
    for line in f:
        line = line.strip()
        if len(line) == 0:
            continue
        curr_data = json.loads(line)
        id_ = int(curr_data["id_str"])
        # `in` on a Series tests the index, i.e. whether this id has a label.
        if id_ in sentiment and sentiment[id_] == -1:
            tweet = curr_data["text"].lower()
            # Manually filter spam: drop only tweets that mention BOTH "spinner"
            # and "hand"; a tweet containing just one of the words is kept.
            if not ("spinner" in tweet and "hand" in tweet):
                tweets.append(tweet)

# Deduplicate; np.unique also returns the tweets sorted, as a NumPy array.
tweets = np.unique(tweets)

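Let's now tokenize all of the tweets. Each token list is wrapped in `<s>` and `</s>` markers so that the n-grams record where a tweet starts and ends.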

In [3]:
# TweetTokenizer comes from nltk.tokenize (imported in the first cell).
tk = TweetTokenizer()
# Surround each tweet's tokens with "<s>" / "</s>" boundary markers.
tokenized = [["<s>", *tk.tokenize(text), "</s>"] for text in tweets]

In [4]:
# Tallies every n-token window found in the given tokenized tweets
def get_ngram_counts(tweets, n):
    """Count the n-grams occurring in a collection of token sequences.

    `tweets` is an iterable of token lists and `n` is the window size. Every
    run of `n` consecutive tokens is joined with single spaces to form a key.
    Returns a dict mapping each such key to the number of times it occurs;
    sequences shorter than `n` contribute nothing.
    """
    tally = {}
    for tokens in tweets:
        last_start = len(tokens) - n
        start = 0
        # Slide the window across the sequence one token at a time.
        while start <= last_start:
            key = " ".join(tokens[start:start + n])
            if key in tally:
                tally[key] += 1
            else:
                tally[key] = 1
            start += 1
    return tally

For this demo, we will be using n = 3 for our ngrams. Let's compute the counts of ngrams and print out some of the most prevalent ones.

In [5]:
# Trigram counts over the whole tokenized corpus.
counts = get_ngram_counts(tokenized, 3)
for gram, count in counts.items():
    # Skip anything seen 10 times or fewer; only frequent trigrams are shown.
    if count <= 10:
        continue
    print(gram, count)

get rid of 13
depression . </s> 21
8 depression myths 12
depression myths we 12
myths we need 12
we need to 13
need to stop 12
to stop believing 12
stop believing ... 12
the great depression 12
my anxiety is 19
mental health issues 12
so much anxiety 14
stress and anxiety 12
, depression , 11
<s> how to 16
: me : 11
<s> my anxiety 18


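Let's convert the n-gram counts to a DataFrame and split it in two: the n-grams that start a tweet (those beginning with `<s>`), whose counts we normalize into probabilities, and all other n-grams.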

In [6]:
# One row per distinct n-gram together with its raw occurrence count.
ngrams_df = pd.DataFrame(list(counts.items()), columns=["ngram", "count"])

# N-grams that open a tweet; their counts are rescaled so the column sums to 1.
start_ngrams = ngrams_df.loc[ngrams_df["ngram"].str.startswith("<s>")].copy()
start_ngrams["count"] = start_ngrams["count"] / start_ngrams["count"].sum()

# Every remaining n-gram, with counts left unnormalized.
other_ngrams = ngrams_df.loc[~ngrams_df["ngram"].str.startswith("<s>")].copy()

We can now write our tweet generation function.

In [7]:
def gen_tweet(start_ngrams, other_ngrams, n, gen_count):
    """Generate and print `gen_count` random tweets from an n-gram model.

    Each tweet is seeded with an n-gram sampled from `start_ngrams` and then
    extended one token at a time: the last `n - 1` tokens form the context, and
    the next token is the final token of an n-gram sampled from the rows of
    `other_ngrams` that begin with that context, weighted by their counts.
    Generation stops once "</s>" is produced; the tweet is printed without its
    first and last tokens. Nothing is returned.

    Parameters
    ----------
    start_ngrams : DataFrame with "ngram" and "count" columns. "count" is
        handed to np.random.choice as probabilities, so it must sum to 1.
    other_ngrams : DataFrame with "ngram" and "count" columns. Counts are
        normalized here, per context, so they may be raw counts.
    n : int, the n-gram order (at least 1).
    gen_count : int, number of tweets to generate.

    Raises
    ------
    ValueError
        If `n` is less than 1, or if no row of `other_ngrams` continues the
        current context.
    """
    if n < 1:
        raise ValueError(f"n must be at least 1, got {n}")

    # Number of trailing tokens used as context. Computed once so that n == 1
    # yields an empty context instead of the slice tweet[0:], which would
    # wrongly use the entire tweet as the context.
    context_len = n - 1
    ngram_col = other_ngrams["ngram"]  # loop-invariant column lookup

    for _ in range(gen_count):
        tweet = np.random.choice(start_ngrams["ngram"], p=start_ngrams["count"]).split(" ")

        while tweet[-1] != "</s>":
            if context_len:
                context = " ".join(tweet[-context_len:])
                # The trailing space makes the match end on a token boundary.
                candidates = other_ngrams[ngram_col.str.startswith(context + " ")]
            else:
                # Unigram model: every n-gram is a valid continuation.
                context = ""
                candidates = other_ngrams

            if candidates.empty:
                # Fail with a clear message rather than letting
                # np.random.choice choke on an empty candidate set.
                raise ValueError(f"no n-gram continues the context {context!r}")

            weights = candidates["count"].to_numpy(dtype=float)
            curr = np.random.choice(candidates["ngram"].to_numpy(), p=weights / weights.sum())
            # Only the last token of the sampled n-gram is new.
            tweet.append(curr.split(" ")[-1])

        print(" ".join(tweet[1:-1]))

In [8]:
gen_tweet(start_ngrams, other_ngrams, 3, 10)

rt @alyciatyre : my heart goes out to those of us whose anxiety has gone from crippling to an accelerating vomit / shit / deathlike ever since …
depression , but i don't wish them upon anybody .
que anxiety ni que la chingada ponte a limpiar 😂 😂 😋 by alot of using their minds 😂 😂
i have depression " just because i'm thinking of something more important to me feeling so tired and sick lately i haven't been able to go in @checkpointorg ' s kami dvorakova …
post 1am depression twitter https://t.co/ibmpn4kn8l
rt @kbelliard_ : nothing hurts more than depression , they're not thinking about life , you know what's really fun about bipolar disorder - anxiety is acting so badly rn lmao
rt @playstationau : 24 hours to go to a therapist every month and always have panic attacks and anxiety https://t.co/gmhp3ldode
my parents feel like there's something missing and that's why my depression ? https://t.co/tg3oxxndgl
talkin bout cancer 😩 😩 😩 😩 😩 😩 😩 😩 😩
i don't wanna talk until tomorrow 🎶
