# Imports

In [1]:
%pip install tweepy --upgrade

import tweepy
import random
import configparser
import numpy as np
import re
import time
import html
from termcolor import colored
from collections import Counter

Note: you may need to restart the kernel to use updated packages.


# Download and Clean Tweets

Load tweepy client and read api keys

In [3]:
config = configparser.ConfigParser()
config.read('config.ini')
tokens = config['Tokens']
client = tweepy.Client(
  bearer_token=tokens['BearerToken'],
  consumer_key=tokens['ConsumerKey'],
  consumer_secret=tokens['ConsumerSecret'],
  access_token=tokens['AccessToken'],
  access_token_secret=tokens['AccessTokenSecret'],
)

Download tweets of followed users

In [4]:
tweets = []

# get user ids of a specified user's followed accounts
root_user = client.get_users(usernames='redlettermedia').data[0] # twitter account of one of my favorite YouTube channels, Red Letter Media
following = client.get_users_following(root_user['id'], max_results=1000).data
users = [(user['username'], user['id']) for user in following]
users.append((root_user['username'], root_user['id']))

# get tweets for each user id
for i, (username, user_id) in enumerate(users):
  print(f'Retrieving @{username}\'s tweets ({i+1}/{len(users)})............', end='\r')
  users_tweets = []
  until_id = None
  repeat_count = 0
  while repeat_count < 50:
    try:
      users_tweets = client.get_users_tweets(
        user_id,
        exclude=['retweets', 'replies'], 
        max_results=100,
        until_id=until_id,
      )
      if users_tweets[0]:
        until_id = users_tweets[0][-1]['id']
        tweets += list(map(lambda x: x['text'], users_tweets[0]))
        repeat_count += 1
      else:
        break
    except:
      print(f'Waiting for cooldown to end ({i+1}/{len(users)})............', end='\r')
      time.sleep(30)

np.save('tweets.npy', tweets)
print(f'\nTotal tweets retrieved: {len(tweets)}')

Retrieving @redlettermedia's tweets (615/615).............
Total tweets retrieved: 299759


Print subset of tweets

In [15]:
def print_tweets(tweets, indices):
    for tweet in np.array(tweets)[indices]:
        if len(tweet) > 0:
            print(tweet)
            print()

random_indices = list(np.random.choice(len(tweets), size=10, replace=False))

print_tweets(tweets, random_indices)

How many do you remember?https://t.co/mp8Cvwgh94

Alright which one of you degenerates is trying to log into my Fortnite account?

It's #MogwaiMonday! RT this if you wish you got a Gizmo for Xmas!!

#GREMLINS #Forever #gizmowish #gremlins3 ? https://t.co/qFQrCuz6sq

https://t.co/phfWHXxDze

My mom's face when I introduced her to #TheBoysTV https://t.co/6YOjJ1NgUS

Grindcore: Mad at Zimmer
Galaxy Gate: Mad at Rick
Castles in the Sky: Mad at Kirk
Furious: Mad at Everybody

Mixed Messages is now available to all Surly markets. https://t.co/RsE0hvoLRB

I notice they didn’t put ass on the list…😉 https://t.co/7iXFyJDSLW

Alright it's time, I'm finally going to learn clip studio paint.  Anyone recommend any tutorials they've used?

https://t.co/Ts2S6UvQlq https://t.co/E7cjLI9mWi

Boom goes the dynamite. Big reveal, @RickandMorty fans. I’ll be appearing at @IndyPopCon on Saturday, June 8th in Indianapolis.  Let’s hang out!
#popcon #popconindy #me #rickandmorty #indysquanchcon #picklerick #lemo

Clean up tweets for n-gram model

In [17]:
def clean_tweet(tweet):
  tweet = remove_links(tweet)
  tweet = html.unescape(tweet)
  tweet = tweet.replace('…', '...') # replace ellipsis symbol
  tweet = re.sub(r'[’‘]', '\'', tweet) # replace nonstandard single quotes
  tweet = re.sub(r'[\{\}\[\]\(\)"“”]', '', tweet) # remove brackets, parenthesis, and quotes
  tweet = re.sub(r'([^\w\s@#_/\'-])', r' \1 ', tweet) # separate punctuation
  tweet = re.sub(r'\s+', ' ', tweet) # replace excessive whitespace
  tweet = tweet.lower()
  return tweet

def remove_links(tweet):
  split_tweet = tweet.split()
  split_tweet = [split_str for split_str in split_tweet if not is_link(split_str)]
  tweet = ' '.join(split_tweet)
  return tweet

def is_link(string):
  return 'https://' in string or 'http://' in string

cleaned_tweets = [clean_tweet(tweet) for tweet in tweets]
print_tweets(cleaned_tweets, random_indices)

cleaned_tweets = list(set([tweet for tweet in cleaned_tweets if len(tweet) > 0]))
np.save('cleaned_tweets.npy', cleaned_tweets)
print(f'----\n\nTotal tweets after cleaning: {len(cleaned_tweets)}')

how many do you

alright which one of you degenerates is trying to log into my fortnite account ? 

it's #mogwaimonday ! rt this if you wish you got a gizmo for xmas ! ! #gremlins #forever #gizmowish #gremlins3 ? 

my mom's face when i introduced her to #theboystv

grindcore : mad at zimmer galaxy gate : mad at rick castles in the sky : mad at kirk furious : mad at everybody mixed messages is now available to all surly markets . 

i notice they didn't put ass on the list . . . 😉 

alright it's time , i'm finally going to learn clip studio paint . anyone recommend any tutorials they've used ? 

boom goes the dynamite . big reveal , @rickandmorty fans . i'll be appearing at @indypopcon on saturday , june 8th in indianapolis . let's hang out ! #popcon #popconindy #me #rickandmorty #indysquanchcon #picklerick #lemongrab #troversavestheuniverse

----

Total tweets after cleaning: 280388


# Create N-gram Model

$P(token_i \mid token_{i-n+1:i-1})=\dfrac{Count(token_{i-n+1:i})}{Count(token_{i-n+1:i-1})}$

In [18]:
START = '<SOT>'
STOP = '<EOT>'

class TweetModel:
  def __init__(self, tweets, n):
    self.n = n
    self.counts = Counter()
    self.context_counts = Counter()
    self.process_tweets(tweets)

  def process_tweets(self, tweets):
    tokenized_tweets = [tweet.split() for tweet in tweets]
    for tweet in tokenized_tweets:
      if len(tweet) > 0:
        tweet_ngrams = self.create_ngrams(tweet)
        for ngram in tweet_ngrams:
          self.counts[ngram] += 1
    for (context, _), count in self.counts.items():
      self.context_counts[context] += count

  def random_tweet(self, format=True):
    start_context = [START] * (self.n - 1)
    curr_context = start_context
    tweet, probs = [], []
    probs = []
    while True:
      token, prob = self.random_token(tuple(curr_context))
      if token == STOP:
        break
      tweet.append(token)
      probs.append(prob)
      curr_context.pop(0)
      curr_context.append(token)
      if len(' '.join(tweet)) > 280:
        curr_context = start_context
        tweet, probs = [], []
    if format:
      tweet = ' '.join(tweet) + ' '
      tweet = re.sub(r'(\S) ([\?!:;,\.])', r'\1\2', tweet) # line needs to be repeated twice to work properly
      tweet = re.sub(r'(\S) ([\?!:;,\.])', r'\1\2', tweet)
      tweet.strip()
    return tweet, probs

  def random_token(self, context):
    tokens = [token for (context1, token), _ in self.counts.items() if context == context1]
    rand = random.random()
    random_token = None
    total = 0
    for token in tokens:
      random_token = token
      total += self.counts[(context, token)] / self.context_counts[context]
      if total > rand:
        break
    return random_token, self.counts[(context, random_token)] / self.context_counts[context]

  def create_ngrams(self, tokens):
    ngrams = []
    for i in range(len(tokens) + 1):
        context = []
        for j in range(self.n - 1, 0, -1):
            if i - j < 0:
                context.append(START)
            else:
                context.append(tokens[i - j])
        if i == len(tokens):
            ngrams.append((tuple(context), STOP))
        else:
            ngrams.append((tuple(context), tokens[i]))
    return tuple(ngrams)

  def print_counts(self, n):
    print(f'TOTAL UNIQUE NGRAMS: {len(self.counts)}')
    print(f'TOP {n} NGRAMS:')
    counts = self.counts.items()
    counts = sorted(counts, key=lambda x: x[1], reverse=True)
    for count in counts[:n]:
      print(count)

Trigram model:

$P(token_i \mid token_{i-2:i-1})=\dfrac{Count(token_{i-2:i})}{Count(token_{i-2:i-1})}$

e.g. $P(\text{is}\mid\text{my name})=\dfrac{Count(\text{my name is})}{Count(\text{my name})}$

In [19]:
tweets = np.load('cleaned_tweets.npy')

model = TweetModel(tweets=tweets, n=3)
model.print_counts(20)

TOTAL UNIQUE NGRAMS: 3610235
TOP 20 NGRAMS:
((('.', '.'), '.'), 32131)
((('<SOT>', '<SOT>'), 'the'), 13361)
((('<SOT>', '<SOT>'), 'i'), 12705)
((('.', '.'), '<EOT>'), 10179)
((('!', '!'), '!'), 9405)
((('<SOT>', '<SOT>'), 'this'), 6856)
((('!', '!'), '<EOT>'), 4602)
((('<SOT>', '<SOT>'), "it's"), 4404)
((('<SOT>', '<SOT>'), 'we'), 4089)
((('<SOT>', '<SOT>'), 'a'), 3938)
((('<SOT>', '<SOT>'), '.'), 3608)
((('<SOT>', '<SOT>'), 'happy'), 3548)
((('<SOT>', '<SOT>'), 'if'), 3261)
((('<SOT>', '<SOT>'), 'my'), 3054)
((('<SOT>', '<SOT>'), "i'm"), 2788)
((('<SOT>', 'this'), 'is'), 2578)
((('<SOT>', '<SOT>'), 'just'), 2455)
((('<SOT>', '<SOT>'), 'you'), 2387)
((('<SOT>', '<SOT>'), 'what'), 2350)
((('<SOT>', '<SOT>'), 'new'), 2312)


# Generate tweets

In [20]:
def print_tweet(tweet):
  print('+------------------------------------------------------------+')
  print('| +-------+                                                  |')
  print('| |  ! !  | N-Gram Bot @ngrambot - 12h                       |')
  print('| | [O_O] |                                                  |')
  print('| |  | |  |                                                  |')
  print('| +-------+                                                  |')
  print('|                                                            |')

  for line in tweet_to_lines(tweet):
    padding = ' ' * (59 - len(line))
    print('| ' + line + padding + '|')

  print('|                                                            |')
  print('| <3 31.4k                                                   |')
  print('+------------------------------------------------------------+')

def tweet_to_lines(tweet):
  lines = []
  curr_line = ''
  for token in tweet.split():
    if len(curr_line) + len(token) + 1 <= 59:
      curr_line += token + ' '
    else:
      lines.append(curr_line)
      curr_line = token + ' '
  lines.append(curr_line)
  return lines

for i in range(10):
  random_tweet, _ = model.random_tweet()
  print_tweet(random_tweet)

+------------------------------------------------------------+
| +-------+                                                  |
| |  ! !  | N-Gram Bot @ngrambot - 12h                       |
| | [O_O] |                                                  |
| |  | |  |                                                  |
| +-------+                                                  |
|                                                            |
| the academy awards.....                                    |
|                                                            |
| <3 31.4k                                                   |
+------------------------------------------------------------+
+------------------------------------------------------------+
| +-------+                                                  |
| |  ! !  | N-Gram Bot @ngrambot - 12h                       |
| | [O_O] |                                                  |
| |  | |  |                                            

Display token probabilities for randomly generated tweet

In [24]:
def token_color(token_prob):
  if token_prob >= 0.75:
    return 'green'
  elif token_prob >= 0.5:
    return 'cyan'
  elif token_prob >= 0.25:
    return 'yellow'
  else:
    return 'red'

random_tweet, probs = model.random_tweet(format=False)

print('Legend')
print(colored('High Likelihood [75%, 100%]', token_color(0.75)))
print(colored('Medium-High Likelihood [50%, 75%)', token_color(0.5)))
print(colored('Low-Medium Likelihood [25%, 50%)', token_color(0.25)))
print(colored('Low Likelihood [0%, 25%)', token_color(0.0)))
print()

for token, prob in zip(random_tweet, probs):
  print(colored(token, token_color(prob)), end=' ')

Legend
[32mHigh Likelihood [75%, 100%][0m
[36mMedium-High Likelihood [50%, 75%)[0m
[33mLow-Medium Likelihood [25%, 50%)[0m
[31mLow Likelihood [0%, 25%)[0m

[31mwilly[0m [36mwonka[0m [33mand[0m [32mthe[0m [31mcontext[0m [31m;[0m [32mthe[0m [31mbeatles[0m [31m.[0m [31mbut[0m [31mthis[0m [31mwas[0m [31mone[0m [32mof[0m [33mthe[0m [31mdiary[0m [36mof[0m [32ma[0m [31msignificant[0m [31mrole[0m [32min[0m [31msteven[0m [32mspielberg's[0m [32m#westsidestory[0m [31mtimeless[0m [32m.[0m 