# Imports

In [1]:
%pip install tweepy --upgrade

import tweepy
import random
import configparser
import numpy as np
import re
import time
from collections import Counter

Note: you may need to restart the kernel to use updated packages.


# Download and Clean Tweets

Load tweepy client and read api keys

In [2]:
config = configparser.ConfigParser()
config.read('config.ini')
tokens = config['Tokens']
client = tweepy.Client(
  bearer_token=tokens['BearerToken'],
  consumer_key=tokens['ConsumerKey'],
  consumer_secret=tokens['ConsumerSecret'],
  access_token=tokens['AccessToken'],
  access_token_secret=tokens['AccessTokenSecret'],
)

Download tweets of followed users

In [3]:
tweets = []

# get user ids of followed twitter accounts
me_id = client.get_users(usernames='pauljscottiv').data[0]['id']
following = client.get_users_following(me_id, max_results=1000).data
following = [(user['username'], user['id']) for user in following]

# get tweets for each user id
for i, (username, user_id) in enumerate(following):
  print(f'Retrieving @{username}\'s tweets ({i+1}/{len(following)})........', end='\r')
  users_tweets = []
  until_id = None
  repeat_count = 0
  while repeat_count < 50:
    try:
      users_tweets = client.get_users_tweets(
        user_id,
        exclude=['retweets', 'replies'], 
        max_results=100,
        until_id=until_id,
      )
      if users_tweets[0]:
        until_id = users_tweets[0][-1]['id']
        tweets += list(map(lambda x: x['text'], users_tweets[0]))
        repeat_count += 1
      else:
        break
    except:
      print(f'TooManyRequests Exception: Waiting for cooldown to end ({i+1}/{len(following)})...', end='\r')
      time.sleep(60)

print(f'\nTotal tweets retrieved: {len(tweets)}')

Retrieving @scottpj3's tweets (360/360)..................

Print subset of tweets

In [10]:
def print_tweets(tweets, indices):
    for tweet in np.array(tweets)[indices]:
        if len(tweet) > 0:
            print(tweet)
            print()

random_indices = list(np.random.choice(len(tweets), size=10, replace=False))

print_tweets(tweets, random_indices)

Educators, I'm launching in less than 10 days. Start the countdown with this #TeachableMoment. Have students do some of the math my team uses to launch me to Mars or to look for seismic waves. These fun lessons might just inspire the next rocket scientist: https://t.co/tckiZTuLKM https://t.co/cs771vNoZ5

i just tested positive for sitting around

call my girl pulp fiction the way ion know what’s going on but i guess there’s a gimp suit

this banger https://t.co/FzlCPGDg9m https://t.co/YeZbvN9aYV

I’m praying for our health care workers who are on the frontlines fighting the recent surge in COVID-19 cases across Georgia. 

We owe it to them to do our part by getting vaccinated and wearing masks so we can finally get this virus under control. 
https://t.co/eleUEZtbQH

ok https://t.co/CAAQGbJ1hE

It was an honor to collaborate with Cookie Monster, one of our nation’s finest journalists: http://t.co/yooR2P11aE

Taking Pets Like No Problem. https://t.co/UHzqS77o7K

BREAKING: Mass arrests to

Save tweets

Clean up tweets for n-gram model

In [11]:
def clean_tweet(tweet):
  tweet = remove_links(tweet)
  tweet = re.sub(r'[\{\}\[\]\(\)"“”]', '', tweet)
  tweet = tweet.replace('’', "'").replace('…', '...').replace('&amp;', '&')
  tweet = tweet.lower()
  tweet = re.sub(r'([^\w\s@#_/\'-])', r' \1 ', tweet)
  tweet = re.sub(r'\s+', ' ', tweet)
  return tweet

def remove_links(tweet):
  split_tweet = tweet.split()
  split_tweet = [split_str for split_str in split_tweet if split_str if 'https://' not in split_str]
  tweet = ' '.join(split_tweet)
  return tweet

cleaned_tweets = [clean_tweet(tweet) for tweet in tweets]
np.save('tweets.npy', cleaned_tweets)

print_tweets(cleaned_tweets, random_indices)

educators , i'm launching in less than 10 days . start the countdown with this #teachablemoment . have students do some of the math my team uses to launch me to mars or to look for seismic waves . these fun lessons might just inspire the next rocket scientist : 

i just tested positive for sitting around

call my girl pulp fiction the way ion know what's going on but i guess there's a gimp suit

this banger

i'm praying for our health care workers who are on the frontlines fighting the recent surge in covid-19 cases across georgia . we owe it to them to do our part by getting vaccinated and wearing masks so we can finally get this virus under control . 

ok

it was an honor to collaborate with cookie monster , one of our nation's finest journalists : http : //t . co/yoor2p11ae

taking pets like no problem . 

breaking : mass arrests today at the bureau of indian affairs in d . c . during the first occupation of the building in 50 yrs . biden admin has utterly failed to protect ancestra

# Create N-gram Model

$P(token_i \mid token_{i-n+1:i-1})=\dfrac{Count(token_{i-n+1:i})}{Count(token_{i-n+1:i-1})}$

In [2]:
START = '<SOT>'
STOP = '<EOT>'

class TweetModel:
  def __init__(self, tweets, n):
    self.n = n
    self.counts = Counter()
    self.context_counts = Counter()
    self.process_tweets(tweets)

  def process_tweets(self, tweets):
    tokenized_tweets = [tweet.split() for tweet in tweets]
    for tweet in tokenized_tweets:
      if len(tweet) > 0:
        tweet_ngrams = self.create_ngrams(tweet)
        for ngram in tweet_ngrams:
          self.counts[ngram] += 1
    for (context, _), count in self.counts.items():
      self.context_counts[context] += count

  def random_tweet(self):
    start_context = [START] * (self.n - 1)
    curr_context = start_context
    tweet = []
    while True:
      token = self.random_token(tuple(curr_context))
      if token == STOP:
        break
      if len(' '.join(tweet)) > 280:
        curr_context = start_context
      tweet.append(token)
      curr_context.pop(0)
      curr_context.append(token)
    tweet = ' '.join(tweet) + ' '
    tweet = re.sub(r'(\S) ([\?!:;,\.])', r'\1\2', tweet)
    tweet = re.sub(r'(\S) ([\?!:;,\.])', r'\1\2', tweet)
    return tweet.strip()

  def random_token(self, context):
    tokens = [token for (context1, token), _ in self.counts.items() if context == context1]
    rand = random.random()
    random_token = None
    total = 0
    for token in tokens:
      random_token = token
      total += self.counts[(context, token)] / self.context_counts[context]
      if total > rand:
        break
    return random_token

  def create_ngrams(self, tokens):
    ngrams = []
    for i in range(len(tokens) + 1):
        context = []
        for j in range(self.n - 1, 0, -1):
            if i - j < 0:
                context.append(START)
            else:
                context.append(tokens[i - j])
        if i == len(tokens):
            ngrams.append((tuple(context), STOP))
        else:
            ngrams.append((tuple(context), tokens[i]))
    return tuple(ngrams)

  def print_counts(self, n):
    print(f'TOTAL UNIQUE NGRAMS: {len(self.counts)}')
    print(f'TOP {n} NGRAMS:')
    counts = self.counts.items()
    counts = sorted(counts, key=lambda x: x[1], reverse=True)
    for count in counts[:n]:
      print(count)

Trigram model:

$P(token_i \mid token_{i-2:i-1})=\dfrac{Count(token_{i-2:i})}{Count(token_{i-2:i-1})}$

e.g. $P(\text{is}\mid\text{my name})=\dfrac{Count(\text{my name is})}{Count(\text{my name})}$

In [3]:
tweets = np.load('tweets.npy')

model = TweetModel(tweets=tweets, n=3)
model.print_counts(20)

TOTAL UNIQUE NGRAMS: 2039273
TOP 20 NGRAMS:
((('.', '.'), '.'), 10856)
((('<SOT>', '<SOT>'), 'i'), 9433)
((('<SOT>', '<SOT>'), 'the'), 5896)
((('!', '!'), '!'), 5651)
((('▓', '▓'), '▓'), 3980)
((('<SOT>', '<SOT>'), 'this'), 3849)
((('░', '░'), '░'), 3774)
((('http', ':'), '//t'), 2997)
(((':', '//t'), '.'), 2997)
((('.', '.'), '<EOT>'), 2375)
((('<SOT>', '<SOT>'), 'if'), 2293)
((('<SOT>', '<SOT>'), "i'm"), 2198)
((('<SOT>', '<SOT>'), 'my'), 1967)
((('<SOT>', '<SOT>'), 'a'), 1870)
((('!', '!'), '<EOT>'), 1857)
((('<SOT>', '<SOT>'), 'we'), 1724)
((('<SOT>', '<SOT>'), "it's"), 1695)
((('<SOT>', '<SOT>'), 'new'), 1598)
((('<SOT>', 'this'), 'is'), 1521)
((('<SOT>', '<SOT>'), 'just'), 1363)


# Generate tweets

In [9]:
def print_tweet(tweet):
  print('+------------------------------------------------------------+')
  print('| +-------+                                                  |')
  print('| |  ! !  | N-Gram Bot @ngrambot - 12h                       |')
  print('| | [O_O] |                                                  |')
  print('| |  | |  |                                                  |')
  print('| +-------+                                                  |')
  print('|                                                            |')

  for line in tweet_to_lines(tweet):
    padding = ' ' * (59 - len(line))
    print('| ' + line + padding + '|')

  print('|                                                            |')
  print('| <3 31.4k                                                   |')
  print('+------------------------------------------------------------+')

def tweet_to_lines(tweet):
  lines = []
  curr_line = ''
  for token in tweet.split():
    if len(curr_line) + len(token) + 1 <= 59:
      curr_line += token + ' '
    else:
      lines.append(curr_line)
      curr_line = token + ' '
  lines.append(curr_line)
  return lines

for i in range(10):
  print_tweet(model.random_tweet())

+------------------------------------------------------------+
| +-------+                                                  |
| |  ! !  | N-Gram Bot @ngrambot - 12h                       |
| | [O_O] |                                                  |
| |  | |  |                                                  |
| +-------+                                                  |
|                                                            |
| how do i lack patience and understanding. e awards! one is |
| asking why my daddy. their bias is hilarious when i cook   |
| dinners for her bathroom renovation. it no matter what     |
| please understand i had so many times before but i'll tell |
| you how to haunt them                                      |
|                                                            |
| <3 31.4k                                                   |
+------------------------------------------------------------+
+------------------------------------------------------