# Imports

In [1]:
%pip install tweepy --upgrade

import tweepy
import random
import configparser
import numpy as np
import re
from collections import Counter

Note: you may need to restart the kernel to use updated packages.


# Download and Clean Tweets

Load tweepy client and read api keys

In [2]:
config = configparser.ConfigParser()
config.read('config.ini')
tokens = config['Tokens']
client = tweepy.Client(
  bearer_token=tokens['BearerToken'],
  consumer_key=tokens['ConsumerKey'],
  consumer_secret=tokens['ConsumerSecret'],
  access_token=tokens['AccessToken'],
  access_token_secret=tokens['AccessTokenSecret'],
)

Download tweets of specified users

In [3]:
tweets = []
users = [
  'afraidofwasps', 'ameliaelizalde', 'Boringstein', 'boss_on_here',
  'darth_erogenous', 'dril', 'feufillet', 'heaberald', 'i_zzzzzz',
  'keffals', 'laserboat999', 'len0killer', 'Liv_Agar', 'lunch_enjoyer',
  'nibiru_TRUTH', 'OkButStill', 'oldfriend99', 'peterxinping',
  'PillsAreFood', 'pizza_jones', 'RadishHarmers', 'rajat_suresh',
  's4m31p4n', 'Senn_Spud', 'yesitsmyaccount', 'ZeroSuitCamus',
]

# get twitter ids from usernames
user_data = client.get_users(usernames=users)
user_ids = list(map(lambda x: x['id'], user_data[0]))

# get tweets for each user id
for user_id in user_ids:
  until_id = None
  for _ in range(8):
    users_tweets = client.get_users_tweets(
      user_id,
      exclude=['retweets', 'replies'], 
      max_results=100,
      until_id=until_id,
    )
    if users_tweets[0]:
      until_id = users_tweets[0][-1]['id']
      tweets += list(map(lambda x: x['text'], users_tweets[0]))

Print subset of tweets

In [12]:
def print_tweets(tweets, indices):
    for tweet in np.array(tweets)[indices]:
        if len(tweet) > 0:
            print(tweet)
            print()

random_indices = list(np.random.choice(len(tweets), size=10, replace=False))

print_tweets(tweets, random_indices)
print(f'-----\n\nTOTAL TWEETS: {len(tweets)}')

Why Does White Guy Always Scream “LETS GOOOO” Like Wtf You Need To Relax Friend We’re Just Playing Checkers.

Cool https://t.co/pGxNxsv92c

Asking the arresting officer if it’s over

The internet should be decentralized... (everyone applauds and I get more confident) It should also be 400,000 times slower and every single thing on it should be a speculative financial asset (everyone is cheering and building a throne for me)

you could still make clive owen james bond. i think it would be a nice little punishment for him.

I find these videos physically repulsive please stop sharing them https://t.co/66M3Gw21SN

It's cool how you said that one sentence, and then said that other one that was totally unrelated. It was like you were tapping into something deeper; I couldn't understand it at all. I want my computer to be window. I want to open my laptop and look at my living room carpet

April 15th, 1912. 2:20 am.

She Is Drunk Telling Everyone How Happy She Is Laughing And Smiling Around S

Save and load tweets

In [None]:
np.save('weird_tweets.npy', np.array(tweets))

In [None]:
tweets = list(np.load('weird_tweets.npy'))

Clean up tweets for n-gram model

In [85]:
def clean_tweet(tweet):
  tweet = remove_links(tweet)
  tweet = re.sub(r'[\{\}\[\]\(\)"“”]', '', tweet)
  tweet = tweet.replace('’', "'").replace('…', '...').replace('&amp;', '&')
  tweet = tweet.lower()
  tweet = re.sub(r'([^\w\s@#_/\'-])', r' \1 ', tweet)
  tweet = re.sub(r'\s+', ' ', tweet)
  return tweet

def remove_links(tweet):
  split_tweet = tweet.split()
  split_tweet = [split_str for split_str in split_tweet if split_str if 'https://' not in split_str]
  tweet = ' '.join(split_tweet)
  return tweet

def multi_replace(text, string_map):
  for old, new in string_map.items():
    text = text.replace(old, new)
  return text

cleaned_tweets = list(map(clean_tweet, tweets))

print_tweets(cleaned_tweets, random_indices)

why does white guy always scream lets goooo like wtf you need to relax friend we're just playing checkers . 

cool

asking the arresting officer if it's over

the internet should be decentralized . . . everyone applauds and i get more confident it should also be 400 , 000 times slower and every single thing on it should be a speculative financial asset everyone is cheering and building a throne for me

you could still make clive owen james bond . i think it would be a nice little punishment for him . 

i find these videos physically repulsive please stop sharing them

it's cool how you said that one sentence , and then said that other one that was totally unrelated . it was like you were tapping into something deeper ; i couldn't understand it at all . i want my computer to be window . i want to open my laptop and look at my living room carpet

april 15th , 1912 . 2 : 20 am . 

she is drunk telling everyone how happy she is laughing and smiling around strangers yet texting you has not 

# Create N-gram Model

$P(token_i \mid token_{i-n+1:i-1})=\dfrac{Count(token_{i-n+1:i})}{Count(token_{i-n+1:i-1})}$

In [87]:
START = '<s>'
STOP = '</s>'

class TweetModel:
  def __init__(self, tweets, n):
    self.n = n
    self.counts = Counter()
    self.context_counts = Counter()
    self.process_tweets(tweets)

  def process_tweets(self, tweets):
    tokenized_tweets = [tweet.split() for tweet in tweets]
    for tweet in tokenized_tweets:
      if len(tweet) > 0:
        tweet_ngrams = self.create_ngrams(tweet)
        for ngram in tweet_ngrams:
          self.counts[ngram] += 1
    for (context, _), count in self.counts.items():
      self.context_counts[context] += count

  def random_tweet(self):
    start_context = [START] * (self.n - 1)
    curr_context = start_context
    tweet = []
    while True:
      token = self.random_token(tuple(curr_context))
      if token == STOP:
        break
      if len(' '.join(tweet)) > 280:
        curr_context = start_context
      tweet.append(token)
      curr_context.pop(0)
      curr_context.append(token)
    tweet = ' '.join(tweet) + ' '
    tweet = re.sub(r'(\S) ([\?!:;,\.])', r'\1\2', tweet)
    tweet = re.sub(r'(\S) ([\?!:;,\.])', r'\1\2', tweet)
    return tweet.strip()

  def random_token(self, context):
    tokens = [token for (context1, token), _ in self.counts.items() if context == context1]
    rand = random.random()
    random_token = None
    total = 0
    for token in tokens:
      random_token = token
      total += self.counts[(context, token)] / self.context_counts[context]
      if total > rand:
        break
    return random_token

  def create_ngrams(self, tokens):
    ngrams = []
    for i in range(len(tokens) + 1):
        context = []
        for j in range(self.n - 1, 0, -1):
            if i - j < 0:
                context.append(START)
            else:
                context.append(tokens[i - j])
        if i == len(tokens):
            ngrams.append((tuple(context), STOP))
        else:
            ngrams.append((tuple(context), tokens[i]))
    return tuple(ngrams)

  def print_counts(self, n):
    print(f'TOTAL UNIQUE NGRAMS: {len(self.counts)}')
    print(f'TOP {n} NGRAMS:')
    counts = self.counts.items()
    counts = sorted(counts, key=lambda x: x[1], reverse=True)
    for count in counts[:n]:
      print(count)

Trigram model:

$P(token_i \mid token_{i-2:i-1})=\dfrac{Count(token_{i-2:i})}{Count(token_{i-2:i-1})}$

e.g. $P(\text{is}\mid\text{my name})=\dfrac{Count(\text{my name is})}{Count(\text{my name})}$

In [88]:
model = TweetModel(tweets=cleaned_tweets, n=3)
model.print_counts(20)

TOTAL UNIQUE NGRAMS: 233066
TOP 20 NGRAMS:
((('<s>', '<s>'), 'i'), 1389)
((('.', '.'), '.'), 996)
((('<s>', '<s>'), 'the'), 536)
((('!', '!'), '!'), 491)
((('<s>', '<s>'), 'my'), 360)
((('<s>', '<s>'), 'if'), 357)
((('<s>', '<s>'), 'this'), 304)
((('.', '.'), '</s>'), 279)
((('<s>', '<s>'), "i'm"), 277)
((('<s>', '<s>'), 'you'), 246)
((('<s>', '<s>'), "it's"), 183)
((('!', '!'), '</s>'), 179)
((('<s>', '<s>'), 'just'), 170)
((('😉', '😉'), '😉'), 166)
((('<s>', '<s>'), 'when'), 157)
((('<s>', '<s>'), 'oh'), 148)
((('<s>', '<s>'), 'me'), 143)
((('<s>', 'this'), 'is'), 139)
((('<s>', '<s>'), 'they'), 133)
((('<s>', '<s>'), 'im'), 130)


# Generate tweets

In [89]:
def print_tweet(tweet):
  print('+------------------------------------------------------------+')
  print('| +-------+                                                  |')
  print('| |  ! !  | N-Gram Bot @ngrambot - 12h                       |')
  print('| | [O_O] |                                                  |')
  print('| |  | |  |                                                  |')
  print('| +-------+                                                  |')
  print('|                                                            |')

  for line in tweet_to_lines(tweet):
    padding = ' ' * (59 - len(line))
    print('| ' + line + padding + '|')

  print('|                                                            |')
  print('| <3 31.4k                                                   |')
  print('+------------------------------------------------------------+')

def tweet_to_lines(tweet):
  lines = []
  curr_line = ''
  for token in tweet.split():
    if len(curr_line) + len(token) + 1 <= 59:
      curr_line += token + ' '
    else:
      lines.append(curr_line)
      curr_line = token + ' '
  lines.append(curr_line)
  return lines

In [90]:
for i in range(10):
  print_tweet(model.random_tweet())

+------------------------------------------------------------+
| +-------+                                                  |
| |  ! !  | N-Gram Bot @ngrambot - 12h                       |
| | [O_O] |                                                  |
| |  | |  |                                                  |
| +-------+                                                  |
|                                                            |
| pause 🤨 ✋ ⏸ ️                                              |
|                                                            |
| <3 31.4k                                                   |
+------------------------------------------------------------+
+------------------------------------------------------------+
| +-------+                                                  |
| |  ! !  | N-Gram Bot @ngrambot - 12h                       |
| | [O_O] |                                                  |
| |  | |  |                                            