In [None]:
import torch
import json
import urllib3
import pathlib
import shutil
import requests
import os
import re
import random
import code


# Default for clean_tweet's `allow_new_lines` argument: when False, every run
# of whitespace in a tweet (newlines included) is collapsed to a single space.
ALLOW_NEW_LINES =  False

# ==== The following functions are from [@borisdayma](https://twitter.com/borisdayma)'s HuggingTweets demo, with modifications.
def fix_text(text):
    """Decode the HTML entities ``&amp;``, ``&lt;`` and ``&gt;`` in ``text``.

    Args:
        text: The string to decode.

    Returns:
        ``text`` with each of the three entities replaced by the character it
        stands for (``&``, ``<`` and ``>``). Each entity is decoded exactly
        once, so an escaped entity such as ``&amp;lt;`` becomes the literal
        text ``&lt;`` rather than ``<``.
    """
    # '&amp;' has to be decoded last: decoding it first would turn '&amp;lt;'
    # into '&lt;', which the next replacement would then wrongly decode again.
    text = text.replace('&lt;', '<')
    text = text.replace('&gt;', '>')
    text = text.replace('&amp;', '&')
    return text

def clean_tweet(tweet, allow_new_lines = ALLOW_NEW_LINES):
    """Remove http:/https: URLs from ``tweet`` and tidy up its whitespace.

    Args:
        tweet: The tweet text to clean.
        allow_new_lines: When true, newlines that are not attached to a
            removed URL are kept; when false, every run of whitespace
            (newlines included) is collapsed to a single space.

    Returns:
        The cleaned tweet with leading and trailing whitespace stripped.
    """
    for scheme in ('http:', 'https:'):
        # The order matters: each pattern removes the URL together with the
        # separator that is adjacent to it, most specific cases first.
        url_patterns = (
            f" {scheme}\\S+",      # URL preceded by a space
            f"{scheme}\\S+ ",      # URL followed by a space (e.g. at the start)
            f"\n{scheme}\\S+ ",    # URL at the start of a new line
            f"\n{scheme}\\S+",     # URL alone on a new line
            f"{scheme}\\S+",       # anything that is left
        )
        for pattern in url_patterns:
            tweet = re.sub(pattern, "", tweet)

    # Squeeze runs of spaces down to one space.
    tweet = re.sub(' +', ' ', tweet)

    if allow_new_lines:
        return tweet.strip()
    # TODO: predictions seem better without new lines
    return ' '.join(tweet.split()).strip()
    
def boring_tweet(tweet, min_interesting_words=10):
    """Check if this is a boring tweet that should be left out of the dataset.

    A word is "interesting" when its lower-cased form contains none of
    'http', '@' or '#'.

    Args:
        tweet: The tweet text to inspect.
        min_interesting_words: Minimum number of interesting words a tweet
            needs in order not to be boring. Defaults to 10 (the demo this
            was adapted from used 3).

    Returns:
        True when the tweet has fewer than ``min_interesting_words``
        interesting words, or contains '@', 'http:' or 'https:' anywhere;
        False otherwise.
    """
    boring_stuff = ['http', '@', '#']
    interesting_words = sum(
        1
        for word in tweet.split()
        if not any(marker in word.lower() for marker in boring_stuff)
    )
    # return interesting_words < 3  # Original
    return (
        interesting_words < min_interesting_words
        or '@' in tweet
        or 'http:' in tweet
        or 'https:' in tweet
    )  # Modified

# Max tweet length: 280
# handle = 'FINALLEVEL' # 3206 tweets, 1522 final
# handle= "tszzl" # 952 final. Some are long tweets.
# handle = 'ThomasMiconi' # 2340 tweets, 506 final
# handle = 'dril' # 3199 tweets, 1718 final
handle = 'karpathy' # 3242 tweets, 1096 final, mean length 165 (without selecting on length<284: 1254 final, mean length 175)
# handle = 'realdonaldtrump' # 3165 tweets, 904 final, mean length 176
# handle = 'cher' #3197 tweets, 536 final... meanlength  193

# Raw tweets with this many characters or more are dropped before cleaning.
# NOTE(review): the limit noted above is 280; presumably the extra slack
# allows for not-yet-decoded HTML entities -- confirm.
MAX_RAW_TWEET_LENGTH = 284

print(f'\nDownloading @{handle} tweets... This should take no more than a minute!')
http = urllib3.PoolManager(retries=urllib3.Retry(3))
# NOTE(review): this endpoint is requested over plain HTTP; consider https.
response = http.request("GET", f"http://us-central1-huggingtweets.cloudfunctions.net/get_tweets?handle={handle}&force=1")
# Fail with a clear message instead of trying to parse an error page as JSON.
if not 200 <= response.status < 300:
    raise RuntimeError(f"Downloading tweets for @{handle} failed with HTTP status {response.status}")
# `res` is the decoded JSON payload (a dict); the cells below read it.
res = json.loads(response.data.decode('utf-8'))

user_name = res['user_name']
all_tweets = res['tweets']

# Pipeline: raw -> length-filtered -> entity-decoded -> URL-stripped -> non-boring.
raw_tweets  = all_tweets
curated_tweets = [tweet for tweet in raw_tweets if len(tweet) < MAX_RAW_TWEET_LENGTH]
fixed_tweets = [fix_text(tweet) for tweet in curated_tweets]
print(f"\n{res['n_tweets']} tweets from @{handle} downloaded!\n\n")

# create dataset
clean_tweets = [clean_tweet(tweet) for tweet in fixed_tweets]
cool_tweets  = [tweet for tweet in clean_tweets if not boring_tweet(tweet)]





Downloading @karpathy tweets... This should take no more than a minute!

3242 tweets from @karpathy downloaded!




In [None]:
# Show which fields the download returned and how many tweets survived filtering.
print(res.keys())
print(len(cool_tweets), "tweets available after curation and filtering")
tweet_lengths = [len(x) for x in cool_tweets]
if tweet_lengths:
    print("Min / mean / max tweet length:", min(tweet_lengths), sum(tweet_lengths) / len(tweet_lengths), max(tweet_lengths))
else:
    # min()/max() raise ValueError and the mean divides by zero on an empty
    # list, so report the situation instead of crashing the cell.
    print("No tweets left after filtering; length statistics are unavailable.")


dict_keys(['tweets', 'n_tweets', 'n_RT', 'n_kept', 'social_link', 'user_name', 'user_profile', 'wandb'])
1096 tweets available after curation and filtering
Min / mean / max tweet length: 46 165.0 280


In [None]:
# Preview up to 20 of the filtered tweets (indices 2 through 21), printing a
# "==" separator line after each one.
preview = cool_tweets[2:22]
for sample in preview:
    print(sample, "==", sep="\n")

Wow, very nice "full-stack" release (again!) Allows finetuning of models as strong as LLaMA-65B on a single GPU as small as 48GB, in hours.
==
[New Talk] Pleasure to come by Microsoft BUILD this year and give a talk on "State of GPT". Goes through the GPT Assistant training pipeline, covers some "LLM Psychology", and offers a few best practices:
==
Someone has to redo that meme with the statistician vs deep learning “stack more layers” clown because the picture is shifting by one
==
Overheard: “People who know nothing about machine learning are now paradoxically advantaged in LLMs because they don’t immediately reach for overly sophisticated ideas and spend a lot more time hacking prompts” When hacking prompts feels below your dignity but it works :’|
==
Also highly relevant: guidance from microsoft "Guidance programs allow you to interleave generation, prompting, and logical control" Also internally handles subtle but important tokenization-related issues, e.g. "token healing".
==
Pro

In [None]:
# Save the filtered tweets to tweets_<handle>.txt, UTF-8 encoded, one per line.
output_path = f"tweets_{handle}.txt"
with open(output_path, 'w', encoding='utf8') as out_file:
    out_file.writelines(tweet + "\n" for tweet in cool_tweets)

