### Normalization functions ⛏

In [None]:
def fix_text(text):
    """Decode the HTML entities ``&lt;``, ``&gt;`` and ``&amp;`` in ``text``.

    Args:
        text: The string to decode.

    Returns:
        ``text`` with each of the three entities decoded exactly once.
    """
    # '&amp;' must be decoded last: decoding it first would turn an escaped
    # entity such as '&amp;lt;' into '&lt;', which a later replace would then
    # decode a second time into '<'.
    text = text.replace('&lt;', '<')
    text = text.replace('&gt;', '>')
    text = text.replace('&amp;', '&')
    return text
    
ALLOW_NEW_LINES = False


def clean_tweet(tweet, allow_new_lines = ALLOW_NEW_LINES):
    """Strip URLs from a tweet and normalise its whitespace.

    Every token starting with ``http:`` or ``https:`` (up to the next
    whitespace character) is removed, runs of spaces are collapsed to a
    single space and the result is stripped.

    Args:
        tweet: The tweet text.
        allow_new_lines: When false, every run of whitespace (new lines
            included) is collapsed to a single space. The default is the
            value ``ALLOW_NEW_LINES`` had when this function was defined.

    Returns:
        The cleaned tweet.
    """
    for scheme in ('http:', 'https:'):
        # The patterns are applied in this exact order, most specific
        # surrounding whitespace first.
        url_patterns = (
            f" {scheme}\\S+",       # removes white space before url
            f"{scheme}\\S+ ",       # in case a tweet starts with a url
            f"\n{scheme}\\S+ ",     # in case the url is on a new line
            f"\n{scheme}\\S+",      # in case the url is alone on a new line
            f"{scheme}\\S+",        # any other case
        )
        for pattern in url_patterns:
            tweet = re.sub(pattern, "", tweet)

    # replace multiple spaces with one space
    tweet = re.sub(' +', ' ', tweet)

    if allow_new_lines:
        return tweet.strip()
    # TODO: predictions seem better without new lines
    return ' '.join(tweet.split()).strip()

def boring_tweet(tweet):
    """Check if this is a boring tweet.

    A whitespace-separated word is ignored when its lower-cased form
    contains ``http``, ``@`` or ``#``. The tweet is boring when fewer than
    three words remain.

    Args:
        tweet: The tweet text.

    Returns:
        True if the tweet has fewer than three words that are not ignored.
    """
    boring_markers = ('http', '@', '#')
    interesting_words = 0
    for word in tweet.split():
        lowered = word.lower()
        if not any(marker in lowered for marker in boring_markers):
            interesting_words += 1
    return interesting_words < 3

### Dataset maker 🔥

In [None]:
import json,re,urllib3,random

print("Enter acount\'s username without @ sign ❌\n")
# Tolerate surrounding whitespace and a leading '@' even though the prompt
# asks for the bare username.
handle = input().strip().lstrip('@').lower()
# The handle is interpolated into a URL below and into file paths later on,
# so reject anything other than letters, digits and underscores up front.
if not re.fullmatch(r'[a-z0-9_]+', handle):
    raise ValueError(f"Invalid username: {handle!r} (only letters, digits and '_' are allowed)")

# Per-handle bookkeeping: each list receives one entry per processed handle.
cool_tweets = []
handles_processed = []
raw_tweets = []
user_names = []
n_tweets_dl = []
n_retweets = []
n_short_tweets = []
n_tweets_kept = []

# Download the tweets; the connection pool retries up to 3 times.
http = urllib3.PoolManager(retries=urllib3.Retry(3))
response = http.request("GET", f"http://us-central1-huggingtweets.cloudfunctions.net/get_tweets?handle={handle}&force=1")
if response.status >= 400:
    # Fail with the HTTP status instead of an opaque JSON or KeyError later.
    raise RuntimeError(f"Could not download tweets for {handle!r}: HTTP status {response.status}")
# `res` is the decoded JSON payload.
res = json.loads(response.data.decode('utf-8'))

user_names.append(res['user_name'])
all_tweets = res['tweets']
raw_tweets.append(all_tweets)
# Decode HTML entities before any further cleaning.
curated_tweets = [fix_text(tweet) for tweet in all_tweets]

# create dataset: strip URLs / normalise whitespace, then drop boring tweets
clean_tweets = [clean_tweet(tweet) for tweet in curated_tweets]
kept_tweets = [tweet for tweet in clean_tweets if not boring_tweet(tweet)]
cool_tweets.append(kept_tweets)

# save count (stored as strings)
n_tweets_dl.append(str(res['n_tweets']))                            # number of total tweets
n_retweets.append(str(res['n_RT']))                                 # number of re-tweets
n_short_tweets.append(str(len(all_tweets) - len(kept_tweets)))      # number of tweets dropped as boring/short
n_tweets_kept.append(str(len(kept_tweets)))                         # number of tweets kept

print(f"\n{n_tweets_dl[-1]} tweets downloaded, including {n_retweets[-1]} Re-tweets and {n_short_tweets[-1]} short tweets\nSaving {n_tweets_kept[-1]} tweets in 💥data_{handle}_train.txt💥 \n")

# Require at least 6000 characters of training text (separators included).
if len('<|endoftext|>'.join(kept_tweets)) < 6000:
  # `cool_tweets` holds one list per handle, so the counts reported here must
  # come from this handle's own list, not from len(cool_tweets).
  raise ValueError(f"Error: this user does not have enough tweets to train a Neural Network\n{res['n_tweets']} tweets downloaded, including {res['n_RT']} RT's and {len(all_tweets) - len(kept_tweets)} boring tweets... only {len(kept_tweets)} tweets kept!")

# Dedicated random generator for shuffling the training data.
seed_data = random.randint(0,2**32-1)
dataRandom = random.Random(seed_data)
total_text = '<|endoftext|>'
all_handle_tweets = []
# Length (in characters) of the longest per-handle dataset; shorter ones are
# padded with repeated tweets until they reach it.
epoch_len = max(len(''.join(cool_tweet)) for cool_tweet in cool_tweets)
EPOCHS = 4

# The training text holds EPOCHS independently shuffled passes over the tweets.
for _ in range(EPOCHS):
    for cool_tweet in cool_tweets:
        dataRandom.shuffle(cool_tweet)
        # Work on a copy: padding must not append to the list being iterated,
        # nor permanently grow the per-handle list used by later epochs.
        current_tweet = list(cool_tweet)
        current_len = len(''.join(current_tweet))
        # `cool_tweet and ...` avoids looping forever on an empty list.
        while cool_tweet and current_len < epoch_len:
            for t in cool_tweet:
                current_tweet.append(t)
                current_len += len(t)
                if current_len >= epoch_len: break
        dataRandom.shuffle(current_tweet)
        all_handle_tweets.extend(current_tweet)
total_text += '<|endoftext|>'.join(all_handle_tweets) + '<|endoftext|>'

# Tweets routinely contain emoji and other non-ASCII text, so do not rely on
# the platform's default encoding.
with open(f"data_{handle}_train.txt", 'w', encoding='utf-8') as f:
    f.write(total_text)

Enter acount's username without @ sign ❌

BarackObama

3250 tweets downloaded, including 330 Re-tweets and 20 short tweets
Saving 2900 tweets in 💥data_barackobama_train.txt💥 



### አሰልጥን ‍🏋🏾🤸🏾🚴🏾🏃🏾


In [None]:
# Names shared across cells, reset here before training
trainer = None
model_preview = token = namespace = None

# HYPER-PARAMETERS
ALLOW_NEW_LINES = False     # seems to work better
LEARNING_RATE = 1.372e-4
EPOCHS = 4

# transformers imports 
!pip install transformers
import transformers,pathlib
from transformers import (
AutoTokenizer, AutoModelForCausalLM,
TextDataset, DataCollatorForLanguageModeling,
Trainer, TrainingArguments,
get_cosine_schedule_with_warmup)

try:
    # Setting up pre-trained neural network
    tokenizer = AutoTokenizer.from_pretrained('gpt2')
    model = AutoModelForCausalLM.from_pretrained('gpt2', cache_dir=pathlib.Path('cache').resolve())
    block_size = tokenizer.model_max_length
    train_dataset = TextDataset(tokenizer=tokenizer, file_path=f"data_{handle}_train.txt", block_size=block_size, overwrite_cache=True)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    seed = random.randint(0,2**32-1)
    training_args = TrainingArguments(
        output_dir=f"output/{handle}",
        overwrite_output_dir=True,
        do_train=True,
        # A single pass: the training file read above is built from several
        # shuffled repetitions of the tweets by the dataset cell.
        num_train_epochs=1,
        per_device_train_batch_size=1,
        prediction_loss_only=True,
        logging_steps=5,
        # NOTE(review): presumably disables intermediate checkpoints --
        # confirm against the installed transformers version.
        save_steps=0,
        seed=seed,
        learning_rate=LEARNING_RATE)

    # Set-up Trainer. This runs at cell (module) level, so `trainer` is
    # visible to the following cells.
    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset)

    # Update lr scheduler: replace the one created by the Trainer with a
    # cosine schedule without warm-up spanning one pass over the dataloader.
    train_dataloader = trainer.get_train_dataloader()
    num_train_steps = len(train_dataloader)
    trainer.create_optimizer_and_scheduler(num_train_steps)
    trainer.lr_scheduler = get_cosine_schedule_with_warmup(
        trainer.optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps)

    # Train
    trainer.train()

    # set model config parameters (text-generation defaults stored in the
    # model config before it is saved)
    task_params = trainer.model.config.task_specific_params
    if task_params is None:
        # Guard against a config that carries no task-specific section.
        task_params = {}
        trainer.model.config.task_specific_params = task_params
    task_params['text-generation'] = {
        'do_sample': True,
        'min_length': 10,
        'max_length': 160,
        'temperature': 1.,
        'top_p': 0.95,
        'prefix': '<|endoftext|>'}

    # save new model files
    model_name = 'Tomasmodel'
    trainer.save_model(model_name)

except Exception as e:
    print('\nAn error occured...\n')
    print(e)
    # Re-raise so the cell is marked as failed (with a full traceback) and a
    # "Run All" does not continue into cells that need a trained `trainer`.
    raise
                            

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": 

Step,Training Loss
5,3.5543
10,3.1356
15,3.0963
20,3.0622
25,2.9928
30,2.9523
35,2.8827
40,2.8228
45,2.8078
50,2.8235




Training completed. Do not forget to share your model on huggingface.co/models =)





An error occured...

name 'model_name' is not defined


In [None]:
# Write the trained model's files into the `model_name` directory
model_name = 'Tomasmodel'
trainer.save_model(model_name)

Saving model checkpoint to Tomasmodel
Configuration saved in Tomasmodel/config.json
Model weights saved in Tomasmodel/pytorch_model.bin
tokenizer config file saved in Tomasmodel/tokenizer_config.json
Special tokens file saved in Tomasmodel/special_tokens_map.json


### ተንብይ🎯


In [None]:
start=input()
# prepare input: the prompt is prefixed with the '<|endoftext|>' token that
# also separates tweets in the training file
start_with_bos = '<|endoftext|>' + start
encoded = trainer.tokenizer(start_with_bos, add_special_tokens=False, return_tensors="pt")
encoded_prompt = encoded["input_ids"].to(trainer.model.device)
attention_mask = encoded["attention_mask"].to(trainer.model.device)

# prediction
# The attention mask and pad token id are passed explicitly so that
# `generate` does not have to guess them (it warns when they are missing).
output_sequences = trainer.model.generate(
    input_ids=encoded_prompt,
    attention_mask=attention_mask,
    pad_token_id=trainer.tokenizer.eos_token_id,
    max_length=160,
    min_length=10,
    temperature=1.,
    top_p=0.95,
    do_sample=True,
    num_return_sequences=10
    )


generated_sequences = []

# decode prediction
for generated_sequence in output_sequences:
    text = trainer.tokenizer.decode(generated_sequence.tolist(), clean_up_tokenization_spaces=True, skip_special_tokens=True)
    if not ALLOW_NEW_LINES:
        # keep only the text before the first new line
        text = text.split('\n', 1)[0]
    generated_sequences.append(text.strip())

print(*generated_sequences, sep = "\n")

Facebook is


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Facebook is the next big step forward in the fight against climate change. Follow the @ObamaFoundation on the progress: #ActOnClimate
Facebook is showing how much it’s been pushing for this election. #TurnaroundDay
Facebook is proud to endorse progressive leaders. Join them.
Facebook is no longer a partisan issue—but a source of great joy to me is when folks like Jackie Robinson and Bruce @Kubrick’s voices speak out on a basic human right: voting.
Facebook is proud to support young people in their communities who have the courage to demand action and say it's time to tackle climate change.
Facebook is one of the oldest media platforms. It's where everybody can tune in to watch the President talk about the economy and solve the world's climate crisis. Here is a look at how it helped Americans get organized, organize and #StopGunViolence.
Facebook is becoming the internet's favorite social network. And while the digital movement has taken off, the conversation on social media has more th