# Data generator

#### This notebook generates and stores a synthetic dataset using k-shot prompting

- We recommend the use of a GPU for larger datasets and models. 
- Throughout this notebook CAPITAL_LETTERS indicate that you must enter information (such as paths, sizes, etc.)

### Setup
- Install and import libraries
- Get prompts with K examples
- Load in the model

In [None]:
!pip install datasets
!pip install transformers[sentencepiece]
!pip install --upgrade transformers
!pip install einops
!pip install torch
!pip install huggingface_hub
!pip install sentencepiece

In [94]:
from datasets import load_dataset, load_from_disk, concatenate_datasets, Dataset, DatasetDict

import pandas as pd
import numpy as np

from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login

import os
import re
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [23]:
# Builds the bare instruction headers (no examples attached yet)
def resetPrompts():
    """Return the (positive, negative) instruction prompts without any examples.

    Both strings end with a newline so example lines can be appended directly.
    """
    return (
        'Generate an enthusiastic and positive social media tweet. Your tweet should express praise or excitement. Use the following examples as a guide for formatting and tone:\n',
        'Generate a disapproving and negative social media tweet. The tweet should convey criticism or disappointment. Use the following examples as a guide for formatting and tone:\n',
    )

In [40]:
# Returns prompts with k examples
def addKExamples(k, positiveLabel=1, negativeLabel=0):
    """Build k-shot positive and negative prompts from the training split.

    Reads the notebook-level ``dataset`` and uses its 'train' split, which must
    provide 'text' and 'label' columns.

    Args:
        k: Number of example tweets to include per sentiment.
        positiveLabel: Value of 'label' that marks a positive tweet. Defaults to 1,
            the value generateDataset writes for positive tweets.
        negativeLabel: Value of 'label' that marks a negative tweet. Defaults to 0.

    Returns:
        A 4-tuple ``(fullPositivePrompt, fullNegativePrompt,
        examplesPositivePrompt, examplesNegativePrompt)``. The "full" prompts end
        with an open ``Positive: "`` / ``Negative: "`` slot for the model to
        complete; the "examples" prompts stop after the last example and are
        used later to strip the prompt from the generation.

    Raises:
        IndexError: if fewer than ``k`` rows carry one of the labels (raised by ``select``).
    """
    # Get the empty prompts without examples
    positiveHeader, negativeHeader = resetPrompts()
    train = dataset['train']

    # NOTE(review): this function previously took label == 0 as positive and
    # label == 1 as negative, the reverse of every other cell in this notebook.
    # If your source data really encodes positive as 0, call
    # addKExamples(k, positiveLabel=0, negativeLabel=1).
    positiveExamples = train.filter(lambda x: x['label'] == positiveLabel).select(range(k))["text"]
    negativeExamples = train.filter(lambda x: x['label'] == negativeLabel).select(range(k))["text"]

    # The prompt without the completion slot is kept to make filtering easier later.
    examplesPositivePrompt = positiveHeader + ''.join(
        f'Positive: "{example}"\n' for example in positiveExamples
    )
    examplesNegativePrompt = negativeHeader + ''.join(
        f'Negative: "{example}"\n' for example in negativeExamples
    )

    # Open quote that the model is expected to complete with a new tweet.
    fullPositivePrompt = examplesPositivePrompt + 'Positive: "'
    fullNegativePrompt = examplesNegativePrompt + 'Negative: "'

    return fullPositivePrompt, fullNegativePrompt, examplesPositivePrompt, examplesNegativePrompt

In [41]:
# addKExamples reads the notebook-level `dataset`, which no other cell defines.
# Load the labelled source tweets here: a DatasetDict with a 'train' split that has
# 'text' and 'label' columns. If the data comes from the Hub rather than from disk,
# use load_dataset(...) in place of load_from_disk(...).
dataset = load_from_disk(YOUR_DATASET_DIR)

# Number of example tweets shown to the model per sentiment.
k = 5
fullPositivePrompt, fullNegativePrompt, examplesPositivePrompt, examplesNegativePrompt = addKExamples(k)

In [29]:
# Login is necessary for gated models like Mistral-7B.
# Do not commit a real token to the notebook; clear it before sharing.
login(YOUR_HUGGINGFACE_KEY)

# Enter the name of the Huggingface model you want to use. 
# We used "mistralai/Mistral-7B-Instruct-v0.3" and "microsoft/Phi-3-mini-4k-instruct"
model_name = MODEL_NAME

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Put the model on the same device the inputs are moved to in promptModel.
model = model.to(device)

# Reuse EOS as the padding token when the tokenizer defines none.
# Adding a brand-new '[PAD]' token instead would give it an id outside the model's
# embedding matrix unless model.resize_token_embeddings(len(tokenizer)) is also called.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


tokenizer_config.json:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/16.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

### Model Config
- Set up the necessary methods to generate and filter through tweets.

In [32]:
# Returns a generated tweet and its logits. 
def promptModel(prompt, length):
    """Sample one continuation of ``prompt`` and return its text and logits.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        prompt: Text the model is conditioned on.
        length: Forwarded to ``generate`` as ``max_length``, i.e. the budget for
            prompt tokens plus newly generated tokens.

    Returns:
        ``(text, logits)`` where ``text`` is the decoded full sequence (prompt
        included, special tokens stripped) and ``logits`` comes from one forward
        pass of the model over that full sequence.
    """
    inputs = tokenizer(prompt, return_tensors='pt').to(device)

    # `pad_token_id or eos_token_id` would throw away a legitimate pad id of 0,
    # so only fall back to EOS when no pad id is defined at all.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    # Per-step scores are not requested: nothing reads them, and the logits
    # returned below are recomputed with a forward pass.
    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        do_sample=True,
        num_return_sequences=1,
        max_length=length,
        temperature=0.9,
        top_p=0.9,
        repetition_penalty=1.2,
        return_dict_in_generate=True,
        pad_token_id=pad_token_id,
    )

    # Decode the generated text
    text = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)

    with torch.no_grad():
        # Forward pass to get logits
        logits = model(outputs.sequences).logits

    return text, logits

In [34]:
# Uses regex to pull out only the first tweet of the requested sentiment (dropping any extra generation)
def findTweets(generated_text, isPositive, plainPrompt):
    """Extract the first quoted tweet of the requested sentiment from a generation.

    Args:
        generated_text: Decoded model output, usually prompt plus continuation.
        isPositive: Truthy to look for a ``Positive: "..."`` entry, falsy for
            ``Negative: "..."``.
        plainPrompt: Examples-only prompt; every occurrence is removed before matching.

    Returns:
        The stripped text between the quotes, or the string "-1" when no
        complete (closed-quote) entry is found.
    """
    cleaned = generated_text.replace(plainPrompt, "")
    # Normalise curly quotes so the pattern only has to deal with straight ones.
    for curly in ('“', '”'):
        cleaned = cleaned.replace(curly, '"')

    pattern = (
        r'Positive:\s*"\s*([^"]*)\s*"'
        if isPositive
        else r'Negative:\s*"\s*([^"]*)\s*"'
    )
    found = re.search(pattern, cleaned, re.DOTALL)
    if found is None:
        return "-1"
    return found.group(1).strip()

In [35]:
# Repeatedly prompts the model until it generates with correct formatting. 
def generateAndExtractTweets(prompt, label, length, plainPrompt):
    """Generate one well-formatted tweet and the model's logits for it.

    Keeps sampling until findTweets can extract a non-empty tweet; there is no
    retry limit, so this blocks for as long as the model keeps producing
    unparseable output.

    Args:
        prompt: Full k-shot prompt ending in the open ``Positive: "`` / ``Negative: "`` slot.
        label: 1 for a positive tweet; any other value selects the negative pattern.
        length: ``max_length`` budget passed to promptModel.
        plainPrompt: Examples-only prompt that findTweets strips from the generation.

    Returns:
        ``(tweet, logits)``: the extracted tweet text and the logits from one
        forward pass of the model over the tokenized tweet.
    """
    while True:
        generated, _ = promptModel(prompt, length)
        tweet = findTweets(generated, label == 1, plainPrompt)

        # "-1" is findTweets' no-match sentinel; an empty tweet is also rejected.
        if tweet == "-1" or len(tweet) == 0:
            continue

        # Score the tweet with a plain forward pass. Re-sending it through
        # promptModel with max_length = len + 1 sampled one extra token, which
        # ended up appended to the returned text.
        encoded = tokenizer(tweet, return_tensors='pt').to(device)
        with torch.no_grad():
            logits = model(
                encoded['input_ids'],
                attention_mask=encoded['attention_mask'],
            ).logits
        return tweet, logits

In [None]:
# Example: sample one negative tweet (label 0) and display the returned (text, logits) pair.
# 1650 is the length handed down to promptModel, where it is used as max_length.
generateAndExtractTweets(fullNegativePrompt, 0, 1650, examplesNegativePrompt)

### Dataset generation

All of our tests have datasets with 15,000 training datapoints and 2,000 testing datapoints. 

Everything is formatted identically to sentiment140, meaning that we have features of "text" and "label".

In [85]:
# Generates a synthetic dataset given its size and token budget per tweet. It prompts the model the necessary number of times and arranges everything into a Dataset object.

def generateDataset(size, tokensPerTweet, seed=None):
    """Generate a shuffled, balanced synthetic sentiment dataset.

    Uses the notebook-level prompts (fullPositivePrompt, fullNegativePrompt,
    examplesPositivePrompt, examplesNegativePrompt).

    Args:
        size: Total number of rows to generate.
        tokensPerTweet: Length passed to generateAndExtractTweets for each tweet.
        seed: Optional seed forwarded to ``Dataset.shuffle`` for a reproducible
            row order. ``None`` (the default) keeps the shuffle unseeded.

    Returns:
        A ``Dataset`` with columns 'text' and 'label' (1 = positive, 0 = negative),
        containing exactly ``size`` rows.
    """
    positiveCount = size // 2
    # For odd sizes the extra row goes to the negative half so the result
    # really has `size` rows instead of size - 1.
    negativeCount = size - positiveCount

    # (prompt, label, examples-only prompt, rows to generate, completion banner)
    plan = [
        (fullPositivePrompt, 1, examplesPositivePrompt, positiveCount, "FINISHED POSITIVE"),
        (fullNegativePrompt, 0, examplesNegativePrompt, negativeCount, "FINISHED NEGATIVE"),
    ]

    texts = []
    labels = []
    for prompt, label, plainPrompt, count, banner in plan:
        for i in range(count):
            # The logits are not stored in the dataset.
            text, _ = generateAndExtractTweets(prompt, label, tokensPerTweet, plainPrompt)
            texts.append(text)
            labels.append(label)
            # Lightweight progress indicator.
            if i % 20 == 0:
                print(i)
        print(banner)

    return Dataset.from_dict({'text': texts, 'label': labels}).shuffle(seed=seed)

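As the prompt gets larger with more examples, we must also increase the allotted size. The size is the total token budget passed to the model as `max_length` (prompt tokens plus the generated tweet), so it has to grow with k.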
- k = 5 uses a size value of 350 tokens
- k = 10 uses a size value of 500 tokens
- k = 15 uses a size value of 650 tokens
- k = 30 uses a size value of 1150 tokens
- k = 50 uses a size value of 1650 tokens

In [None]:
# Generate a full synthetic dataset (train and test splits) and save it to disk.

# Our dataset used 15k for trainSize, 2k for testSize.
# Replace each CAPITAL_LETTERS placeholder below before running this cell.

trainSize = TRAIN_SIZE
testSize =  TEST_SIZE
# tweetSize is handed to generateDataset as tokensPerTweet; pick it from the k -> size list above.
tweetSize = TWEET_SIZE
saveDir = YOUR_SAVE_DIR

# Each split is generated independently with its own calls to the model.
train = generateDataset(trainSize, tweetSize)
print("\n\nFINISHED TRAIN\n\n")
test = generateDataset(testSize, tweetSize)
print("\n\nFINISHED TEST\n\n")

syntheticDataset = DatasetDict({
    'train': train,
    'test': test
})

# Persist both splits under saveDir, then display the DatasetDict as the cell output.
syntheticDataset.save_to_disk(saveDir)
syntheticDataset