#### This notebook generates and stores a synthetic dataset for either a fine-tuned GAN model or a regular model.

In [None]:
!pip install datasets
!pip install transformers[sentencepiece]
!pip install --upgrade transformers
!pip install einops
!pip install torch
!pip install huggingface_hub
!pip install sentencepiece

In [6]:
import os
import re

import numpy as np
import pandas as pd
import torch

from datasets import load_dataset, load_from_disk, concatenate_datasets, Dataset, DatasetDict
from huggingface_hub import login
from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM


# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [8]:
# Load the DatasetDict stored under the relative path "dataset". Later cells
# read its 'train' split and the integer 'label' / string 'text' columns.
dataset = DatasetDict.load_from_disk("dataset")

In [9]:
def resetPrompts(needsExamples):
    """Return the base instruction prompts for the two sentiment classes.

    Args:
        needsExamples: Truthy when few-shot examples will be appended after
            the instruction; the prompts then end by announcing them.

    Returns:
        A ``(positivePrompt, negativePrompt)`` tuple of strings, each ending
        with a newline.
    """
    # Few-shot variants: the instruction closes by introducing the examples.
    fewShotPrompts = (
        '''Generate an enthusiastic and positive social media tweet. Your tweet should express praise or excitement. Use the following examples as a guide for formatting and tone:\n''',
        '''Generate a disapproving and negative social media tweet. The tweet should convey criticism or disappointment. Use the following examples as a guide for formatting and tone:\n''',
    )
    # Zero-shot variants: the bare instruction only.
    zeroShotPrompts = (
        '''Generate an enthusiastic and positive social media tweet. Your tweet should express praise or excitement.\n''',
        '''Generate a disapproving and negative social media tweet. The tweet should convey criticism or disappointment.\n''',
    )

    return fewShotPrompts if needsExamples else zeroShotPrompts

In [11]:
# Returns a sampled version of the dataset and a prompt with k examples.
def sampleDataset(samples, k, seed=None):
    """Draw a label-balanced sample of ``dataset['train']`` and build prompts.

    Args:
        samples: Total number of rows wanted. ``samples // 2`` rows are taken
            per label, so an odd value yields ``samples - 1`` rows.
        k: Number of few-shot examples per label appended to each prompt.
            ``0`` produces zero-shot prompts.
        seed: Optional seed forwarded to every ``shuffle`` call so the sample
            can be reproduced. The default ``None`` leaves the shuffles
            unseeded, exactly as before this parameter existed.

    Returns:
        ``(sampled_dataset, positivePrompt, negativePrompt, plainPositive,
        plainNegative)``. The ``plain*`` strings are the instruction plus any
        examples; ``positivePrompt`` / ``negativePrompt`` are the same text
        with an opening ``Positive: "`` / ``Negative: "`` cue appended.
    """
    positivePrompt, negativePrompt = resetPrompts(k != 0)

    # Split the training data by label: 0 is negative, 1 is positive.
    train_dataset = dataset['train']
    dataset_sentiment_0 = train_dataset.filter(lambda x: x['label'] == 0)
    dataset_sentiment_1 = train_dataset.filter(lambda x: x['label'] == 1)

    per_label = samples // 2
    dataset_sampled_0 = dataset_sentiment_0.shuffle(seed=seed).select(range(per_label))
    dataset_sampled_1 = dataset_sentiment_1.shuffle(seed=seed).select(range(per_label))
    sampled_dataset = concatenate_datasets([dataset_sampled_0, dataset_sampled_1]).shuffle(seed=seed)

    # Read the label column once instead of iterating row dicts twice.
    sampled_labels = list(sampled_dataset['label'])
    print("Positive: ", sampled_labels.count(1))
    print("Negative: ", sampled_labels.count(0))
    print(sampled_dataset)

    #Add examples only if needed
    if k != 0:
        # Few-shot examples are the first k rows of each (unshuffled) label
        # subset, so they are the same on every call.
        for entry in dataset_sentiment_1.select(range(k))["text"]:
            positivePrompt += f'Positive: "{entry}"\n'
        for entry in dataset_sentiment_0.select(range(k))["text"]:
            negativePrompt += f'Negative: "{entry}"\n'

    # Keep the prompts without the trailing cue; callers strip this text from
    # the model output before extracting the tweet.
    plainPositive = positivePrompt
    plainNegative = negativePrompt

    positivePrompt += 'Positive: "'
    negativePrompt += 'Negative: "'

    return sampled_dataset, positivePrompt, negativePrompt, plainPositive, plainNegative

In [7]:
# Sampling configuration: k few-shot examples per label, `size` rows in total.
k = 0
size = 1000
# Pass `size` rather than repeating the literal so the two cannot drift apart.
sampled_dataset, positivePrompt, negativePrompt, plainPositive, plainNegative = sampleDataset(size, k)

Filter:   0%|          | 0/1600000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1600000 [00:00<?, ? examples/s]

Positive:  500
Negative:  500
Dataset({
    features: ['text', 'label'],
    num_rows: 1000
})


In [12]:
# Load model directly
# NOTE: YOUR_KEY is not defined anywhere in this notebook. Supply your Hugging
# Face token here, preferably read from an environment variable rather than
# pasted into the notebook.
login(YOUR_KEY)

model_name = "microsoft/Phi-3-mini-4k-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

torch.cuda.empty_cache()

# Place the model on the device selected in the imports cell so that model and
# tokenized inputs always end up in the same place.
model = model.to(device)

# Fall back to EOS for padding when the tokenizer ships without a pad token.
# Adding a brand-new '[PAD]' token instead would assign it an id outside the
# model's embedding table unless the embeddings were resized as well.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
def promptModel(prompt, length):
    """Sample one continuation of ``prompt`` and return it with its logits.

    Args:
        prompt: Text to tokenize and feed to the model.
        length: Forwarded to ``model.generate`` as ``max_length``, which
            bounds the prompt tokens plus the newly generated tokens.

    Returns:
        ``(text, logits)``: ``text`` is the full generated sequence decoded
        with special tokens stripped (callers remove the prompt themselves);
        ``logits`` comes from a separate no-grad forward pass over that same
        sequence.
    """
    inputs = tokenizer(prompt, return_tensors='pt').to(device)

    # A pad id of 0 is valid but falsy, so test for None explicitly instead of
    # using `or`, which would silently swap it for the EOS id.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        do_sample=True,
        num_return_sequences=1,
        max_length=length,
        temperature=0.9,
        top_p=0.9,
        repetition_penalty=1.2,
        # Per-step scores were requested before but never read; only the
        # token sequences are needed here.
        return_dict_in_generate=True,
        pad_token_id=pad_token_id,
    )

    # Decode the generated text
    text = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)

    with torch.no_grad():
        # Forward pass to get logits
        logits = model(outputs.sequences).logits

    return text, logits

In [14]:
def findTweets(generated_text, isPositive, plainPrompt):
    """Extract the first quoted tweet of the requested sentiment.

    Args:
        generated_text: Decoded model output to search.
        isPositive: Truthy to look for a ``Positive: "..."`` entry, falsy to
            look for a ``Negative: "..."`` entry.
        plainPrompt: Text removed from ``generated_text`` before searching.

    Returns:
        The whitespace-stripped text between the quotes of the first match,
        or the string ``"-1"`` when no entry is found.
    """
    # Remove the echoed prompt, then normalise curly quotes to straight ones
    # so a single pattern covers both quoting styles.
    cleaned = generated_text.replace(plainPrompt, "")
    for curlyQuote in ('“', '”'):
        cleaned = cleaned.replace(curlyQuote, '"')

    if isPositive:
        pattern = r'Positive:\s*"\s*([^"]*)\s*"'
    else:
        pattern = r'Negative:\s*"\s*([^"]*)\s*"'

    found = re.search(pattern, cleaned, re.DOTALL)
    if found is None:
        return "-1"
    return found.group(1).strip()

In [15]:
def generateAndExtractTweets(prompt, label, length, plainPrompt, max_attempts=None):
    """Prompt the model until a tweet can be extracted, then re-run it on the tweet.

    Args:
        prompt: Full prompt passed to ``promptModel``.
        label: Sentiment label; ``1`` selects the positive pattern in
            ``findTweets``, anything else the negative one.
        length: ``length`` argument for the first ``promptModel`` call.
        plainPrompt: Prompt text that ``findTweets`` strips from the output.
        max_attempts: Optional cap on generation attempts. ``None`` (the
            default) retries indefinitely, as before this parameter existed.

    Returns:
        ``(text, logits)`` from a second ``promptModel`` call made on the
        extracted tweet alone.

    Raises:
        RuntimeError: If ``max_attempts`` is set and no non-empty tweet was
            extracted within that many attempts.
    """
    attempts = 0
    while max_attempts is None or attempts < max_attempts:
        attempts += 1
        text, _ = promptModel(prompt, length)
        tweet = findTweets(text, label == 1, plainPrompt)

        if tweet != "-1" and len(tweet) > 0:
            # Use a separate name so the caller's `length` is not shadowed.
            tweetLength = len(tokenizer.encode(tweet, add_special_tokens=True))
            # NOTE(review): max_length is the tweet's token count + 1, so the
            # returned text may carry one generated token beyond the tweet.
            # Confirm this is intended.
            text, logits = promptModel(tweet, tweetLength + 1)
            return text, logits

    raise RuntimeError(
        f"No tweet could be extracted after {max_attempts} generation attempts"
    )

In [16]:
#Generates a dataset given its size and tokens per tweet. It prompts the model the necessary number of times and arranges everything into a Dataset object.
def generateDataset (size, tokensPerTweet):
    """Generate ``size // 2`` positive and ``size // 2`` negative tweets.

    Reads the notebook-level ``positivePrompt``, ``negativePrompt``,
    ``plainPositive`` and ``plainNegative`` strings.

    Args:
        size: Total number of tweets requested; half are generated per label.
        tokensPerTweet: ``length`` passed through to
            ``generateAndExtractTweets`` for each generation.

    Returns:
        A shuffled ``Dataset`` with a 'text' column and a 'label' column
        (1 for positive, 0 for negative).
    """
    texts = []
    labels = []
    perLabel = size // 2

    # One pass per sentiment: (prompt, label, prompt to strip, completion banner).
    passes = (
        (positivePrompt, 1, plainPositive, "FINISHED POSITIVE"),
        (negativePrompt, 0, plainNegative, "FINISHED NEGATIVE"),
    )

    for classPrompt, classLabel, classPlain, banner in passes:
        for step in range(perLabel):
            # The logits are not stored in the dataset, only the text.
            tweetText, _ = generateAndExtractTweets(classPrompt, classLabel, tokensPerTweet, classPlain)
            texts.append(tweetText)
            labels.append(classLabel)
            # Progress marker every 20 tweets.
            if step % 20 == 0:
                print(step)
        print(banner)

    return Dataset.from_dict({'text': texts, 'label': labels}).shuffle()
    

In [22]:
# Generate a full dataset with 15k training datapoints and 2k testing datapoints.
# Each call produces half positive and half negative tweets, using 100 as the
# tokensPerTweet value handed to generateDataset.
train = generateDataset(15000, 100)
print("\n\n\n\nFINISHED TRAIN\n\n\n\n")
test = generateDataset(2000, 100)
print("\n\n\n\nFINISHED TEST\n\n\n\n")

# Bundle both splits into one DatasetDict so they are saved together.
syntheticDataset = DatasetDict({
    'train': train,
    'test': test
})

# NOTE: YOUR_PATH is not defined anywhere in this notebook; set it to the
# output directory before running this cell.
syntheticDataset.save_to_disk(YOUR_PATH)

0
20
40
60
80
100
120
140
160
180
200
220
240
260
280
300
320
340
360
380
400
420
440
460
480
500
520
540
560
580
600
620
640
660
680
700
720
740
760
780
800
820
840
860
880
900
920
940
960
980
1000
1020
1040
1060
1080
1100
1120
1140
1160
1180
1200
1220
1240
1260
1280
1300
1320
1340
1360
1380
1400
1420
1440
1460
1480
1500
1520
1540
1560
1580
1600
1620
1640
1660
1680
1700
1720
1740
1760
1780
1800
1820
1840
1860
1880
1900
1920
1940
1960
1980
2000
2020
2040
2060
2080
2100
2120
2140
2160
2180
2200
2220
2240
2260
2280
2300
2320
2340
2360
2380
2400
2420
2440
2460
2480
2500
2520
2540
2560
2580
2600
2620
2640
2660
2680
2700
2720
2740
2760
2780
2800
2820
2840
2860
2880
2900
2920
2940
2960
2980
3000
3020
3040
3060
3080
3100
3120
3140
3160
3180
3200
3220
3240
3260
3280
3300
3320
3340
3360
3380
3400
3420
3440
3460
3480
3500
3520
3540
3560
3580
3600
3620
3640
3660
3680
3700
3720
3740
FINISHED POSITIVE
0
20
40
60
80
100
120
140
160
180
200
220
240
260
280
300
320
340
360
380
400
420
440
460
480
500


Saving the dataset (0/1 shards):   0%|          | 0/7500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]