In [1]:
!pip install datasets
!pip install transformers[sentencepiece]
!pip install --upgrade transformers
!pip install einops
!pip install openai
!pip install evaluate
!pip install torch
!pip install huggingface_hub
!pip install sentencepiece

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets)
  Downloading pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm>=4.66.3 (from datasets)
  Downloading tqdm-4.66.5-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manyli

In [16]:
from datasets import load_dataset, load_from_disk, concatenate_datasets
import pandas as pd
import numpy as np

from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login

import evaluate
import torch
import torch.nn as nn

import zipfile
import tempfile
import shutil
import os
import re

In [3]:
def load_and_extract_dataset(zip_path):
    # Create a temporary directory
    temp_dir = '/tmp/dataset_extracted'
    
    # Ensure the directory is clean
    if os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)
    os.makedirs(temp_dir)
    
    # Unzip the file to the temporary directory
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(temp_dir)

    # Load the dataset from the extracted directory
    dataset = load_from_disk(temp_dir)

    return dataset

In [4]:
dataset = load_and_extract_dataset("dataset (1).zip")
# tokenized_datasets = load_and_extract_dataset("tokenized_datasets2.zip")

In [36]:
def sampleDataset(samples):
    train_dataset = dataset['train']

    dataset_sentiment_0 = train_dataset.filter(lambda x: x['label'] == 0)
    dataset_sentiment_1 = train_dataset.filter(lambda x: x['label'] == 1)

    dataset_sampled_0 = dataset_sentiment_0.shuffle().select(range(samples//2))
    dataset_sampled_1 = dataset_sentiment_1.shuffle().select(range(samples//2))

    dataset_combined = concatenate_datasets([dataset_sampled_0, dataset_sampled_1])

    sampled_dataset = dataset_combined.shuffle()

    print("Positive: ", sum(1 for example in sampled_dataset if example['label'] == 1))
    print("Negative: ", sum(1 for example in sampled_dataset if example['label'] == 0))
    print(sampled_dataset)

    return sampled_dataset
    

In [37]:
sampled_dataset = sampleDataset(1000)

Positive:  500
Negative:  500
Dataset({
    features: ['text', 'label'],
    num_rows: 1000
})


In [85]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "microsoft/Phi-3-mini-4k-instruct"
# model_name = "mistralai/Mistral-7B-Instruct-v0.3"
# model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# model_name = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

torch.cuda.empty_cache()

if(torch.cuda.is_available()):
  model = model.cuda()

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


tokenizer_config.json:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/16.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

In [62]:
def promptModel(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    attention_mask = inputs['attention_mask'].to("cuda")

    output_sequences = model.generate(
        input_ids=inputs.input_ids,
        max_length=100,
        do_sample=True,
        attention_mask=attention_mask,
        num_return_sequences=1,
        output_scores=True,
        return_dict_in_generate=True, 
        temperature = 0.9, 
        top_p = 0.9, 
        repetition_penalty=1.2
    )

    generated_tokens = output_sequences.sequences
    generated_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
    
    with torch.no_grad():
        # Forward pass to get logits
        logits = model(generated_tokens).logits

    return generated_text, logits

In [51]:
def findTweets(generated_text, isPositive):
    output = generated_text.replace('“', '"').replace('”', '"')

    if(isPositive):
        positive_match = re.search(r'Positive:\s*"\s*([^"]*)\s*"', output, re.DOTALL)
        positive_tweet = positive_match.group(1).strip() if positive_match else "-1"
        return positive_tweet

    else:
        negative_match = re.search(r'Negative:\s*"\s*([^"]*)\s*"', output, re.DOTALL)
        negative_tweet = negative_match.group(1).strip() if negative_match else "-1"
        return negative_tweet

In [52]:
def generateAndExtractTweets(positivePrompt, negativePrompt,  batchSize):

    outputs = []
    promptUsed = ""
    isPositive = True
    
    for i in range (batchSize):
        
        if(i <= 2):
            promptUsed = positivePrompt
            isPositive = True
        else:
            promptUsed = negativePrompt
            isPositive = False
            
        while True:
            generated_text, logits = promptModel(promptUsed)
            tweet = findTweets(generated_text, isPositive)
        
            if tweet != "-1" and len(tweet) > 0:
                newOutput = [tweet, logits]
                outputs.append(newOutput)
                break

    return outputs

In [53]:
import torch.optim as optim
from transformers import AdamW

class Discriminator(nn.Module):
  def __init__(self, vocab_size, embed_size):
    super(Discriminator, self).__init__()
    self.embedding = nn.Embedding(vocab_size, embed_size)
    self.lstm = nn.LSTM(embed_size, 128, batch_first=True)
    self.fc = nn.Linear(128, 1)
    self.sigmoid = nn.Sigmoid()

  def forward(self, input_ids):
    embeds = self.embedding(input_ids)
    _, (hidden, _) = self.lstm(embeds)
    output = self.fc(hidden[-1])
    return self.sigmoid(output)
#generator and discriminator instantiation
generator = model

if(torch.cuda.is_available()):
  discriminator = Discriminator(len(tokenizer), 768).cuda()
else:
    discriminator = Discriminator(len(tokenizer), 768)

#optimizers
optimizerG = AdamW(generator.parameters(), lr=5e-5)
optimizerD = optim.Adam(discriminator.parameters(), lr=5e-5)
#loss function
criterion = nn.BCEWithLogitsLoss()
data_loader = torch.utils.data.DataLoader(sampled_dataset, batch_size=1, shuffle=True)



In [44]:
for batch in data_loader:
    print(batch.keys())
    break

dict_keys(['text', 'label'])


In [12]:
sampled_dataset[370]

{'text': 'Not feeling to good, I think I hurt my hand pretty bad ... typing is not fun right now, not a good thing for a writer  ',
 'label': 0}

In [83]:
positivePrompt = '''
Generate a positive social media tweet on a specific topic. Ensure your tweet is enclosed in straight double quotation marks. Provide ONLY one tweet. The positive tweet should express enthusiasm or praise. 

Positive: "'''

In [41]:
negativePrompt = '''
Generate a negative social media tweet on a specific topic. Ensure your tweet is enclosed in straight double quotation marks and separated by a colon. Provide ONLY one tweet. The negative tweet should express convey criticism or disappointment. 

Negative: "'''

In [90]:
output = promptModel(positivePrompt)
output[0]

'\nGenerate a positive social media tweet on a specific topic. Ensure your tweet is enclosed in straight double quotation marks. Provide ONLY one tweet. The positive tweet should express enthusiasm or praise. \n\nPositive: "Thrilled to see our team\'s dedication and hard work lead us towards success! Keep shining, everyone!" #TeamSpirit @CompanyName'

In [148]:
output = generateAndExtractTweets(prompt, prompt, 2)

output

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[["We're celebrating National Teacher Appreciation Week by recognizing the hard work of all educators who dedicate their lives to shaping young minds! Their selflessness, patience & love inspire us every day",
  tensor([[[-2.9512,  1.5942,  7.4129,  ...,  1.6402,  1.6403,  1.6403],
           [ 7.8343,  6.6701, 13.2224,  ..., -5.3764, -5.3765, -5.3765],
           [ 0.9604, -0.9753,  0.2828,  ..., -6.5249, -6.5248, -6.5249],
           ...,
           [ 4.2491,  4.9086,  1.4445,  ..., -5.6226, -5.6225, -5.6227],
           [ 4.6136,  4.7835,  1.6697,  ..., -4.5970, -4.5970, -4.5973],
           [ 2.2296,  4.5771,  2.0076,  ..., -3.9358, -3.9359, -3.9360]]],
         device='cuda:0')],
 ["We are all born with the power to choose our own path! Let's support each other along the way, every step of this incredible journey we call life.",
  tensor([[[-2.9512,  1.5942,  7.4129,  ...,  1.6402,  1.6403,  1.6403],
           [ 7.8343,  6.6701, 13.2224,  ..., -5.3764, -5.3765, -5.3765],
        

In [17]:
def tokenizeData(texts):
    return tokenizer(texts, truncation=True, padding=True, return_tensors='pt')

In [129]:
for batch in data_loader:
    print(batch) 
    break

{'text': ['Me and Momo just saw a crazy ass bitch at Walmart! All bad; but we got most of the stuff on our list '], 'label': tensor([1])}


In [134]:
import torch
import numpy as np
import os

# For more accurate error messages
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ["PYTORCH_USE_CUDA_DSA"] = "1"

prompt = 'Generate one positive tweet and one negative tweet on a very specific topic. Ensure that each tweet is enclosed in straight quotation marks (""). The positive tweet should express enthusiasm or praise, and the negative tweet should convey criticism or disappointment. Your comments should include specific features, aspects, or things that are praised/critisized. Make sure to include a space after the colon. \nPositive: "'
batches = 1
epochs = 5
for epoch in range(epochs):
    for batch in data_loader:
        
        # Convert the input batch into a PyTorch tensor and move to GPU
        padded_inputs = tokenizer(batch['text'], padding=True, return_tensors="pt", truncation=True)
        real_text = padded_inputs['input_ids'].cuda()

        # Generate fake text logits using noise input
        noise = torch.randint(0, len(tokenizer), (1, 16)).cuda()
        outputs = generator(noise)
        fake_logits = outputs.logits.argmax(dim=-1).detach().cuda()  # Detach to avoid backpropagation
        fake_logits_raw = outputs.logits.cuda()  # Raw logits before argmax for shape consistency
        
        # Reset gradients for discriminator
        discriminator.zero_grad()

        # Create real labels tensor and move to GPU
        real_labels = torch.ones((real_text.size(0), 1), dtype=torch.float).cuda()

        # Get discriminator's prediction on real text and calculate loss
        real_output = discriminator(real_text)
        lossD_real = criterion(real_output.view(-1, 1), real_labels)
        
        # Reshape fake logits to ensure correct shape
        fake_logits = fake_logits.view(-1, 1).cuda()

        # Get discriminator's prediction on fake text and calculate loss
        fake_output = discriminator(fake_logits)
        
        # Create fake labels tensor and move to GPU
        fake_labels = torch.zeros((fake_output.size(0), 1), dtype=torch.float).cuda()
        
        lossD_fake = criterion(fake_output, fake_labels)

        # Combine real and fake losses for the discriminator
        lossD = lossD_real + lossD_fake
        lossD.backward()
        
        torch.nn.utils.clip_grad_norm_(discriminator.parameters(), max_norm=1.0)
        
        optimizerD.step()

        # Reset gradients for generator
        generator.zero_grad()

        # Generate new fake text logits
        with torch.no_grad():
            outputs = generator(noise)
        fake_logits = outputs.logits.argmax(dim=-1).cuda()

        # Get discriminator's assessment of the newly generated fake data
        fake_output = discriminator(fake_logits.view(-1, 1).cuda())

        real_labels = torch.ones((fake_output.size(0), 1), dtype=torch.float).cuda()

        # Calculate the generator's loss

        lossG = criterion(fake_output.view(-1, 1), real_labels)
        lossG.backward()
        
        torch.nn.utils.clip_grad_norm_(generator.parameters(), max_norm=1.0)

        optimizerG.step()
        
        print(f'Epoch [{epoch}/{epochs}] Loss_D: {lossD.item():.4f} Loss_G: {lossG.item():.4f}')
        print("BATCH NUMBER " + str(batches))
        batches+=1
        del padded_inputs, real_text, noise, outputs, fake_logits, fake_logits_raw, real_labels, real_output, fake_labels, fake_output
        # gc.collect(generation=2)
        torch.cuda.empty_cache()

Epoch [0/5] Loss_D: 1.4040 Loss_G: 0.4741
BATCH NUMBER 1
Epoch [0/5] Loss_D: 1.4569 Loss_G: 0.4639
BATCH NUMBER 2
Epoch [0/5] Loss_D: 1.4063 Loss_G: 0.4734
BATCH NUMBER 3
Epoch [0/5] Loss_D: 1.4101 Loss_G: 0.4698
BATCH NUMBER 4
Epoch [0/5] Loss_D: 1.4122 Loss_G: 0.4699
BATCH NUMBER 5
Epoch [0/5] Loss_D: 1.3919 Loss_G: 0.4779
BATCH NUMBER 6
Epoch [0/5] Loss_D: 1.3755 Loss_G: 0.4774
BATCH NUMBER 7
Epoch [0/5] Loss_D: 1.3977 Loss_G: 0.4715
BATCH NUMBER 8
Epoch [0/5] Loss_D: 1.4558 Loss_G: 0.4775
BATCH NUMBER 9
Epoch [0/5] Loss_D: 1.4432 Loss_G: 0.4721
BATCH NUMBER 10
Epoch [0/5] Loss_D: 1.3837 Loss_G: 0.4801
BATCH NUMBER 11
Epoch [0/5] Loss_D: 1.4556 Loss_G: 0.4444
BATCH NUMBER 12
Epoch [0/5] Loss_D: 1.3989 Loss_G: 0.4685
BATCH NUMBER 13
Epoch [0/5] Loss_D: 1.3945 Loss_G: 0.4773
BATCH NUMBER 14
Epoch [0/5] Loss_D: 1.3883 Loss_G: 0.4724
BATCH NUMBER 15
Epoch [0/5] Loss_D: 1.4492 Loss_G: 0.4718
BATCH NUMBER 16
Epoch [0/5] Loss_D: 1.3905 Loss_G: 0.4719
BATCH NUMBER 17
Epoch [0/5] Loss_D: 1.3

In [None]:
torch.save(model, 'model.pth')

In [135]:
#Epoch [4/5] Loss_D: 1.2244 Loss_G: 0.5528

In [4]:
model = torch.load('model.pth')

RuntimeError: PytorchStreamReader failed reading zip archive: failed finding central directory

In [152]:
generator.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (n

In [12]:
batch_size = 1
noise_dim = 32  # replace with the appropriate noise dimension used during training
noise = torch.randint(0, len(tokenizer), (batch_size, noise_dim)).to('cuda')

# Run the generator to generate new data
with torch.no_grad():  # no gradient calculation needed during inference
    outputs = generator(noise)

# Post-process the output if necessary
generated_text_logits = outputs.logits
generated_text_ids = generated_text_logits.argmax(dim=-1)

# Convert generated token IDs to text using the tokenizer
generated_text = tokenizer.batch_decode(generated_text_ids, skip_special_tokens=True)

# Print or use the generated text
for i, text in enumerate(generated_text):
    print(f"Generated Text {i+1}: {text}")


Generated Text 1: !
