# Setup

In [1]:
!pip install datasets
!pip install transformers[sentencepiece]
!pip install --upgrade transformers
!pip install einops
!pip install openai
!pip install evaluate
!pip install torch
!pip install huggingface_hub
!pip install sentencepiece
!pip install scikit-learn
!pip install accelerate -U
!pip install transformers[torch]

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets)
  Downloading pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm>=4.66.3 (from datasets)
  Downloading tqdm-4.66.5-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16

In [2]:
from datasets import load_dataset, load_from_disk, concatenate_datasets, Dataset
from functools import partial
import pandas as pd
import numpy as np

from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login

import evaluate
import torch
import torch.nn as nn

import zipfile
import tempfile
import shutil
import os
import re

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
def load_and_extract_dataset(zip_path):
    # Create a temporary directory
    temp_dir = '/tmp/dataset_extracted'
    
    # Ensure the directory is clean
    if os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)
    os.makedirs(temp_dir)
    
    # Unzip the file to the temporary directory
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(temp_dir)

    # Load the dataset from the extracted directory
    dataset = load_from_disk(temp_dir)

    return dataset

In [4]:
dataset = load_and_extract_dataset("dataset (4).zip")

In [5]:
positivePrompt = '''
Generate a positive social media tweet on a specific topic. The positive tweet should express enthusiasm or praise. Model your format after these examples:
'''

negativePrompt = '''
Generate a negative social media tweet on a specific topic. The negative tweet should express convey criticism or disappointment. Model your format after these examples: '''

def sampleDataset(samples, k, positivePrompt, negativePrompt, train_dataset = dataset["train"]):
    
    dataset_sentiment_0 = train_dataset.filter(lambda x: x['label'] == 0)
    dataset_sentiment_1 = train_dataset.filter(lambda x: x['label'] == 1)

    dataset_sampled_0 = dataset_sentiment_0.shuffle().select(range(samples//2))
    dataset_sampled_1 = dataset_sentiment_1.shuffle().select(range(samples//2))

    dataset_combined = concatenate_datasets([dataset_sampled_0, dataset_sampled_1])

    sampled_dataset = dataset_combined.shuffle()

    print("Positive: ", sum(1 for example in sampled_dataset if example['label'] == 1))
    print("Negative: ", sum(1 for example in sampled_dataset if example['label'] == 0))
    print(sampled_dataset)

    examples_0 = dataset_sentiment_0.shuffle().select(range(k))["text"]
    examples_1 = dataset_sentiment_1.shuffle().select(range(k))["text"]
    
    for entry in examples_1:
        positivePrompt += f'Positive: "{entry}"\n'

    for entry in examples_0:
        negativePrompt += f'Negative: "{entry}"\n'

    plainPositive = positivePrompt
    plainNegative = negativePrompt
    
    positivePrompt += 'Positive: "'
    negativePrompt += 'Negative: "'
    
    return sampled_dataset, positivePrompt, negativePrompt, plainPositive, plainNegative

In [6]:
sampled_dataset, positivePrompt, negativePrompt, plainPositive, plainNegative = sampleDataset(1000, 5, positivePrompt, negativePrompt)

Filter:   0%|          | 0/1600000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1600000 [00:00<?, ? examples/s]

Positive:  500
Negative:  500
Dataset({
    features: ['text', 'label'],
    num_rows: 1000
})


In [7]:
def promptModel(prompt, length):
    inputs = tokenizer(prompt, return_tensors='pt').to(device)
    attention_mask = inputs['attention_mask'].to(device)

    outputs = model.generate(
        inputs['input_ids'],
        do_sample=True,
        attention_mask=attention_mask,
        num_return_sequences=1,
        max_length=length,
        temperature = 0.9, 
        top_p = 0.9, 
        repetition_penalty=1.2,
        output_scores=True,
        return_dict_in_generate=True,
        pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id
    )

    # Decode the generated text
    text = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)

    with torch.no_grad():
        # Forward pass to get logits
        logits = model(outputs.sequences).logits


    return text, logits

In [8]:
def findTweets(generated_text, isPositive, plainPrompt):
    output = generated_text.replace(plainPrompt, "").replace('“', '"').replace('”', '"')

    if(isPositive):
        positive_match = re.search(r'Positive:\s*"\s*([^"]*)\s*"', output, re.DOTALL)
        positive_tweet = positive_match.group(1).strip() if positive_match else "-1"
        return positive_tweet

    else:
        negative_match = re.search(r'Negative:\s*"\s*([^"]*)\s*"', output, re.DOTALL)
        negative_tweet = negative_match.group(1).strip() if negative_match else "-1"
        return negative_tweet

In [9]:
def generateAndExtractTweets(prompt, label, length, plainPrompt):
            
    while True:
        text, logits = promptModel(prompt, length)
        tweet = findTweets(text, label == 1, plainPrompt)
        
        if tweet != "-1" and len(tweet) > 0:
            length = len(tokenizer.encode(tweet, add_special_tokens=True))
            text, logits = promptModel(tweet, length + 1)  
            return text, logits

# Load up dataset and model

In [10]:
dataset.save_to_disk('/content/dataset')

Saving the dataset (0/1 shards):   0%|          | 0/1600000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/359 [00:00<?, ? examples/s]

In [10]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM
login("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")

model_name = "microsoft/Phi-3-mini-4k-instruct"
# model_name = "mistralai/Mistral-7B-Instruct-v0.3"
# model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# model_name = "distilbert/distilbert-base-uncased"
# model_name = "mistralai/Mistral-Nemo-Instruct-2407"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

torch.cuda.empty_cache()

if(torch.cuda.is_available()):
  model = model.cuda()

if tokenizer.pad_token is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


tokenizer_config.json:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/16.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

In [11]:
# model.load_state_dict(torch.load('mistral KGAN/generator.pth'))
model.load_state_dict(torch.load('Phi- KGAN/generator.pth'))

<All keys matched successfully>

In [None]:
print(positivePrompt)

In [12]:
generateAndExtractTweets(positivePrompt, 1, 230, plainPositive)

You are not running the flash-attention implementation, expect numerical differences.
We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


('Love hearing people talk about my music and how much they enjoy listening to me sing,',
 tensor([[[16.9435, 16.3223, 18.8655,  ..., 12.7078, 12.7083, 12.7086],
          [32.5159, 32.2229, 33.3793,  ..., 29.7334, 29.7398, 29.7409],
          [35.8262, 32.1499, 33.9334,  ..., 29.8708, 29.8761, 29.8798],
          ...,
          [34.9197, 34.1015, 31.7473,  ..., 28.3931, 28.3976, 28.3993],
          [33.7171, 32.7334, 31.5962,  ..., 29.2211, 29.2272, 29.2288],
          [34.2333, 35.3409, 34.3102,  ..., 30.2819, 30.2872, 30.2892]]],
        device='cuda:0'))

In [49]:
import shutil

syntheticDataset = {
    'text': [],
    'label': []
}

for i in range (500):
    text, logits = generateAndExtractTweets(positivePrompt, 1, 230, plainPositive)
    syntheticDataset["text"].append(text)
    syntheticDataset["label"].append(1)
    if(i % 20 == 0):
        print(i)

print("FINISHED POSITIVE")

for i in range (500):
    text, logits = generateAndExtractTweets(negativePrompt, 0, 230, plainNegative)
    syntheticDataset["text"].append(text)
    syntheticDataset["label"].append(0)
    if(i % 20 == 0):
        print(i)

syntheticDataset = Dataset.from_dict(syntheticDataset)
syntheticDataset= syntheticDataset.shuffle()
print(syntheticDataset)

syntheticDataset.save_to_disk("MistralCombined")

KeyboardInterrupt: 

In [None]:
test_dataset.save_to_disk("test")

In [13]:
syntheticDataset = Dataset.load_from_disk("PhiCombined")

In [14]:
dataset = load_and_extract_dataset("dataset (4).zip")

In [15]:
def resetPrompts():
    positivePrompt = '''
    Generate a positive social media tweet on a specific topic. The positive tweet should express enthusiasm or praise. Model your format after these examples:'''

    negativePrompt = '''
    Generate a negative social media tweet on a specific topic. The negative tweet should express convey criticism or disappointment. Model your format after these examples: '''

    return positivePrompt, negativePrompt


In [16]:
from sklearn.model_selection import train_test_split
import torch
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from datasets import Dataset
import pandas as pd
from datasets import Dataset, Value

In [93]:
syntheticDataset.features['label'].dtype

'int64'

In [51]:
syntheticDataset

{'text': [], 'label': []}

In [17]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Assuming `dataset` is your Hugging Face dataset
def convert_label_to_int32(example):
    example['label'] = np.int32(example['label'])
    return example


syntheticDataset = syntheticDataset.cast_column('label', Value("int32"))


# Apply the transformation to the dataset
syntheticDataset = syntheticDataset.map(convert_label_to_int32)
synthetic_0 = syntheticDataset.filter(lambda x: x['label'] == 0)
synthetic_1 = syntheticDataset.filter(lambda x: x['label'] == 1)

positivePrompt, negativePrompt = resetPrompts()

one, x, y, z, a = sampleDataset(500, 5, positivePrompt, negativePrompt)
# one = one.map(lambda example: {**example, 'label': 1})

two = concatenate_datasets([synthetic_0.select(range(250)), synthetic_1.select(range(250))])
# two = two.map(lambda example: {**example, 'label': 0})

train_dataset = concatenate_datasets([one,two])

three, x, y, z, a = sampleDataset(150, 5, positivePrompt, negativePrompt)
# three = three.map(lambda example: {**example, 'label': 1})

four = concatenate_datasets([synthetic_0.select(range(250, 325)), synthetic_1.select(range(250, 325))])
# four = four.map(lambda example: {**example, 'label': 0})

test_dataset = concatenate_datasets([three,four])

train_dataset = train_dataset.shuffle()
test_dataset = test_dataset.shuffle()

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Filter:   0%|          | 0/1600000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1600000 [00:00<?, ? examples/s]

Positive:  250
Negative:  250
Dataset({
    features: ['text', 'label'],
    num_rows: 500
})
Positive:  75
Negative:  75
Dataset({
    features: ['text', 'label'],
    num_rows: 150
})


In [18]:
def format_labels(example):
    example['label'] = torch.tensor(example['label'], dtype=torch.long)
    return example
    
train_dataset = train_dataset.map(format_labels)
test_dataset = test_dataset.map(format_labels)

def preprocess_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

train_dataset = train_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)


# Set format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_steps = 20,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
trainer.can_return_loss = True
# Fine-tune the model
trainer.train()
# Save the tokenizer and model

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,0.4984,0.503792
2,0.4236,0.738301
3,0.1994,0.978852
4,0.0707,1.050919
5,0.0006,1.10445


TrainOutput(global_step=835, training_loss=0.2286167025644176, metrics={'train_runtime': 86.1733, 'train_samples_per_second': 58.023, 'train_steps_per_second': 9.69, 'total_flos': 328888819200000.0, 'train_loss': 0.2286167025644176, 'epoch': 5.0})

In [58]:
dataset["train"]["text"][5]

'@Kwesidei not the whole crew '

In [23]:
from torch.nn.functional import softmax
import random


def evaluate_comment(comment):
    inputs = tokenizer(comment, return_tensors='pt', padding=True, truncation=True, max_length=128)
    inputs = inputs.to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        probs = softmax(outputs.logits, dim=1)
        prediction = torch.argmax(probs, dim=1).item()

    return prediction

    
results = []
index = 0
for i in range (5):
    numberCorrect = 0
    for k in range(10):
        response = evaluate_comment(dataset["test"]["text"][index])

        

        print(response)

        if(response == dataset["train"]["label"][randomIndex]): 
            numberCorrect += 1

        print("Correct: ", dataset["train"]["label"][randomIndex])
        print("Answered: ", response)
            
        index += 1
        
    percentage = numberCorrect/10
    print(percentage)
    results.append(percentage)
            

1
Correct:  0
Answered:  1
1
Correct:  0
Answered:  1
1
Correct:  0
Answered:  1
1
Correct:  0
Answered:  1
1
Correct:  0
Answered:  1
0
Correct:  0
Answered:  0
0
Correct:  0
Answered:  0
1
Correct:  0
Answered:  1
1
Correct:  0
Answered:  1
0
Correct:  0
Answered:  0
0.3
0
Correct:  0
Answered:  0
1
Correct:  0
Answered:  1
1
Correct:  0
Answered:  1
0
Correct:  0
Answered:  0
0
Correct:  0
Answered:  0
1
Correct:  0
Answered:  1
1
Correct:  0
Answered:  1
1
Correct:  0
Answered:  1
0


KeyboardInterrupt: 

In [27]:
0.9 + 0.9 +1

2.8