In [17]:
import subprocess
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset, Dataset
from torch.utils.data import DataLoader
import numpy as np

result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
output = result.stdout
for line in output.splitlines():
    if '=' in line:
        var, value = line.split('=', 1)
        os.environ[var] = value

LM_MODEL = "august66/qwen2-sft-final"
SENTIMENT_MODEL = "siebert/sentiment-roberta-large-english"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

sft_model = AutoModelForCausalLM.from_pretrained(
    LM_MODEL,
    torch_dtype = torch.float16
)
sft_tokenizer = AutoTokenizer.from_pretrained("august66/qwen2-sft-final") 
sft_tokenizer.pad_token = sft_tokenizer.eos_token
sft_model.config.pad_token_id = sft_model.config.eos_token_id
sentiment_tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_MODEL)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_MODEL)

dataset_test = load_dataset("stanfordnlp/imdb", split="test")
def prompt_completion_preprocess(example):
    words = example['text'].split()
    prompt = ' '.join(words[:5])
    completion = ' '.join(words[5:])
    return {'prompt': prompt, 'completion': completion}
dataset_test = dataset_test.map(prompt_completion_preprocess, remove_columns=['text', 'label'])


print(f"Allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
print(f"Reserved : {torch.cuda.memory_reserved()  / 1024**3:.2f} GB")

Allocated: 0.01 GB
Reserved : 1.75 GB


In [None]:
from tqdm import tqdm
def tokenize(batch):

    inputs = sft_tokenizer(
        batch['prompt'],
        padding = 'max_length',
        truncation = True,
        max_length = 128,
        padding_side = 'left',
        add_special_tokens = True, 
        return_tensors = 'pt'
    )

    return {k:v for k, v in inputs.items()}

tokenized_inputs = dataset_test.map(
    tokenize, 
    batched = True,
    batch_size = 32, 
    remove_columns = ['prompt', 'completion']
).with_format('torch', columns = ['input_ids', 'attention_mask'])


tokenized_inputs_dataloader = DataLoader(
    tokenized_inputs,
    batch_size = 32, 
    shuffle = True,
    pin_memory = True 
)

decoded_outputs = []
with torch.inference_mode():
    sft_model = sft_model.to(device)
    for input_batch in tqdm(tokenized_inputs_dataloader):
        input_batch = {k:v.to(device) for k, v in input_batch.items()}
        batch_size = input_batch['input_ids'].shape[0]

        outputs = sft_model.generate(
            **input_batch,
            num_return_sequences = 2, 
            do_sample = True,
            top_k=50,
            top_p=0.95,
            temperature=1.0,
            use_cache = True,
            pad_token_id = sft_model.config.eos_token_id,
            max_new_tokens=10
        ).view(batch_size * 2, -1).cpu()
        
        decoded_outputs_batch = sft_tokenizer.batch_decode(
            outputs,
            skip_special_tokens = True
        )

        decoded_outputs.extend(decoded_outputs_batch)

prompt_completion_data = [
    {
        'prompt':prompt,
        'completion_1':decoded_outputs[2*index],
        'completion_2':decoded_outputs[2*index+1]
    }
    for index, prompt in enumerate(dataset_test['prompt'])
]

prompt_completion_dataset = Dataset.from_list(prompt_completion_data)






100%|██████████| 782/782 [03:22<00:00,  3.86it/s]


In [None]:
completions_dataset = prompt_completion_dataset.select_columns(['completion_1', 'completion_2'])
completions_dataset_flat = [x for row in completions_dataset for x in row]
tokenized_completion = sentiment_tokenizer(
    text = completions_dataset_flat,
    max_length = 128,
    add_special_tokens = True,
    return_tensors = 'pt',
    padding = True, 
    truncation = True,
    padding_side = 'left'
)




In [20]:
completions_arr = np.array(prompt_completion_data.select_columns(['completion_1', 'completion_2']))

In [24]:
completions_arr

array([{'completion_1': 'Sure, this film was retarded. A retarded film. It was so retarded that I', 'completion_2': "Sure, this film was retarded. It wasn't very original. I mean... I"},
       {'completion_1': 'This review is based on the Japanese Region 1 DVD version only. The', 'completion_2': 'This review is based on the novel by Richard Condon.<br /><br'},
       {'completion_1': 'Viva Variety was a unique production. You have the occasional movie from the mid', 'completion_2': 'Viva Variety was a unique program. It showcased the best of the local and'},
       ...,
       {'completion_1': 'How to qualify this film, as "worst movie ever made" I can', 'completion_2': 'How to qualify this film, you would have to have been involved in a cinematic'},
       {'completion_1': 'Imagine every stereotypical, overacted cliche about the Mafia or the underbelly of American', 'completion_2': 'Imagine every stereotypical, overacted cliche about Hitler that comes out every year (especially from'

In [14]:
import torch.nn.functional as F
def get_sentiment(text: str):
    # Tokenize and prepare inputs
    inputs = sentiment_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    # Forward pass
    with torch.no_grad():
        logits = sentiment_model(**inputs).logits
    # Convert to probabilities
    probs = F.softmax(logits, dim=-1)[0]
    # Decode prediction
    label = "positive" if probs[1] > probs[0] else "negative"
    return label, probs.cpu().tolist()

# 🔍 Example:
print(get_sentiment(completions[1]))

('positive', [0.0011193063110113144, 0.9988806843757629])


In [78]:
sentiment_tokenizer?

[0;31mSignature:[0m     
[0msentiment_tokenizer[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mtext[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m][0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtext_pair[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m][0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtext_target[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m][0m[0;34m,[0m [0mNoneType