# Unsupervised, Supervised, Finetuned

In [1]:
import pandas as pd
from datasets import Dataset

# Toy corpus for the unsupervised (causal LM) stage: raw text only, no labels.
unsupervised_data = pd.DataFrame(data={"text": ["Your text data for pretraining"]})
unsupervised_dataset = Dataset.from_pandas(unsupervised_data)

# Toy corpus for the supervised stage: every text is paired with an integer
# class label (e.g., 1, 0 for binary classification).
supervised_texts = ["Input text for fine-tuning"]
supervised_labels = [1]
supervised_data = pd.DataFrame({"text": supervised_texts, "label": supervised_labels})
supervised_dataset = Dataset.from_pandas(supervised_data)

In [2]:
# Print the installed PyTorch version so the environment behind the outputs
# below is recorded in the notebook.
import torch
print(torch.__version__) 


2.4.1


In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# The tokenizer and the causal-LM weights are loaded from the same checkpoint.
base_checkpoint = "gpt2"
model = AutoModelForCausalLM.from_pretrained(base_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the unsupervised dataset
def tokenize_function(examples):
    """Tokenize the "text" column of a batch for causal-LM pretraining.

    Every example is truncated or padded to exactly 128 tokens, and the
    special-tokens mask is requested alongside the usual encoding fields.
    """
    encoding_options = dict(
        return_special_tokens_mask=True,
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    return tokenizer(examples["text"], **encoding_options)

unsupervised_dataset = unsupervised_dataset.map(tokenize_function, batched=True)

# Language-modeling collator. mlm=False selects the causal (GPT-2 style)
# objective, where the labels are created automatically from input_ids
# instead of from masked tokens.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Pretraining setup: one epoch, batches of 4 per device, a checkpoint every
# 10,000 steps with at most two checkpoints kept on disk.
pretraining_options = {
    "output_dir": "./results",
    "num_train_epochs": 1,
    "per_device_train_batch_size": 4,
    "save_steps": 10_000,
    "save_total_limit": 2,
}
training_args = TrainingArguments(**pretraining_options)

# The collator is passed explicitly so that every batch carries labels.
trainer = Trainer(
    model=model,
    train_dataset=unsupervised_dataset,
    data_collator=data_collator,
    args=training_args,
)

# Run the unsupervised (pretraining) pass.
trainer.train()

# Write the model weights and the tokenizer files into the same directory.
pretrained_dir = "./unsupervised_model"
model.save_pretrained(pretrained_dir)
tokenizer.save_pretrained(pretrained_dir)

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'train_runtime': 5.8666, 'train_samples_per_second': 0.17, 'train_steps_per_second': 0.17, 'train_loss': 6.993592262268066, 'epoch': 1.0}


('./unsupervised_model/tokenizer_config.json',
 './unsupervised_model/special_tokens_map.json',
 './unsupervised_model/vocab.json',
 './unsupervised_model/merges.txt',
 './unsupervised_model/added_tokens.json',
 './unsupervised_model/tokenizer.json')

In [8]:
# Sanity check: report how many examples the supervised dataset holds.
print(f"Number of samples in dataset: {len(supervised_dataset)}")

Number of samples in dataset: 1


In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import Dataset
import pandas as pd

# Supervised data (text with labels): a single sentence carrying a binary label.
supervised_records = [
    {"text": "Input text for fine-tuning", "label": 1},
]
supervised_data = pd.DataFrame(supervised_records)
supervised_dataset = Dataset.from_pandas(supervised_data)

# Load the tokenizer saved by the unsupervised stage.
tokenizer = AutoTokenizer.from_pretrained("./unsupervised_model")

# Ensure that the tokenizer uses the eos_token as the pad_token
tokenizer.pad_token = tokenizer.eos_token

# Reuse the pretrained transformer weights; the 2-way classification head
# ("score") is not part of that checkpoint and is freshly initialised.
model = AutoModelForSequenceClassification.from_pretrained("./unsupervised_model", num_labels=2)

# GPT-2's sequence-classification head classifies from the last non-padding
# token, which it locates through config.pad_token_id. When that id is unset,
# any batch with more than one example raises
# "Cannot handle batch sizes > 1 if no padding token is defined", so it must
# mirror the tokenizer's padding token.
model.config.pad_token_id = tokenizer.pad_token_id

# Tokenize the supervised dataset
def tokenize_function(examples):
    """Tokenize the "text" column of a batch for sequence classification.

    Only truncation is applied here. Padding is left to the
    DataCollatorWithPadding used by the Trainer, which pads each training
    batch to its own longest sequence; padding inside the batched map as well
    would stretch every example to the longest text of the whole map batch
    and store those extra pad tokens in the dataset.
    """
    return tokenizer(examples["text"], truncation=True)

# Apply tokenization
supervised_dataset = supervised_dataset.map(tokenize_function, batched=True)

# Data collator for padding: pads each batch dynamically to its longest example.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Fine-tuning setup. Evaluation is skipped because there is only one sample:
# no eval dataset is passed to the Trainer, and TrainingArguments' evaluation
# strategy already defaults to "no". The explicit `evaluation_strategy`
# keyword is intentionally omitted -- it is deprecated in favour of
# `eval_strategy` in recent transformers releases, and relying on the default
# keeps this cell working on both old and new versions.
training_args = TrainingArguments(
    output_dir="./results_finetune",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=supervised_dataset,  # Use the entire dataset for training
    data_collator=data_collator,
)

# Run the supervised fine-tuning pass.
trainer.train()

# Write the fine-tuned weights and the tokenizer files into the same directory.
finetuned_dir = "./finetuned_model"
model.save_pretrained(finetuned_dir)
tokenizer.save_pretrained(finetuned_dir)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at ./unsupervised_model and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

  0%|          | 0/3 [00:00<?, ?it/s]

{'train_runtime': 2.7097, 'train_samples_per_second': 1.107, 'train_steps_per_second': 1.107, 'train_loss': 9.622931798299154, 'epoch': 3.0}


('./finetuned_model/tokenizer_config.json',
 './finetuned_model/special_tokens_map.json',
 './finetuned_model/vocab.json',
 './finetuned_model/merges.txt',
 './finetuned_model/added_tokens.json',
 './finetuned_model/tokenizer.json')

In [10]:
from transformers import pipeline

# Build a text-classification pipeline from the fine-tuned checkpoint directory.
finetuned_checkpoint = "./finetuned_model"
classifier = pipeline(task="text-classification", model=finetuned_checkpoint)

# Classify one sample sentence and show the raw pipeline output.
sample_text = "Your input text for prediction"
predictions = classifier(sample_text)
print(predictions)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'label': 'LABEL_0', 'score': 0.9999710321426392}]


In [11]:
label_map = {0: "negative", 1: "positive"}  # Adjust this mapping to match your labels

# Build a relabelled copy instead of rewriting `predictions` in place.
# Mutating the pipeline output made this cell fail on a second run, because
# the already-mapped label ("negative") no longer ends in a numeric id and
# int() raised ValueError. The raw pipeline output is now left untouched.
labeled_predictions = [
    {
        **prediction,
        # Pipeline labels look like "LABEL_<id>"; the part after the last
        # underscore is the numeric class id used as the key into label_map.
        "label": label_map[int(prediction["label"].rsplit("_", 1)[-1])],
    }
    for prediction in predictions
]

print(labeled_predictions)  # Updated output with meaningful labels

[{'label': 'negative', 'score': 0.9999710321426392}]


In [12]:
# Use the first CUDA device when one is actually present and fall back to the
# CPU (device=-1) otherwise. A hard-coded device=0 fails on machines without
# a GPU, despite the intent of using the GPU only "if available".
pipeline_device = 0 if torch.cuda.is_available() else -1
classifier = pipeline("text-classification", model="./finetuned_model", device=pipeline_device)
predictions = classifier("Your input text for prediction")
print(predictions)

[{'label': 'LABEL_0', 'score': 0.9999710321426392}]


In [14]:
# Classify a second, previously unseen sentence with the same pipeline and
# print the raw output (a list with one {'label', 'score'} dict per input).
new_input = "This is another input text to classify again"
predictions = classifier(new_input)
print(predictions)

[{'label': 'LABEL_0', 'score': 0.999996542930603}]


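Both test inputs are classified as LABEL_0 with a score above 0.99. Note that the model was fine-tuned on a single example (whose label was 1), so this confidence reflects an essentially untrained classification head and should not be read as a reliable prediction.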