In [10]:
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
from datetime import datetime
# import torch
import json
import os
from datasets import load_dataset
import numpy as np
import evaluate
import pandas as pd


# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# Name of the base checkpoint; reused by later cells to build local paths and
# result filenames.
model_name = "Llama-3.1-8B-Instruct"
# model_name = "Llama-3.3-70B-Instruct"

# Local directory the pretrained weights are loaded from.
model_path = os.path.join("./pretrained_llms", model_name)
# Handed to load_dataset as its cache_dir.
data_path = "./data"
data_name = "mteb/tweet_sentiment_extraction"
# Handed to AutoTokenizer.from_pretrained as its cache_dir.
cache_dir = "./cache"
# NOTE(review): not referenced by any cell visible here (TrainingArguments
# uses the literal "test_trainer") - confirm whether it is still needed.
output_dir="./results"

# Load the dataset without a split argument; later cells index its 'train'
# and 'test' entries.
dataset = load_dataset(data_name, cache_dir=data_path)
# dataset = load_dataset(data_name, cache_dir=data_path, split='train[10:20]')
# dataset = load_dataset(data_name, cache_dir=data_path, split='test')

In [None]:
# Load the base checkpoint as a sequence classifier with three output labels;
# dtype and device placement are left to the library ("auto").
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=3, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_path,
                                          add_eos_token=True,
                                          cache_dir=cache_dir)

# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token_id is None:
    print("No pad token found in tokenizer, setting pad token to eos token")
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
    # NOTE(review): padding_side is only set on this branch; a tokenizer that
    # already has a pad token keeps whatever side it was loaded with.
    tokenizer.padding_side = "right"

# Copy the tokenizer's pad token id into the model config when the config has none.
if model.config.pad_token_id is None:
    print("No pad token found in model, setting pad token to eos token of tokenizer")
    model.config.pad_token_id = tokenizer.pad_token_id
    # NOTE(review): unclear whether the model reads config.padding_side -
    # confirm this assignment has any effect.
    model.config.padding_side = "right"
    # NOTE(review): use_cache and the embedding resize below only run when the
    # config's pad id was missing - confirm they are meant to be conditional.
    model.config.use_cache = False  # This can help with training stability
    model.resize_token_embeddings(len(tokenizer))


In [None]:
def tokenizer_function(examples):
    """Tokenize the 'text' field of a batch with truncation enabled.

    Uses the notebook-level ``tokenizer``; meant to be passed to
    ``dataset.map(..., batched=True)``.
    """
    texts = examples['text']
    return tokenizer(texts, truncation=True)
# Run tokenizer_function over the dataset in batches.
tokenized_data = dataset.map(tokenizer_function, batched=True)

# train = tokenized_data['train'].select(range(10000))
# Splits used by the Trainer (train for fitting, test for evaluation).
train = tokenized_data['train']
test = tokenized_data['test']

# Collator built from the tokenizer; passed to the Trainer as data_collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [None]:
# Verify the tokenizer settings: after the setup cell, the tokenizer and the
# model config are expected to report the same pad token id.
print(f"Pad token ID: {tokenizer.pad_token_id}")
print(f"Model pad token ID: {model.config.pad_token_id}")

In [None]:
# # load the accuracy metric
# metric = evaluate.load("accuracy")

# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)
#     return metric.compute(predictions=predictions, references=labels)

In [None]:
# Load the "accuracy" metric through the `evaluate` library; compute_metrics
# calls accuracy_metric.compute(...) on every evaluation.
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for the Trainer.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class of each
            sample is the argmax of ``logits`` over the last axis.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]

    # Use the same averaging and zero_division policy for all three scores.
    # Previously only precision passed zero_division=0, so recall/F1 emitted
    # undefined-metric warnings whenever a class had no samples in a batch of
    # evaluation data; the reported values themselves are unchanged (0).
    scorer_kwargs = {"average": "weighted", "zero_division": 0}
    precision = precision_score(labels, predictions, **scorer_kwargs)
    recall = recall_score(labels, predictions, **scorer_kwargs)
    f1 = f1_score(labels, predictions, **scorer_kwargs)

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

In [None]:
# Training configuration: step-based checkpointing and evaluation, keeping a
# single checkpoint on disk and reloading the best one when training ends.
training_args = TrainingArguments(
    # NOTE(review): hard-coded; the `output_dir` variable from the config cell
    # is not used here - confirm which location is intended.
    output_dir="test_trainer",
    learning_rate=1e-5,  # Experiment with different rates
    # lr_scheduler_type="linear",  # Add learning rate scheduling
    # warmup_steps=100,  # Implement learning rate warmup
    optim="adamw_torch",
    weight_decay=0.01,
    num_train_epochs=10,
    save_strategy='steps',
    save_steps=500,   
    eval_strategy='steps',
    logging_steps=250,
    load_best_model_at_end=True,
    save_total_limit=1,
    # NOTE(review): eval_steps is left unset, so the evaluation interval comes
    # from the library default - confirm it lines up with save_steps=500,
    # which load_best_model_at_end depends on.
    # eval_steps=50,
    # gradient_accumulation_steps=4,
    per_device_train_batch_size=8, 
    per_device_eval_batch_size=8,   
)
# NOTE(review): the test split doubles as the evaluation set during training
# (and therefore drives best-checkpoint selection) and is also what the final
# metrics are reported on - confirm this is acceptable or carve out a
# validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=test,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the Trainer built in the previous cell.
trainer.train()

In [None]:
# Persist the trained model and its tokenizer into the same directory,
# ./saved_model/<model_name>, so they can be reloaded together later.
save_model_path = os.path.join("./saved_model", model_name)
trainer.save_model(save_model_path)
tokenizer.save_pretrained(save_model_path)
print(f"Fine-tuned model saved to: {save_model_path}")


In [None]:
# trainer.evaluate(eval_dataset=train) #evaluate train dataset
# Evaluate on the test split and keep the result in eval_metrics.
eval_metrics = trainer.evaluate(eval_dataset=test) #evaluate test dataset

In [3]:
# Load the fine-tuned model and tokenizer back from disk.
# The directory is derived from model_name (the same ./saved_model/<model_name>
# layout the save cell writes) instead of a hard-coded checkpoint name, so
# changing model_name in the config cell reloads the matching model.
save_model_path = os.path.join("./saved_model", model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    save_model_path,
    num_labels=3,
    torch_dtype="auto",
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(
    save_model_path, add_eos_token=True, cache_dir=cache_dir
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
def evaluate_predictions(dataset, results, verbose=False):
    """Compare predicted sentiments with gold labels and print a report.

    Args:
        dataset: Indexable collection where ``dataset[i]['label_text']`` is
            the gold label for ``results[i]``; expected to be one of
            'negative', 'neutral', 'positive'.
        results: List of dicts holding 'predicted_sentiment' (and 'text',
            which is only read for verbose/warning output).
        verbose: When True, print every sample with its outcome.

    Returns:
        dict with 'accuracy', 'confusion_matrix' (true label -> predicted
        label -> count), 'total_samples', 'correct_predictions' and
        'per_class_metrics' (label -> precision/recall/f1).
    """
    labels = ('negative', 'neutral', 'positive')
    correct = 0
    total = len(results)
    confusion_matrix = {true: {pred: 0 for pred in labels} for true in labels}

    for i, result in enumerate(results):
        true_label = dataset[i]['label_text']
        raw_prediction = result['predicted_sentiment']
        # Normalise case and surrounding whitespace so "Positive " still
        # matches; str() keeps non-string values (e.g. NaN after a CSV round
        # trip) from raising and routes them to the warning branch instead.
        predicted = str(raw_prediction).lower().strip()

        # Handle variations in predictions
        if 'positive' in predicted:
            predicted = 'positive'
        elif 'negative' in predicted:
            predicted = 'negative'
        elif 'neutral' in predicted:
            predicted = 'neutral'
        else:
            if verbose:
                print(f"\nText: {result['text']}")
            print(f"Warning: Unexpected prediction format in {i}th data: {raw_prediction}")
            continue

        is_correct = true_label == predicted
        if is_correct:
            correct += 1

        # Update confusion matrix
        confusion_matrix[true_label][predicted] += 1

        if verbose:
            print(f"\nText: {result['text']}")
            print(f"True label: {true_label}")
            print(f"Predicted: {predicted}")
            print(f"Correct: {is_correct}")

    # Unparseable predictions were skipped above but remain in the
    # denominator, so they count against accuracy.
    accuracy = correct / total if total > 0 else 0

    # Print detailed results
    print("\n=== Evaluation Results ===")
    print(f"Total samples: {total}")
    print(f"Correct predictions: {correct}")
    print(f"Accuracy: {accuracy:.2%}")

    # Print confusion matrix
    print("\n=== Confusion Matrix ===")
    print(f"{'True|Pred':<15}{'Negative':<10}{'Neutral':<10}{'Positive':<10}")
    for true_label in labels:
        row = confusion_matrix[true_label]
        print(f"{true_label.title():<15}{row['negative']:<10}{row['neutral']:<10}{row['positive']:<10}")

    # Calculate per-class metrics (also returned, not just printed)
    print("\n=== Per-Class Metrics ===")
    per_class_metrics = {}
    for label in labels:
        true_pos = confusion_matrix[label][label]
        # Column sum minus the diagonal: other classes predicted as `label`.
        false_pos = sum(conf[label] for l, conf in confusion_matrix.items() if l != label)
        # Row sum minus the diagonal: `label` samples predicted as something else.
        false_neg = sum(confusion_matrix[label].values()) - true_pos

        precision = true_pos / (true_pos + false_pos) if (true_pos + false_pos) > 0 else 0
        recall = true_pos / (true_pos + false_neg) if (true_pos + false_neg) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        per_class_metrics[label] = {'precision': precision, 'recall': recall, 'f1': f1}

        print(f"\n{label.title()} class:")
        print(f"Precision: {precision:.2%}")
        print(f"Recall: {recall:.2%}")
        print(f"F1-score: {f1:.2%}")

    return {
        'accuracy': accuracy,
        'confusion_matrix': confusion_matrix,
        'total_samples': total,
        'correct_predictions': correct,
        'per_class_metrics': per_class_metrics
    }

In [8]:
def save_analysis_results(results, model_name, data_name, base_dir="./results"):
    """
    Write analysis results to a CSV file inside ``base_dir``.

    The file is named ``finetune_<model_name>_<prefix>.csv`` where ``<prefix>``
    is the first five characters of the last '/'-separated part of
    ``data_name``. If that file already exists, a ``_<YYYYmmdd_HHMMSS>``
    timestamp is appended to the name instead of overwriting it.

    Args:
        results: List of dictionaries containing analysis results
        model_name: Name of the model used
        data_name: Name of the dataset used
        base_dir: Directory to save results (created when missing)

    Returns:
        str: Path to the saved file
    """
    os.makedirs(base_dir, exist_ok=True)

    short_dataset = data_name.split('/')[-1][:5]
    stem = f"finetune_{model_name}_{short_dataset}"

    table = pd.DataFrame(results)

    # Preferred location first; fall back to a timestamped name on collision.
    filepath = os.path.join(base_dir, f"{stem}.csv")
    if os.path.exists(filepath):
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filepath = os.path.join(base_dir, f"{stem}_{stamp}.csv")

    table.to_csv(filepath, index=False)
    print(f"Results saved to: {filepath}")

    return filepath

def load_analysis_results(filepath):
    """
    Read analysis results back from a CSV file.

    Args:
        filepath: Path to the CSV file

    Returns:
        list: One dictionary per CSV row, keyed by column name

    Raises:
        FileNotFoundError: If ``filepath`` does not exist
    """
    if os.path.exists(filepath):
        records = pd.read_csv(filepath).to_dict('records')
        print(f"Loaded {len(records)} results from: {filepath}")
        return records

    raise FileNotFoundError(f"Results file not found: {filepath}")

# Helper function to clean up checkpoints
def cleanup_checkpoints(cache_dir="./cache"):
    """Delete every ``checkpoint_*.json`` entry directly inside ``cache_dir``.

    Does nothing (and prints nothing) when the directory does not exist.
    """
    if not os.path.exists(cache_dir):
        return

    stale = [
        name for name in os.listdir(cache_dir)
        if name.startswith("checkpoint_") and name.endswith(".json")
    ]
    for name in stale:
        os.remove(os.path.join(cache_dir, name))
    print("Checkpoints cleaned up")


def save_metrics(metrics, model_name, data_name, base_dir="./results"):
    """
    Write evaluation metrics to a JSON file inside ``base_dir``.

    The file is named ``finetune_<model_name>_<prefix>_metrics.json`` where
    ``<prefix>`` is the first five characters of the last '/'-separated part
    of ``data_name``. If that file already exists, a ``_<YYYYmmdd_HHMMSS>``
    timestamp is appended to the name instead of overwriting it.

    Args:
        metrics: Dictionary containing evaluation metrics
        model_name: Name of the model used
        data_name: Name of the dataset used
        base_dir: Directory to save results (created when missing)

    Returns:
        str: Path to the saved file
    """
    os.makedirs(base_dir, exist_ok=True)

    short_dataset = data_name.split('/')[-1][:5]
    stem = f"finetune_{model_name}_{short_dataset}_metrics"

    # Preferred location first; fall back to a timestamped name on collision.
    filepath = os.path.join(base_dir, f"{stem}.json")
    if os.path.exists(filepath):
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filepath = os.path.join(base_dir, f"{stem}_{stamp}.json")

    with open(filepath, 'w') as handle:
        json.dump(metrics, handle, indent=4)
    print(f"Metrics saved to: {filepath}")

    return filepath

def load_metrics(filepath):
    """
    Read evaluation metrics back from a JSON file.

    Args:
        filepath: Path to the JSON file

    Returns:
        dict: The parsed JSON content

    Raises:
        FileNotFoundError: If ``filepath`` does not exist
    """
    if os.path.exists(filepath):
        with open(filepath, 'r') as handle:
            loaded = json.load(handle)
        print(f"Loaded metrics from: {filepath}")
        return loaded

    raise FileNotFoundError(f"Metrics file not found: {filepath}")

In [7]:
def tokenizer_function(examples):
    """Return the tokenizer output for a batch's 'text' field, with truncation on.

    Reads the notebook-level ``tokenizer`` at call time.
    """
    batch_texts = examples['text']
    encoded = tokenizer(batch_texts, truncation=True)
    return encoded
# Run tokenizer_function over the dataset in batches.
tokenized_data = dataset.map(tokenizer_function, batched=True)

# train = tokenized_data['train'].select(range(10000))
train = tokenized_data['train']
test = tokenized_data['test']

# Create pipeline
# Wraps the currently loaded model and tokenizer; padding and truncation are
# enabled with max_length=256.
classifier = pipeline(
    task="sentiment-analysis", 
    model=model, 
    tokenizer=tokenizer, 
    device_map="auto", 
    padding=True, 
    truncation=True, 
    max_length=256
)

# One {'text', 'predicted_sentiment'} dict per test example, in dataset order.
results = []
# Class index -> sentiment name.
# NOTE(review): assumed to match the dataset's label ids; the spot-check cell
# further down shows 0 -> negative for one sample - confirm the other two.
label_map = {0: "negative", 1: "neutral", 2: "positive"}

# Feed the test texts through the pipeline in batches of 16 and pair each
# output with the text it came from.
for text, out in zip(test['text'], classifier(KeyDataset(test, "text"), batch_size=16)):
    # The class index is parsed from whatever follows the last '_' in the label
    # string (presumably of the form "LABEL_<n>" - verify against the model config).
    label_id = int(out['label'].split('_')[-1])
    sentiment = label_map[label_id]
    results.append({
        'text': text,
        'predicted_sentiment': sentiment
    })

Device set to use cuda:0


In [14]:
# Spot-check a single test example: predicted sentiment, gold label id,
# gold label text and the tweet itself.
# The index is stored in `sample_idx` rather than `id` so the builtin id() is
# not shadowed for the rest of the kernel session.
sample_idx = 6
print(results[sample_idx]['predicted_sentiment'])
print(test['label'][sample_idx])
print(test['label_text'][sample_idx])
print(test['text'][sample_idx])

negative
0
negative
I THINK EVERYONE HATES ME ON HERE   lol


In [11]:
# Write the per-sample predictions to a CSV under ./results and keep the path.
results_filepath = save_analysis_results(results, model_name, data_name)

Results saved to: ./results/finetune_Llama-3.1-8B-Instruct_tweet.csv


In [15]:
# Score the pipeline predictions against the test split's label_text column;
# prints the report and keeps the summary dict in `metrics`.
metrics = evaluate_predictions(test, results)


=== Evaluation Results ===
Total samples: 3534
Correct predictions: 2880
Accuracy: 81.49%

=== Confusion Matrix ===
True|Pred      Negative  Neutral   Positive  
Negative       845       145       11        
Neutral        199       1096      135       
Positive       25        139       939       

=== Per-Class Metrics ===

Negative class:
Precision: 79.05%
Recall: 84.42%
F1-score: 81.64%

Neutral class:
Precision: 79.42%
Recall: 76.64%
F1-score: 78.01%

Positive class:
Precision: 86.54%
Recall: 85.13%
F1-score: 85.83%


In [19]:
# Log in to the Hugging Face Hub (renders an interactive widget) before the
# push cells below.
# NOTE(review): this import sits mid-notebook while the other imports live in
# the first cell - consider moving it there.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [20]:
# Upload the currently loaded model to the Hub repo "Finetune4Sentiment-Llama-3.1-8B".
model.push_to_hub("Finetune4Sentiment-Llama-3.1-8B")

model-00004-of-00004.safetensors:   0%|          | 0.00/117M [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Thomas-Chou/Finetune4Sentiment-Llama-3.1-8B/commit/87c60fa09ffb46d4fbf2c315e15ec76b6bc7f9c3', commit_message='Upload LlamaForSequenceClassification', commit_description='', oid='87c60fa09ffb46d4fbf2c315e15ec76b6bc7f9c3', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Thomas-Chou/Finetune4Sentiment-Llama-3.1-8B', endpoint='https://huggingface.co', repo_type='model', repo_id='Thomas-Chou/Finetune4Sentiment-Llama-3.1-8B'), pr_revision=None, pr_num=None)

In [21]:
# Upload the tokenizer to the same Hub repo as the model.
tokenizer.push_to_hub("Finetune4Sentiment-Llama-3.1-8B")

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Thomas-Chou/Finetune4Sentiment-Llama-3.1-8B/commit/4534d9bbfc2bd781b7f11e2c17eec369263da5d3', commit_message='Upload tokenizer', commit_description='', oid='4534d9bbfc2bd781b7f11e2c17eec369263da5d3', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Thomas-Chou/Finetune4Sentiment-Llama-3.1-8B', endpoint='https://huggingface.co', repo_type='model', repo_id='Thomas-Chou/Finetune4Sentiment-Llama-3.1-8B'), pr_revision=None, pr_num=None)

In [None]:
# Run the Trainer's prediction pass over the test split and print the shapes
# of the returned predictions and label ids.
# NOTE(review): relies on `trainer` from the training cells; it is not
# rebuilt after the saved model is reloaded - confirm it exists in the kernel
# and wraps the intended model.
predictions = trainer.predict(test)
print(predictions.predictions.shape, predictions.label_ids.shape)

In [None]:
# Debug check: the classifier's configured label count should equal the
# number of distinct labels in the training split.
print(model.config.num_labels)  # Confirm matches actual label count
print(len(np.unique(train['label'])))  # Check actual unique label count

In [None]:
# Debug check: inspect the columns and token ids of tokenized training examples.
print(train[0].keys())  # Verify label column exists
print(train[1]['input_ids'])  # Show the token ids of the second training example

In [None]:
# Debug check: tokenize two strings of different length with the pipeline's
# tokenizer to inspect the padded output, then show which pad token is in use.
tokens = classifier.tokenizer(["Example text", "I am a boy"], padding=True, truncation=True)
print(tokens)
print(classifier.tokenizer.pad_token)