In [4]:
import os
import numpy as np
import pandas as pd
import sklearn
import tensorflow as tf
import matplotlib.pyplot as plt
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

# Check for TensorFlow GPU access
print(f"TensorFlow has access to the following devices:\n{tf.config.list_physical_devices()}")

# See TensorFlow version
print(f"TensorFlow version: {tf.__version__}")

TensorFlow has access to the following devices:
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
TensorFlow version: 2.13.0


In [None]:
# !cd "/Users/rubenmathew/Desktop/UTD/CS 4485/CSProject.nosync/ML/llama2/llama.cpp"
# !conda activate MLTA
# !./main -m ./models/7B/ggml-model-q4_0.bin -n 1024 --repeat_penalty 1.0 --color -i -r "Student:" -f ./prompts/ta-chat.txt
# !./main -m ./models/13B/ggml-model-q4_0.bin -n 1024 --temp 0.5 -s 1337 --repeat_penalty 1.1 --keep 130 --color -i -r "Student:" -f ./prompts/ta-chat.txt


#### Test Zero-Shot Classifier

In [1]:
import accelerate
from transformers import pipeline

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

text_to_classify = "how much of my grade is the final?"
candidate_labels = ["Sorting", "Searching", "Design", "Data Structures", "Proofs", "Asymptomatic analysis", "Dynamic Programming", "Greedy Methods", "Grading", "Course Description", "Class Time", "Assignment Details", "Class Location", "Professor Contact Info"]
syllabus_labels = ["Grading", "Course Description", "Assignment Details", "Class Time", "Class Location", "Professor Contact Info"]

classifier(text_to_classify, candidate_labels)


{'sequence': 'how much of my grade is the final?',
 'labels': ['Grading',
  'Asymptomatic analysis',
  'Proofs',
  'Assignment Details',
  'Greedy Methods',
  'Design',
  'Sorting',
  'Professor Contact Info',
  'Searching',
  'Class Location',
  'Course Description',
  'Data Structures',
  'Dynamic Programming',
  'Class Time'],
 'scores': [0.5714775919914246,
  0.07328278571367264,
  0.03942479193210602,
  0.036618027836084366,
  0.03529990464448929,
  0.032818250358104706,
  0.030507437884807587,
  0.030034111812710762,
  0.02965698204934597,
  0.02943132072687149,
  0.027783405035734177,
  0.022871626541018486,
  0.02200988680124283,
  0.0187838152050972]}

### Quantize the model

In [None]:
import accelerate
from transformers import pipeline
from transformers import AutoModelForCausalLM, AutoTokenizer

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-mnli")
gptq_config = GPTQConfig(bits=4, dataset = "c4", tokenizer=tokenizer)

text_to_classify = "how much of my grade is the final?"
candidate_labels = ["Sorting", "Searching", "Design", "Data Structures", "Proofs", "Asymptomatic analysis", "Dynamic Programming", "Greedy Methods", "Grading", "Course Description", "Class Time", "Assignment Details", "Class Location", "Professor Contact Info"]
syllabus_labels = ["Grading", "Course Description", "Assignment Details", "Class Time", "Class Location", "Professor Contact Info"]

classifier(text_to_classify, candidate_labels)

# New Attempt

In [2]:
from datasets import load_dataset
from sentence_transformers.losses import CosineSimilarityLoss

from setfit import SetFitModel, SetFitTrainer, sample_dataset


# Load a dataset from the Hugging Face Hub
dataset = load_dataset("csv", data_files="./mock-data/classification.csv")

# Simulate the few-shot regime by sampling 8 examples per class
train_dataset = sample_dataset(dataset["train"], label_column="class", num_samples=8)
eval_dataset = dataset["validation"]

exit()

# Load a SetFit model from Hub
model = SetFitModel.from_pretrained("facebook/bart-large-mnli")

# Create trainer
trainer = SetFitTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    loss_class=CosineSimilarityLoss,
    metric="accuracy",
    batch_size=16,
    num_iterations=20, # The number of text pairs to generate for contrastive learning
    num_epochs=1, # The number of epochs to use for contrastive learning
    column_mapping={"sentence": "text", "label": "label"} # Map dataset columns to text/label expected by trainer
)

# Train and evaluate
trainer.train()
metrics = trainer.evaluate()

# Push model to the Hub
# trainer.push_to_hub("my-awesome-setfit-model")

# Download from Hub and run inference
# model = SetFitModel.from_pretrained("lewtun/my-awesome-setfit-model")
# Run inference
# preds = model(["i loved the spiderman movie!", "pineapple on pizza is the worst 🤮"])

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

KeyError: 'validation'

## Fine-Tune Zero-Shot Classifier

### Functions

In [3]:
import pandas as pd
import torch
from datasets import Dataset, load_metric
import random


def shuffle_df(old_df: pd.DataFrame, cycles: int = 1) -> pd.DataFrame:
  for i in range(cycles):
    new_df = old_df.sample(frac=1).reset_index(drop=True)
  return new_df

### Set Up Data

In [None]:
# torch.cuda.empty_cache() # You may use this command to clear your cache
# torch.cuda.is_available() # You may use this command to check if you have gpu or not

org_df = pd.read_csv("./mock-data/classification.csv")

# Shuffle the data
df = shuffle_df(org_df, 100)
print(df)

# Split the data into train and test portions
train_percentage = 0.8
train_portion = int(train_percentage * len(df))
test_portion = len(df) - train_portion

df_train = df.head(train_portion)
df_test = df.tail(test_portion)


# Convert to Dataset objects
train_ds = Dataset.from_pandas(df_train, split="train")
test_ds = Dataset.from_pandas(df_test, split="test")

label_to_int = {(k,v) for v,k in enumerate(candidate_labels)}
template = "{}"

### Tokenizer

In [None]:
from transformers import BartTokenizerFast

tokenizer = BartTokenizerFast.from_pretrained('facebook/bart-large-mnli')

def create_input_sequence(sample):
	text = sample["text"]
	label = sample["class"][0]
	contradiction_label = random.choice([x for x in label_to_int if x != label])
	encoded_sequence = tokenizer(text * 2, [template.format(label), template.format(contradiction_label)], truncation = True, padding = 'max_length')
	encoded_sequence["labels"] = [2, 0]
	encoded_sequence["input_sentence"] = tokenizer.batch_decode(encoded_sequence.input_ids)
	return encoded_sequence


train_dataset = train_ds.map(create_input_sequence, batched = True, batch_size = 1, remove_columns = ["class", "text"])
test_dataset = test_ds.map(create_input_sequence, batched = True, batch_size = 1, remove_columns = ["class", "text"])



### Training

In [None]:
from transformers import BartForSequenceClassification, Trainer, TrainingArguments, EvalPrediction
import numpy as np

def compute_metrics(p: EvalPrediction):
	metric_acc = load_metric("accuracy")
	metric_f1 = load_metric("f1")
	preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
	preds = np.argmax(preds, axis = 1)
	result = {}
	result["accuracy"] = metric_acc.compute(predictions = preds, references = p.label_ids)["accuracy"]
	result["f1"] = metric_f1.compute(predictions = preds, references = p.label_ids, average = 'macro')["f1"]
	return result

training_args = TrainingArguments(
	output_dir = "./ZSC_Models",      # Output directory
	num_train_epochs = 32,             # Total number of training epochs
	per_device_train_batch_size = 16,  # Batch size per device during training
	per_device_eval_batch_size = 64,   # Batch size for evaluation
	warmup_steps = 500,                # Number of warmup steps for learning rate scheduler
	weight_decay = 0.01,               # Strength of weight decay
)

model = BartForSequenceClassification.from_pretrained("facebook/bart-large-mnli", num_labels = len(label_to_int), ignore_mismatched_sizes = True)

trainer = Trainer(
	model = model,                     # The instantiated model to be trained
	args = training_args,              # Training arguments, defined above
	compute_metrics = compute_metrics, # A function to compute the metrics
	train_dataset = train_dataset,     # Training dataset
	eval_dataset = test_dataset,       # Evaluation dataset
	tokenizer = tokenizer              # The tokenizer that was used
)

trainer.train()
trainer.evaluate()