In [None]:
!pip install datasets
from datasets import load_dataset

# Load the AG News dataset by name; displaying the result lists its splits
# and their columns.
raw_datasets = load_dataset("ag_news")
raw_datasets


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})

In [None]:
# Work with the training split and look at its first example.
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[0]

{'text': "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.",
 'label': 2}

In [None]:
print(raw_train_dataset.features)

{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['World', 'Sports', 'Business', 'Sci/Tech'], id=None)}


In [None]:
from transformers import AutoTokenizer

# The same checkpoint name is reused further down when the classification
# model is loaded, so the tokenizer and the model stay matched.
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(batch):
    """Tokenize the ``"text"`` entries of ``batch`` with the module-level tokenizer.

    Over-long inputs are truncated, the batch is padded to a common length
    and the encoding is returned as PyTorch tensors.
    """
    encoded = tokenizer(
        batch["text"],
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    return encoded


tokenize_function(raw_train_dataset[:2])

{'input_ids': tensor([[  101,  2813,  2358,  1012,  6468, 15020,  2067,  2046,  1996,  2304,
          1006, 26665,  1007, 26665,  1011,  2460,  1011, 19041,  1010,  2813,
          2395,  1005,  1055,  1040, 11101,  2989,  1032,  2316,  1997, 11087,
          1011, 22330,  8713,  2015,  1010,  2024,  3773,  2665,  2153,  1012,
           102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [  101, 18431,  2571,  3504,  2646,  3293, 13395,  1006, 26665,  1007,
         26665,  1011,  2797,  5211,  3813, 18431,  2571,  2177,  1010,  1032,
          2029,  2038,  1037,  5891,  2005,  2437,  2092,  1011, 22313,  1998,
          5681,  1032,  6801,  3248,  1999,  1996,  3639,  3068,  1010,  2038,
          5168,  2872,  1032,  2049, 29475,  2006,  2178,  2112,  1997,  1996,
          3006,  1012,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1]])}


In [None]:

# Tokenize every split in batches; the new input_ids / attention_mask columns
# are added next to the original text and label columns.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 7600
    })
})

In [None]:
!pip install evaluate
import evaluate

# Load the accuracy metric and sanity-check it on a toy example in which two
# of the four predictions match their references.
accuracy = evaluate.load("accuracy")
print(accuracy.description)
print(accuracy.compute(references=[0, 1, 0, 1], predictions=[1, 0, 0, 1]))


Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with:
Accuracy = (TP + TN) / (TP + TN + FP + FN)
 Where:
TP: True positive
TN: True negative
FP: False positive
FN: False negative

{'accuracy': 0.5}

In [None]:
f1_score = evaluate.load("f1")


def compute_metrics(pred):
    """Score one evaluation pass with accuracy and weighted F1.

    Args:
        pred: Object exposing ``predictions`` (per-class scores; the argmax
            over the last axis is taken as the predicted class) and
            ``label_ids`` (the reference labels).

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    predicted_classes = pred.predictions.argmax(-1)
    gold_labels = pred.label_ids

    f1_result = f1_score.compute(
        predictions=predicted_classes,
        references=gold_labels,
        average="weighted",
    )
    accuracy_result = accuracy.compute(
        predictions=predicted_classes,
        references=gold_labels,
    )

    return {"accuracy": accuracy_result["accuracy"], "f1": f1_result["f1"]}

In [None]:
import torch
from transformers import AutoModelForSequenceClassification

device = "cuda" if torch.cuda.is_available() else "cpu"

# Derive the label space from the dataset instead of hard-coding it, and store
# the id <-> name mapping in the model config so the fine-tuned model reports
# readable class names rather than generic LABEL_n identifiers.
class_names = raw_train_dataset.features["label"].names
num_labels = len(class_names)
id2label = dict(enumerate(class_names))
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
).to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

In [None]:
!pip install -U accelerate
!pip install -U transformers
from transformers import TrainingArguments

# Training configuration for the classifier: two epochs, evaluation at the end
# of every epoch, and the same batch size for training and evaluation.
# NOTE(review): newer transformers releases spell the evaluation option
# `eval_strategy`; confirm which name the installed version expects.
batch_size=32
training_args = TrainingArguments(
    "trainer-chapter4",
    num_train_epochs=2,
    evaluation_strategy="epoch",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
)

In [None]:
from transformers import Trainer

# Fine-tune on a 10,000-example subset of the shuffled training split (the
# seed is fixed so the same subset is drawn on every run) and evaluate on the
# full test split.
shuffled_dataset = tokenized_datasets["train"].shuffle(seed=42)
small_split = shuffled_dataset.select(range(10000))

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=small_split,
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
)

In [None]:
trainer.train()

In [None]:
# Simplified sketch of the loop that Trainer runs internally. It is wrapped in
# a function so the cell executes cleanly: nothing is trained until the
# function is called with a DataLoader that yields dictionaries of tensors.
from torch.optim import AdamW
from transformers import get_scheduler


def manual_training_loop(model, train_dataloader, num_epochs, device, lr=5e-5):
    """Train ``model`` with a bare PyTorch loop.

    Args:
        model: Model whose forward pass returns an object with a ``loss``
            attribute when the batch contains labels.
        train_dataloader: Sized iterable of batches; each batch is a dict
            mapping argument names to tensors.
        num_epochs: Number of passes over ``train_dataloader``.
        device: Device every batch tensor is moved to before the forward pass.
        lr: Learning rate handed to AdamW.
    """
    # (1) Optimizer that updates the model weights.
    optimizer = AdamW(model.parameters(), lr=lr)
    # (2) Learning-rate schedule: linear decay over the whole run, no warm-up.
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_epochs * len(train_dataloader),
    )

    model.train()
    for epoch in range(num_epochs):  # (3) one pass over the data per epoch
        for batch in train_dataloader:  # (4) iterate over mini-batches
            # (5) Move every tensor of the batch to the target device.
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss  # (6) loss computed by the model itself
            loss.backward()

            # (7) Apply the update, advance the schedule, reset the gradients.
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Place the model on the accelerator when one is available; without an
# explicit `device` argument the pipeline keeps the model on the CPU.
pipe = pipeline(
    "text-classification",
    model="osanseviero/trainer-chapter4",
    device=device,
)
pipe(
    """The soccer match between Spain and
Portugal ended in a terrible result for Portugal."""
)

config.json: 100%
 807/807 [00:00<00:00, 11.6kB/s]
model.safetensors: 100%
 268M/268M [00:11<00:00, 23.9MB/s]
tokenizer_config.json: 100%
 1.20k/1.20k [00:00<00:00, 39.8kB/s]
vocab.txt: 100%
 232k/232k [00:00<00:00, 518kB/s]
tokenizer.json: 100%
 712k/712k [00:00<00:00, 1.62MB/s]
special_tokens_map.json: 100%
 125/125 [00:00<00:00, 3.80kB/s]
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
[{'label': 'LABEL_1', 'score': 0.9330371022224426}]

In [None]:
tokenized_datasets["test"].select([0, 1, 2])

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 3
})

In [None]:
# Run inference over the whole test split.
trainer_preds = trainer.predict(tokenized_datasets["test"])

# The highest-scoring class along the last axis is taken as the prediction;
# label_ids holds the reference labels, and label_names maps class ids to the
# readable names stored in the dataset's label feature.
preds = trainer_preds.predictions.argmax(-1)
references = trainer_preds.label_ids
label_names = raw_train_dataset.features["label"].names

In [None]:
# Print the prediction, the reference label and the text of the first
# `samples` test examples.
samples = 7
# Slice the rows first so that only `samples` texts are materialised instead
# of the entire text column.
texts = tokenized_datasets["test"][:samples]["text"]

for pred, ref, text in zip(preds[:samples], references[:samples], texts):
    # The name printed after "Target name" is the class name of the predicted id.
    print(f"Predicted {pred}; Actual {ref}; Target name: {label_names[pred]}.")
    print(text)

Predicted 2; Actual 2; Target name: Business.
Fears for T N pension after talks Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.
Predicted 3; Actual 3; Target name: Sci/Tech.
The Race is On: Second Private Team Sets Launch Date for Human Spaceflight (SPACE.com) SPACE.com - TORONTO, Canada -- A second\team of rocketeers competing for the  #36;10 million Ansari X Prize, a contest for\privately funded suborbital space flight, has officially announced the first\launch date for its manned rocket.
Predicted 3; Actual 3; Target name: Sci/Tech.
Ky. Company Wins Grant to Study Peptides (AP) AP - A company founded by a chemistry researcher at the University of Louisville won a grant to develop a method of producing better peptides, which are short chains of amino acids, the building blocks of proteins.
Predicted 3; Actual 3; Target name: Sci/Tech.
Prediction Unit Helps Forecast Wildfires (AP) AP - It's barely dawn when Mike Fitzpatrick starts his shift with a blur of colorful maps, figures and endless charts, but already he knows what the day will bring. Lightning will strike in places he expects. Winds will pick up, moist places will dry and flames will roar.
Predicted 3; Actual 3; Target name: Sci/Tech.
Calif. Aims to Limit Farm-Related Smog (AP) AP - Southern California's smog-fighting agency went after emissions of the bovine variety Friday, adopting the nation's first rules to reduce air pollution from dairy cow manure.
Predicted 3; Actual 3; Target name: Sci/Tech.
Open Letter Against British Copyright Indoctrination in Schools The British Department for Education and Skills (DfES) recently launched a "Music Manifesto" campaign, with the ostensible intention of educating the next generation of British musicians. Unfortunately, they also teamed up with the music industry (EMI, and various artists) to make this popular. EMI has apparently negotiated their end well, so that children in our schools will now be indoctrinated about the illegality of downloading music.The ignorance and audacity of this got to me a little, so I wrote an open letter to the DfES about it. Unfortunately, it's pedantic, as I suppose you have to be when writing to goverment representatives. But I hope you find it useful, and perhaps feel inspired to do something similar, if or when the same thing has happened in your area.
Predicted 3; Actual 3; Target name: Sci/Tech.
Loosing the War on Terrorism \\"Sven Jaschan, self-confessed author of the Netsky and Sasser viruses, is\responsible for 70 percent of virus infections in 2004, according to a six-month\virus roundup published Wednesday by antivirus company Sophos."\\"The 18-year-old Jaschan was taken into custody in Germany in May by police who\said he had admitted programming both the Netsky and Sasser worms, something\experts at Microsoft confirmed. (A Microsoft antivirus reward program led to the\teenager's arrest.) During the five months preceding Jaschan's capture, there\were at least 25 variants of Netsky and one of the port-scanning network worm\Sasser."\\"Graham Cluley, senior technology consultant at Sophos, said it was staggeri ...\\

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

# Confusion matrix normalised over the true labels, drawn on an explicit
# figure/axes pair.
confusion_matrix = evaluate.load("confusion_matrix")
cm = confusion_matrix.compute(
    predictions=preds,
    references=references,
    normalize="true",
)["confusion_matrix"]

fig, ax = plt.subplots(figsize=(6, 6))
disp = ConfusionMatrixDisplay(display_labels=label_names, confusion_matrix=cm)
disp.plot(ax=ax, cmap="Blues", colorbar=False, values_format=".2f")

ax.set_title("Normalized confusion matrix")
plt.show()

![alt text](confusion_mat.png)

In [None]:
## Generating business news

In [None]:
# Look the class id up by name instead of hard-coding it, so the filter keeps
# selecting business news even if the label order of the dataset changes.
business_label = raw_datasets["train"].features["label"].str2int("Business")
filtered_datasets = raw_datasets.filter(
    lambda example: example["label"] == business_label
)
# Every remaining row has the same label, so the column carries no information.
filtered_datasets = filtered_datasets.remove_columns("label")

In [None]:
from transformers import AutoModelForCausalLM

# From here on `tokenizer` and `model` refer to GPT-2; they replace the
# classification tokenizer and model bound to these names earlier.
model_id = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = (
    tokenizer.eos_token
)  # GPT-2 defines no padding token, so the end-of-sequence token is reused.
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)

In [None]:
def tokenize_function(batch):
    """Encode the ``"text"`` entries of ``batch``, truncating over-long inputs."""
    texts = batch["text"]
    return tokenizer(texts, truncation=True)


# The raw text is dropped once encoded: only input_ids and attention_mask
# remain in the tokenized splits.
tokenized_datasets = filtered_datasets.map(
    tokenize_function,
    remove_columns=["text"],
    batched=True,
)

In [None]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 30000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1900
    })
})

In [None]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
# Take three tokenized examples and print the length of each one's input_ids.
samples = [tokenized_datasets["train"][i] for i in range(3)]

for sample in samples:
    print(f"input_ids shape: {len(sample['input_ids'])}")

input_ids shape: 37
input_ids shape: 55
input_ids shape: 51

In [None]:
# Collate the three examples into one batch and print the shape of every
# entry the collator returns.
out = data_collator(samples)
for key in out:
    print(f"{key} shape: {out[key].shape}")

input_ids shape: torch.Size([3, 55])
attention_mask shape: torch.Size([3, 55])
labels shape: torch.Size([3, 55])

In [None]:
# Training configuration for the GPT-2 fine-tune: cosine learning-rate
# schedule, evaluation and logging every 200 steps, and upload of the result
# to the Hub.
# NOTE(review): push_to_hub=True presumably needs an authenticated Hub
# session; confirm the login step happens before this cell.
# NOTE(review): newer transformers releases spell the evaluation option
# `eval_strategy`; confirm which name the installed version expects.
training_args = TrainingArguments(
    "sft_cml4",
    push_to_hub=True,
    per_device_train_batch_size=8,
    weight_decay=0.1,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    num_train_epochs=2,
    evaluation_strategy="steps",
    eval_steps=200,
    logging_steps=200,
)

In [None]:
# Train on the first 5,000 business-news examples and evaluate on the whole
# filtered test split; the collator builds the batches.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"].select(range(5000)),
    eval_dataset=tokenized_datasets["test"],
)

In [None]:
trainer.train()

Step	Training Loss	Validation Loss
200	3.659400	3.597305
400	3.310700	3.490597
600	3.098700	3.372308
800	2.144300	3.444029
1000	1.964900	3.415928
1200	1.910900	3.402368
TrainOutput(global_step=1250, training_loss=2.6508543701171874, metrics={'train_runtime': 431.686, 'train_samples_per_second': 23.165, 'train_steps_per_second': 2.896, 'total_flos': 467451445248000.0, 'train_loss': 2.6508543701171874, 'epoch': 2.0})

In [None]:
trainer.push_to_hub()

In [None]:
from transformers import pipeline

pipe = pipeline(
    "text-generation",
    model="Rishabh-sucks-at-code/sft_cml4",
    device=device,
)
# GPT-2 has no dedicated padding token: reuse the end-of-sequence id of the
# pipeline's own tokenizer instead of hard-coding 50256, so this cell does not
# depend on whichever tokenizer the global name `tokenizer` currently holds.
eos_token_id = pipe.tokenizer.eos_token_id
pipe.tokenizer.pad_token_id = eos_token_id

# One generation per prompt; looping avoids three copy-pasted calls.
for prompt in ("Q1", "Wall", "Google"):
    print(pipe(prompt, pad_token_id=eos_token_id)[0]["generated_text"])

Q1 profit soars in Q2 as US bookies flourish A new spate of bookies and bookies reports the full year net profit for the year to June 30, helped by strong domestic bookies and strong results for August this year. The
Wall St. Seen Lower as Oil Prices Weighs  NEW YORK (Reuters) - Wall Street is seen lowering  shares of extremely risky mutual funds on Thursday, as a drop in oil prices threatens to  stun Wall Street's long-term growth
Google, S goes public on IPO NEW YORK, August 19 (New Ratings)  Google, the worlds most popular Internet search engine, announced its public shares registration today just hours after initial public offering.   The highly anticipated IPO was announced on Google

In [None]:
def print_wrapped(text):
    """Print ``text`` framed above and below by a rule of twenty ``=`` signs."""
    rule = "=" * 20
    for line in (rule, text, rule):
        print(line)

In [None]:
# Generate one continuation per prompt and print each between separator rules.
for prompt in ("Q1", "Wall", "Google"):
    generated = pipe(prompt, pad_token_id=tokenizer.eos_token_id)
    print_wrapped(generated[0]["generated_text"])


====================
Q1 profit beats forecasts The Australian Securities Exchange Commission (SXC) on Tuesday reported its first profit since 2003, buoyed by easing drug prices and strong sales from pharmacies.  quot;Success or failure? quot; The firm reported that first
====================
====================
Wall Street Is Set to Open Lower  NEW YORK (Reuters) - The Dow Jones Industrial Average  closed lower on Wednesday as investors were encouraged in the  latest week to return to the casino stock market and seek bargains  out of the Big Three
====================
====================
Google IPO Price Range Is Over \$85  NEW YORK/SAN FRANCISCO (Reuters) - Google Inc. &lt;A HREF="http://www.investor.reuters.com/FullQuote.aspx?t
====================

## Parameter-Efficient Fine-Tuning (PEFT)

In [None]:
pip install peft

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA adapter settings for causal language modelling: rank 8, alpha 32 and
# 5% dropout.
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
    # NOTE(review): presumably required by the weight layout of GPT-2's
    # layers; confirm against the PEFT LoraConfig documentation.
    fan_in_fan_out=True,
)

# Start again from the base GPT-2 weights, wrap them with the adapter and
# report how many parameters remain trainable.
model = AutoModelForCausalLM.from_pretrained("gpt2")
peft_model = get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()

trainable params: 294,912 || all params: 124,734,720 || trainable%: 0.2364

In [None]:
#Quantization

# Load the weights in half precision. The dtype is passed as a torch dtype
# object, the same form the Mistral cell at the end of the notebook uses,
# rather than as a string that not every transformers release resolves.
model = AutoModelForCausalLM.from_pretrained("gpt2", torch_dtype=torch.float16)

In [None]:
import numpy as np


def scaling_factor(vector, qmax=127):
    """Return the absmax scale that maps ``vector`` onto ``[-qmax, qmax]``.

    Args:
        vector: Non-empty array-like of real numbers.
        qmax: Largest magnitude of the target integer range; the default of
            127 corresponds to signed 8-bit integers.

    Returns:
        ``qmax / max(|vector|)``. For an all-zero vector 1.0 is returned,
        because dividing by zero would give an infinite scale and turn the
        quantized values into NaN.

    Raises:
        ValueError: If ``vector`` is empty (raised by ``np.max``).
    """
    m = np.max(np.abs(vector))
    if m == 0:
        # Any finite scale maps zeros to zeros; pick the neutral one.
        return 1.0
    return qmax / m


# Absmax round trip: scale the values, round them to int8, then scale back and
# compare with the originals.
array = [1.2, -0.5, -4.3, 1.2, -3.1, 0.8, 2.4, 5.4]
alpha = scaling_factor(array)
quantized_array = np.round(np.array(array) * alpha).astype(np.int8)
dequantized_array = quantized_array / alpha

report = {
    "Scaling factor": alpha,
    "Quantized array": quantized_array,
    "Dequantized array": dequantized_array,
    "Difference": array - dequantized_array,
}
for label, value in report.items():
    print(f"{label}: {value}")

Scaling factor: 23.518518518518515
Quantized array: [  28  -12 -101   28  -73   19   56  127]
Dequantized array: [ 1.19055118 -0.51023622 -4.29448819  1.19055118 -3.10393701  0.80787402
  2.38110236  5.4       ]
Difference: [ 0.00944882  0.01023622 -0.00551181  0.00944882  0.00393701 -0.00787402
  0.01889764  0.        ]

In [None]:
pip install -U bitsandbytes

In [None]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Request 8-bit weights through an explicit quantization config rather than
# the bare `load_in_8bit` keyword of `from_pretrained`.
model = AutoModelForCausalLM.from_pretrained(
    "gpt2",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)

In [None]:
from transformers import BitsAndBytesConfig

# Request 4-bit weights through an explicit quantization config rather than
# the bare `load_in_4bit` keyword of `from_pretrained`; device_map="auto"
# leaves the placement of the layers to the library.
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    device_map="auto",
)

In [None]:
from trl import SFTTrainer

dataset = load_dataset("timdettmers/openassistant-guanaco", split="train")

# LoRA adapter trained on top of the quantized base model.
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)

# No evaluation options are set because the trainer below receives no
# eval_dataset; an evaluation strategy without one can be rejected by the
# trainer. With 300 examples, a batch size of 8 and two epochs the run has
# fewer than 100 optimizer steps, so the loss is logged every 10 steps
# (an interval of 200 would never be reached).
training_args = TrainingArguments(
    "sft_cml5",
    push_to_hub=True,
    per_device_train_batch_size=8,
    weight_decay=0.1,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    num_train_epochs=2,
    logging_steps=10,
    gradient_checkpointing=True,
)

# NOTE(review): `dataset_text_field` and `max_seq_length` are accepted here by
# the trl version this notebook was written for; confirm against the
# installed trl release.
trainer = SFTTrainer(
    model,
    args=training_args,
    train_dataset=dataset.select(range(300)),
    dataset_text_field="text",
    peft_config=peft_config,
    max_seq_length=512,
)

trainer.train()

In [None]:
trainer.push_to_hub()

In [None]:
from peft import PeftModel

# Reload the base model in half precision, attach the fine-tuned adapter from
# the Hub, then generate with the result.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    torch_dtype=torch.float16,
    device_map="auto",
)
model = PeftModel.from_pretrained(
    model,
    "osanseviero/sft_cml5",
    torch_dtype=torch.float16,
)
# NOTE(review): merge_and_unload presumably folds the adapter weights into
# the base model and returns a plain model without the PEFT wrapper; confirm
# in the PEFT documentation.
model = model.merge_and_unload()  # This is the main difference
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
pipe("### Human: Hello!###Assistant:", max_new_tokens=100)