In [1]:
%%capture
!pip install unsloth
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# !pip install --upgrade --force-reinstall --no-cache-dir git+https://github.com/unslothai/unsloth.git

In [2]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Mistral-Nemo-Base-2407-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = True,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2024.12.2: Fast Mistral patching. Transformers:4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json:   0%|          | 0.00/165k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.31G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/177k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.26M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.12.2 patched 40 layers with 40 QKV layers, 40 O layers and 40 MLP layers.


In [12]:
from datasets import load_dataset
from datasets import DatasetDict, Dataset
import pandas as pd

dataset = load_dataset('csv', data_files='/content/drive/MyDrive/Task_B/task_B_train.csv', split="train")
dataset = DatasetDict({
    'train': dataset
})

tweets = load_dataset('csv', data_files='/content/drive/MyDrive/Task_B/task_B_index_2_tweet.csv', split="train")
labels = load_dataset('csv', data_files='/content/drive/MyDrive/Task_B/task_B_index_2_label.csv', split="train")

tweets_df = pd.DataFrame(tweets)
labels_df = pd.DataFrame(labels)

merged_df = pd.merge(tweets_df, labels_df, on='index')

valid_data = Dataset.from_pandas(merged_df)
valid_data = DatasetDict({
    'valid': valid_data
})

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [13]:
train_data = dataset["train"]
train_data

Dataset({
    features: ['index', 'tweet', 'label'],
    num_rows: 19019
})

In [17]:
valid_data = valid_data['valid']
valid_data

Dataset({
    features: ['index', 'tweet', 'label'],
    num_rows: 4076
})

In [18]:
alpaca_prompt = """Below is an instruction that describes the task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Given the input text, the goal is to identify whether it contains hate speech or not. Return 1 if there is hate speech in the text. Else return 0.

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
  inputs = examples["tweet"]
  outputs = examples["label"]
  texts = []
  for input, output in zip(inputs, outputs):
      # Must add EOS_TOKEN, otherwise your generation will go on forever!
      text = alpaca_prompt.format(input, output) + EOS_TOKEN
      texts.append(text)

  return { "text" : texts, }

In [19]:
formatted_train_data = train_data.map(formatting_prompts_func, batched=True)
formatted_valid_data = valid_data.map(formatting_prompts_func, batched=True)

Map:   0%|          | 0/4076 [00:00<?, ? examples/s]

In [20]:
formatted_train_data

Dataset({
    features: ['index', 'tweet', 'label', 'text'],
    num_rows: 19019
})

In [22]:
formatted_valid_data

Dataset({
    features: ['index', 'tweet', 'label', 'text'],
    num_rows: 4076
})

In [23]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = formatted_train_data,
    eval_dataset = formatted_valid_data,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        resume_from_checkpoint=True,
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # max_steps = 1,
        num_train_epochs = 2,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "/content/drive/MyDrive/Task_B/outputs_latest_Nemo",
        save_steps=700,       # Save the model checkpoint every logging step
        save_total_limit=10,
        # eval_steps=200,               # Evaluate and save checkpoints every 10 steps
        # do_eval=True                 # Perform evaluation at the end of training
    ),
)

Map (num_proc=2):   0%|          | 0/19019 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/4076 [00:00<?, ? examples/s]

In [None]:
trainer_stats = trainer.train()
model.save_pretrained("/content/drive/MyDrive/Task_B/Nemo_task_B_2_epochs") # Local saving
tokenizer.save_pretrained("/content/drive/MyDrive/Task_B/Nemo_task_B_2_epochs")

  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 19,019 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 4,754
 "-____-"     Number of trainable parameters = 57,016,320
  checkpoint_rng_state = torch.load(rng_file)


Step,Training Loss
3501,1.1
3502,0.9837
3503,1.0801
3504,0.8906
3505,0.8108
3506,0.8108
3507,1.1695
3508,0.8788
3509,0.8307
3510,0.8459


Step,Training Loss
3501,1.1
3502,0.9837
3503,1.0801
3504,0.8906
3505,0.8108
3506,0.8108
3507,1.1695
3508,0.8788
3509,0.8307
3510,0.8459


('/content/drive/MyDrive/Task_B/Nemo_task_B_2_epochs_final/tokenizer_config.json',
 '/content/drive/MyDrive/Task_B/Nemo_task_B_2_epochs_final/special_tokens_map.json',
 '/content/drive/MyDrive/Task_B/Nemo_task_B_2_epochs_final/tokenizer.json')

In [None]:
from unsloth import FastLanguageModel
import torch
model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "/content/drive/MyDrive/Task_B/Nemo_task_B_2_epochs", # YOUR MODEL YOU USED FOR TRAINING
        # model_head_file=None,
        # local_files_only=True,
        max_seq_length = 2048,
        dtype = None,
        load_in_4bit = True,
    )

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.10.0: Fast Mistral patching. Transformers = 4.44.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json:   0%|          | 0.00/165k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.31G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

Unsloth 2024.10.0 patched 40 layers with 40 QKV layers, 40 O layers and 40 MLP layers.


In [None]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        formatted_valid_data["tweet"][3], # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs)
res = tokenizer.decode(outputs[0])
res
pred = res.split(tokenizer.eos_token)[0][-1]
pred

'<s>Below is an instruction that describes the task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nGiven the input text, the goal is to identify whether it contains hate speech or not. Return 1 if there is hate speech in the text. Else return 0.\n\n### Input:\n#बूथ_चैकिंग\n#CO_Sadar द्वारा आगामी #AssemblyElections2022 के दृष्टिगत महाराजपुर थानाक्षेत्र के अन्तर्गत क्रिटिकल मतदान केन्द्रों का भौतिक सत्यापन कर मूलभूत सुविधाओं का निरीक्षण किया गया तथा सम्बन्धित को आवश्यक दिशा-निर्देश दिये गये।#AgelessDemocracy #YourVoteMatters https://t.co/5ZuM6yCq1b\n\n### Response:\n0</s>'

In [None]:
import json
FastLanguageModel.for_inference(model)
with open('/content/drive/MyDrive/Task_B/Nemo_inferencing_task_B.txt', 'w') as f:
  box = {}
  for i in range(0, len(formatted_valid_data)):
    print(f"Data {i}")
    inputs = tokenizer([alpaca_prompt.format(
          formatted_valid_data["tweet"][i], # input
          "", # output - leave this blank for generation!
    )], return_tensors = "pt").to("cuda")

    outputs = model.generate(**inputs)
    res = tokenizer.decode(outputs[0])
    pred = res.split(tokenizer.eos_token)[0][-1]
    # print(f"Pred is {pred}")
    # print(f"Input: {formatted_valid_data['tweet'][i]}")
    # print(f"Pred: {pred}")
    # print(f"Label: {formatted_valid_data['label'][i]}")
    print(str(formatted_valid_data["label"][i])==pred)
    box['input'] = formatted_valid_data['tweet'][i]
    box['pred'] = pred
    box['label'] = formatted_valid_data['label'][i]
    box['correct'] = pred==formatted_valid_data['label'][i]

    f.write(json.dumps(box))
    f.write(",")
    # f.write(f"Input: {formatted_valid_data[""]}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Data 1576
True
Data 1577
True
Data 1578
True
Data 1579
True
Data 1580
True
Data 1581
True
Data 1582
True
Data 1583
True
Data 1584
True
Data 1585
True
Data 1586
True
Data 1587
True
Data 1588
True
Data 1589
True
Data 1590
True
Data 1591
True
Data 1592
True
Data 1593
True
Data 1594
True
Data 1595
False
Data 1596
True
Data 1597
True
Data 1598
True
Data 1599
True
Data 1600
True
Data 1601
True
Data 1602
True
Data 1603
True
Data 1604
True
Data 1605
True
Data 1606
True
Data 1607
True
Data 1608
True
Data 1609
True
Data 1610
True
Data 1611
True
Data 1612
False
Data 1613
True
Data 1614
True
Data 1615
True
Data 1616
True
Data 1617
False
Data 1618
True
Data 1619
True
Data 1620
True
Data 1621
True
Data 1622
True
Data 1623
True
Data 1624
True
Data 1625
True
Data 1626
True
Data 1627
True
Data 1628
True
Data 1629
True
Data 1630
True
Data 1631
True
Data 1632
True
Data 1633
True
Data 1634
True
Data 1635
True
Data 1636
True
Data 1637
True
Da