In [1]:
!pip install bitsandbytes
!pip install accelerate
!pip install --upgrade transformers
!pip install --upgrade peft
!pip install --upgrade datasets
!pip install evaluate


Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.44.1
Collecting transformers
  Downloading transformers-4.46.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.46.1-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m73.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.45.1
    Uninstalling transformers-4.45.1:
      Successf

In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, BitsAndBytesConfig
import torch
import pandas as pd
import numpy as np

from transformers import Trainer, TrainingArguments
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from peft import get_peft_model, LoraConfig, TaskType

In [3]:
train_df = pd.read_csv('/kaggle/input/dataset/NLP_ass_train.tsv', sep='\t')
train_df.loc[len(train_df)] = train_df.columns
# add column headings , sentence and label
train_df.columns = ['sentence', 'labels']


# load the test set 
test_df = pd.read_csv('/kaggle/input/dataset/NLP_ass_test.tsv',sep='\t')
test_df.loc[len(test_df)] = test_df.columns
# add column headings , sentence and label
test_df.columns = ['sentence', 'labels']

# load the validation set
valid_df = pd.read_csv('/kaggle/input/dataset/NLP_ass_valid.tsv',sep='\t')
valid_df.loc[len(valid_df)] = valid_df.columns
# add column headings , sentence and label
valid_df.columns = ['sentence', 'labels']

# label_mapping = {
#     'normal': 0,
#     'hatespeech': 1,
#     'offensive': 2
# }

# # Apply the mapping to the 'labels' column in train_df
# train_df['labels'] = train_df['labels'].map(label_mapping)
# test_df['labels'] = test_df['labels'].map(label_mapping)
# valid_df['labels'] = valid_df['labels'].map(label_mapping)

In [4]:
def intersection(df1,df2,col):
    """
    This function takes two dataframes and a column name as input and returns the size of the intersetion of the two dataframes under the given column name.
    """
    
    t1 = df1[col].tolist()
    t2 = df2[col].tolist()
    
    return len(set(t1).intersection(set(t2)))

In [5]:
# the number of samples common to both the train and test set 
intersection(test_df,train_df,"sentence")

5

In [6]:
from datasets import Dataset
train_df = Dataset.from_pandas(train_df)
test_df = Dataset.from_pandas(test_df)
valid_df = Dataset.from_pandas(valid_df)


In [7]:
# data preprocessing
def format_dataset(data_point):
    prompt = f"""###SYSTEM: Classify the sentence as normal, hatespeech, or offensive.
###Sentence: {data_point['sentence'].lower()}
"""
    
    tokens = tokenizer(prompt,
        truncation=True,
        max_length=256,
        padding="max_length"
    )
    
    labels = tokenizer(data_point['labels'],
        truncation=True,
        max_length=256
    )
    tokens['labels'] = labels['input_ids']
    
    return tokens

In [8]:
# initialisig the tokenizer
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base", padding_side="right",)
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [9]:
train_dataset = train_df.map(format_dataset)
test_dataset = test_df
valid_dataset = valid_df.map(format_dataset)

Map:   0%|          | 0/15383 [00:00<?, ? examples/s]

Map:   0%|          | 0/1922 [00:00<?, ? examples/s]

In [10]:
print(train_dataset[0].keys())
train_dataset = train_dataset.remove_columns(['sentence', "__index_level_0__"])
print(train_dataset[0].keys())
# test_dataset = test_dataset.remove_columns(['sentence', "__index_level_0__"])
valid_dataset = valid_dataset.remove_columns(['sentence', "__index_level_0__"])

dict_keys(['sentence', 'labels', '__index_level_0__', 'input_ids', 'attention_mask'])
dict_keys(['labels', 'input_ids', 'attention_mask'])


In [11]:
# setting up the model

model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base", device_map="auto")
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [12]:
# setting the lora parameters for the model


model.gradient_checkpointing_enable()

peft_config = LoraConfig(inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1, peft_type="SEQ_CLS")
model = get_peft_model(model, peft_config)

print(model.print_trainable_parameters())

trainable params: 884,736 || all params: 248,462,592 || trainable%: 0.3561
None


In [13]:
# parallelizing
import torch
if torch.cuda.device_count() > 1: 
    model.is_parallelizable = True
    model.model_parallel = True

In [14]:
import nltk
import evaluate
nltk.download("punkt", quiet=True)
metric = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [15]:
# def compute_metrics(eval_preds):
#    preds, labels = eval_preds

#    # decode preds and labels
#    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
#    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True,max_length=1024)
#    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True,max_length=1024)

#    # rougeLSum expects newline after each sentence
#    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
#    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

#    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
  
#    return result

In [16]:
# setting the training parameters
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
L_RATE = 1e-3
BATCH_SIZE = 32
PER_DEVICE_EVAL_BATCH = 32
WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 1
NUM_EPOCHS = 4

# Set up training arguments
training_args = Seq2SeqTrainingArguments(
   output_dir="output",
   eval_strategy="epoch",
   learning_rate=L_RATE,
   per_device_train_batch_size=BATCH_SIZE,
   per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
   weight_decay=WEIGHT_DECAY,
   save_total_limit=SAVE_TOTAL_LIM,
   num_train_epochs=NUM_EPOCHS,
   predict_with_generate=True,
    report_to=[]
)

In [17]:
# initialising the trainer
trainer = Seq2SeqTrainer(
   model=model,
   args=training_args,
   train_dataset=train_dataset,
   eval_dataset=valid_dataset,
#    tokenizer=tokenizer,
   data_collator=data_collator,
#    compute_metrics=compute_metrics
)

In [18]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss
1,No log,No log
2,0.342500,No log
3,0.276100,No log
4,0.258800,No log


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


TrainOutput(global_step=1924, training_loss=0.28224581442850794, metrics={'train_runtime': 4122.1706, 'train_samples_per_second': 14.927, 'train_steps_per_second': 0.467, 'total_flos': 2.115086389346304e+16, 'train_loss': 0.28224581442850794, 'epoch': 4.0})

In [19]:
# make prediction with the model and report accuracy and macro f1 score
from sklearn.metrics import accuracy_score, f1_score
prediction = []
for sample in test_dataset:
    txt = f"""###SYSTEM: Classify the sentence as normal, hatespeech or offensive.
###Sentence: {sample['sentence']}
"""
#     print(txt)
    tokens = tokenizer(txt, return_tensors="pt")['input_ids'].to(model.device)
    output = model.generate(tokens, max_length=100)
    print(tokenizer.decode(output[0]))
    sentence = tokenizer.decode(output[0])
    
    # the sentence is of the form <pad> {label}</s>
    # we need to extract the label
    label = sentence.split()[1]
    # remove the </s> token
    label = label[:-4]
#     print(label)
    prediction.append(label.lower())
    
    

    



<pad> hatespeech</s>
<pad> normal</s>
<pad> hatespeech</s>
<pad> normal</s>
<pad> normal</s>
<pad> normal</s>
<pad> hatespeech</s>
<pad> normal</s>
<pad> offensive</s>
<pad> normal</s>
<pad> normal</s>
<pad> offensive</s>
<pad> hatespeech</s>
<pad> hatespeech</s>
<pad> normal</s>
<pad> offensive</s>
<pad> normal</s>
<pad> normal</s>
<pad> hatespeech</s>
<pad> hatespeech</s>
<pad> hatespeech</s>
<pad> normal</s>
<pad> hatespeech</s>
<pad> hatespeech</s>
<pad> hatespeech</s>
<pad> hatespeech</s>
<pad> normal</s>
<pad> hatespeech</s>
<pad> hatespeech</s>
<pad> normal</s>
<pad> hatespeech</s>
<pad> normal</s>
<pad> hatespeech</s>
<pad> normal</s>
<pad> normal</s>
<pad> normal</s>
<pad> normal</s>
<pad> offensive</s>
<pad> normal</s>
<pad> offensive</s>
<pad> normal</s>
<pad> normal</s>
<pad> hatespeech</s>
<pad> normal</s>
<pad> normal</s>
<pad> hatespeech</s>
<pad> normal</s>
<pad> normal</s>
<pad> hatespeech</s>
<pad> hatespeech</s>
<pad> hatespeech</s>
<pad> hatespeech</s>
<pad> normal<

In [20]:
# calculating accurave and F1 score
accuracy = accuracy_score(test_df['labels'], prediction)
f1 = f1_score(test_df['labels'], prediction, average='macro')

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")

Accuracy: 0.6886694386694386
F1 Score: 0.6681422176308539
