In [1]:
import gc

import pandas as pd
import torch

In [2]:
# Pick the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [3]:

# Load the train / test / validation splits.
#
# The files are read as header-less TSVs: the first line of each file is a data
# record, not column names. Passing header=None together with names=... keeps
# that first record as row 0 (instead of consuming it as a header and appending
# it back at the end) and avoids pandas rewriting its values through the
# column-name mangling rules.
COLUMN_NAMES = ['sentence', 'label']

# load the training set
train_df = pd.read_csv(
    '/kaggle/input/dataset/NLP_ass_train.tsv',
    sep='\t',
    header=None,
    names=COLUMN_NAMES,
)

# load the test set
test_df = pd.read_csv(
    '/kaggle/input/dataset/NLP_ass_test.tsv',
    sep='\t',
    header=None,
    names=COLUMN_NAMES,
)

# load the validation set
valid_df = pd.read_csv(
    '/kaggle/input/dataset/NLP_ass_valid.tsv',
    sep='\t',
    header=None,
    names=COLUMN_NAMES,
)





## Zero-Shot Prompting

In [4]:
# Zero-shot setup: Flan-T5 small tokenizer and model, plus the instruction prefix.

from transformers import T5ForConditionalGeneration, T5Tokenizer

checkpoint = "google/flan-t5-small"

tokenizer = T5Tokenizer.from_pretrained(checkpoint)
# device_map="auto" leaves weight placement to the library.
model = T5ForConditionalGeneration.from_pretrained(checkpoint, device_map="auto")

# Instruction that is prepended to every test sentence before generation.
input_text = "classify the following statement as 'normal', 'hatespeech' or 'offensive' : "

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [5]:
# Prompt the model with every test sentence and keep one lowercase label word
# per row, in the same order as the rows of test_df.

predictions = []

# Iterate over the column values directly so that alignment with
# test_df['label'] is positional and independent of the index labels.
for sentence in test_df['sentence']:
    # Named `prompt` rather than `input` to avoid shadowing the Python builtin.
    prompt = input_text + sentence
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    outputs = model.generate(input_ids)

    # skip_special_tokens removes <pad> and </s> during decoding, so the text
    # does not have to be cut apart by hand afterwards.
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Keep only the first whitespace-delimited word, lowercased. When the model
    # emits nothing but special tokens there is no word to take; record an
    # empty string instead of failing with IndexError.
    words = decoded.split()
    predictions.append(words[0].lower() if words else '')
    
    



In [6]:
# Score the generated labels against the gold labels of the test split.
from sklearn.metrics import accuracy_score, f1_score

accuracy = accuracy_score(y_true=test_df['label'], y_pred=predictions)

# Macro average: unweighted mean of the per-class F1 scores.
# NOTE(review): no `labels=` is passed, so the average presumably also covers
# any label string that appears only in `predictions` — confirm this is intended.
f1 = f1_score(y_true=test_df['label'], y_pred=predictions, average='macro')

print("Accuracy: ", accuracy)
print("Macro F1 Score: ", f1)

Accuracy:  0.27598752598752596
Macro F1 Score:  0.0526278764666512


In [7]:
# Release the current model before the next checkpoint is loaded.
# `del` only removes the name; a collection pass clears any reference cycles
# still holding the weights, and empty_cache() returns cached CUDA blocks.
del model
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [8]:
# Zero-shot setup: Flan-T5 base tokenizer and model, plus the instruction prefix.

checkpoint = "google/flan-t5-base"

tokenizer = T5Tokenizer.from_pretrained(checkpoint)
# device_map="auto" leaves weight placement to the library.
model = T5ForConditionalGeneration.from_pretrained(checkpoint, device_map="auto")

# Instruction that is prepended to every test sentence before generation.
input_text = "classify the following statement as 'normal', 'hatespeech' or 'offensive' : "

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [9]:
# Prompt the model with every test sentence and keep one lowercase label word
# per row, in the same order as the rows of test_df.

predictions = []

# Iterate over the column values directly so that alignment with
# test_df['label'] is positional and independent of the index labels.
for sentence in test_df['sentence']:
    # Named `prompt` rather than `input` to avoid shadowing the Python builtin.
    prompt = input_text + sentence
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    outputs = model.generate(input_ids)

    # skip_special_tokens removes <pad> and </s> during decoding, so the text
    # does not have to be cut apart by hand afterwards.
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Keep only the first whitespace-delimited word, lowercased. When the model
    # emits nothing but special tokens there is no word to take; record an
    # empty string instead of failing with IndexError.
    words = decoded.split()
    predictions.append(words[0].lower() if words else '')
    
    



In [10]:
# Score the generated labels against the gold labels of the test split.
from sklearn.metrics import accuracy_score, f1_score

accuracy = accuracy_score(y_true=test_df['label'], y_pred=predictions)

# Macro average: unweighted mean of the per-class F1 scores.
# NOTE(review): no `labels=` is passed, so the average presumably also covers
# any label string that appears only in `predictions` — confirm this is intended.
f1 = f1_score(y_true=test_df['label'], y_pred=predictions, average='macro')

print("Accuracy: ", accuracy)
print("Macro F1 Score: ", f1)

Accuracy:  0.5171517671517671
Macro F1 Score:  0.47239571197242197


In [11]:
# Release the current model before the next checkpoint is loaded.
# `del` only removes the name; a collection pass clears any reference cycles
# still holding the weights, and empty_cache() returns cached CUDA blocks.
del model
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

## Few-Shot Prompting

In [12]:
# Few-shot setup: Flan-T5 small tokenizer and model, plus a prompt that carries
# one worked example per class.

from transformers import T5ForConditionalGeneration, T5Tokenizer

checkpoint = "google/flan-t5-small"

tokenizer = T5Tokenizer.from_pretrained(checkpoint)
# device_map="auto" leaves weight placement to the library.
model = T5ForConditionalGeneration.from_pretrained(checkpoint, device_map="auto")

# Prompt prefix; the test sentence is appended directly after the final newline.
# NOTE(review): the examples are written as "Sentence: '<text>'" followed by the
# label on the next line, whereas the query ends up unquoted on its own line
# after "Sentence:" — confirm this asymmetry is intended.
input_text = '''classify the following sentence as offensive, hatespeech, normal
Sentence: 'bitches be so fine and got the nerve to like niggas 😒'
offensive
Sentence: 'why does anyone care what the albino nigger has to say'
hatespeech
Sentence: 'if we could teach a giraffe to paint that would be one handy sumbitch'
normal
Sentence:
'''



In [13]:
# Prompt the model with every test sentence and keep one lowercase label word
# per row, in the same order as the rows of test_df.

predictions = []

# Iterate over the column values directly so that alignment with
# test_df['label'] is positional and independent of the index labels.
for sentence in test_df['sentence']:
    # Named `prompt` rather than `input` to avoid shadowing the Python builtin.
    prompt = input_text + sentence
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    outputs = model.generate(input_ids)

    # skip_special_tokens removes <pad> and </s> during decoding, so the text
    # does not have to be cut apart by hand afterwards.
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Keep only the first whitespace-delimited word, lowercased. When the model
    # emits nothing but special tokens there is no word to take; record an
    # empty string instead of failing with IndexError.
    words = decoded.split()
    predictions.append(words[0].lower() if words else '')
    
    



In [14]:
# Score the generated labels against the gold labels of the test split.
from sklearn.metrics import accuracy_score, f1_score

accuracy = accuracy_score(y_true=test_df['label'], y_pred=predictions)

# Macro average: unweighted mean of the per-class F1 scores.
# NOTE(review): no `labels=` is passed, so the average presumably also covers
# any label string that appears only in `predictions` — confirm this is intended.
f1 = f1_score(y_true=test_df['label'], y_pred=predictions, average='macro')

print("Accuracy: ", accuracy)
print("Macro F1 Score: ", f1)

Accuracy:  0.30457380457380456
Macro F1 Score:  0.0756613751289288


In [15]:
# Release the current model before the next checkpoint is loaded.
# `del` only removes the name; a collection pass clears any reference cycles
# still holding the weights, and empty_cache() returns cached CUDA blocks.
del model
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [16]:
# Few-shot setup: Flan-T5 base tokenizer and model, plus a prompt that carries
# one worked example per class.

checkpoint = "google/flan-t5-base"

tokenizer = T5Tokenizer.from_pretrained(checkpoint)
# device_map="auto" leaves weight placement to the library.
model = T5ForConditionalGeneration.from_pretrained(checkpoint, device_map="auto")

# Prompt prefix; the test sentence is appended directly after the final newline.
# NOTE(review): the examples are written as "Sentence: '<text>'" followed by the
# label on the next line, whereas the query ends up unquoted on its own line
# after "Sentence:" — confirm this asymmetry is intended.
input_text = '''classify the following sentence as offensive, hatespeech, normal
Sentence: 'bitches be so fine and got the nerve to like niggas 😒'
offensive
Sentence: 'why does anyone care what the albino nigger has to say'
hatespeech
Sentence: 'if we could teach a giraffe to paint that would be one handy sumbitch'
normal
Sentence:
'''



In [17]:
# Prompt the model with every test sentence and keep one lowercase label word
# per row, in the same order as the rows of test_df.

predictions = []

# Iterate over the column values directly so that alignment with
# test_df['label'] is positional and independent of the index labels.
for sentence in test_df['sentence']:
    # Named `prompt` rather than `input` to avoid shadowing the Python builtin.
    prompt = input_text + sentence
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    outputs = model.generate(input_ids)

    # skip_special_tokens removes <pad> and </s> during decoding, so the text
    # does not have to be cut apart by hand afterwards.
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Keep only the first whitespace-delimited word, lowercased. When the model
    # emits nothing but special tokens there is no word to take; record an
    # empty string instead of failing with IndexError.
    words = decoded.split()
    predictions.append(words[0].lower() if words else '')
    
    



In [18]:
# Score the generated labels against the gold labels of the test split.
from sklearn.metrics import accuracy_score, f1_score

accuracy = accuracy_score(y_true=test_df['label'], y_pred=predictions)

# Macro average: unweighted mean of the per-class F1 scores.
# NOTE(review): no `labels=` is passed, so the average presumably also covers
# any label string that appears only in `predictions` — confirm this is intended.
f1 = f1_score(y_true=test_df['label'], y_pred=predictions, average='macro')

print("Accuracy: ", accuracy)
print("Macro F1 Score: ", f1)

Accuracy:  0.48492723492723494
Macro F1 Score:  0.22943917593198168


In [19]:
# Release the last model now that all experiments have been scored.
# `del` only removes the name; a collection pass clears any reference cycles
# still holding the weights, and empty_cache() returns cached CUDA blocks.
del model
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()