## Detecting hate content in songs

I am fine-tuning a pretrained model to detect hate speech, then applying it to song lyrics transcribed from audio.

In [None]:
# Mount Google Drive at /content/drive; the training CSV is read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# install dependancies
pip install transformers

In [None]:
# import dependencies
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.metrics import precision_recall_fscore_support, accuracy_score


## Create Dataset

In [None]:
# Load the open-source comment data used to fine-tune the pretrained model.
# The file is read from the mounted Google Drive, so the drive-mount cell has to run first.
train = pd.read_csv('/content/drive/MyDrive/research/HackFS/train_comment.csv')

In [None]:
# Keep only the comment text and its "toxic" flag; the other columns are not used.
Dataset_train_original = train.loc[:, ["comment_text", "toxic"]]

In [None]:
# Preview the first 100 rows of the two selected columns
Dataset_train_original.head(100)

In [None]:
# Carve fixed-size samples out of the original data for fine-tuning:
# the first 5000 rows are the training set, the last 1000 rows the test set.
Dataset_train = Dataset_train_original.iloc[:5000]
Dataset_test = Dataset_train_original.iloc[-1000:]

In [None]:
# Inspect the training sample
Dataset_train

In [None]:
# Rename the 'toxic' column to 'target' in both samples
Dataset_train, Dataset_test = (
    frame.rename(columns={'toxic': 'target'})
    for frame in (Dataset_train, Dataset_test)
)

In [None]:
# Display the training sample again to confirm the column is now called 'target'
Dataset_train

### Pre-process dataset

Load the pretrained tokenizer and model, then encode the labels and the comment text.

In [None]:
# Load the tokenizer and the sequence-classification model from the same checkpoint.
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

# "unitary/toxic-bert" is the pretrained checkpoint that gets fine-tuned below.
# from_pt=True asks transformers to build the TensorFlow model from PyTorch weights.
tokenizer = AutoTokenizer.from_pretrained("unitary/toxic-bert")
model = TFAutoModelForSequenceClassification.from_pretrained("unitary/toxic-bert" , from_pt = True)

In [None]:
# Build the training labels as one-hot vectors of width num_classes.
# NOTE(review): the source column is the single "toxic" flag (presumably 0/1),
# so only the first two of the six positions can ever be set — confirm that
# expanding it to six classes is intended.
num_classes = 6  # Total number of classes
labels = tf.one_hot(Dataset_train["target"].to_numpy(), depth=num_classes)

In [None]:
# label shape: (number of training rows, num_classes)
print(labels.shape)

(5000, 6)


In [None]:
# Tokenize the training data input
def encode_code(comment_text, max_length=256):
    """Tokenize one comment into a fixed-length row of token ids.

    The previous version ran the model on the tokens and returned its output
    logits, which were then cast to integers and fed back to the model as if
    they were token ids. The model has to be trained and queried on the
    tokenized text itself, so this returns the tokenizer's ``input_ids``.

    Args:
        comment_text: The text to encode.
        max_length: Every comment is truncated or padded to exactly this many
            tokens, so rows from different comments can be stacked.

    Returns:
        Integer numpy array of shape (1, max_length) holding the token ids.
    """
    inputs = tokenizer(
        comment_text,
        padding='max_length',  # fixed width, otherwise np.vstack cannot stack rows
        truncation=True,
        max_length=max_length,
        return_tensors='tf',
    )
    # Only the ids are returned because the callers pass a single array to
    # model.fit / model.predict; the attention mask is therefore not used.
    return inputs['input_ids'].numpy()

code_train = np.vstack(Dataset_train['comment_text'].apply(encode_code))

In [None]:
# Shape of the stacked encode_code results for the training comments
code_train.shape

(5000, 6)

In [None]:
# Convert the values to int32 before they are passed to the model as its input.
# NOTE(review): casting floating-point values truncates them — confirm that
# code_train holds token ids at this point.
code_train = tf.cast(code_train , tf.int32)

### Compile and train the model

In [None]:
from tensorflow.keras.optimizers import Adam

# Compile the model for fine-tuning.
# A small learning rate is used, as is common when fine-tuning transformers.
# The loss is binary cross-entropy computed directly on the raw logits.
model.compile(
    optimizer=Adam(learning_rate=3e-5),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [None]:
# Fine-tune on the encoded training comments: 5 passes over the data, 32 rows per batch
model.fit(x=code_train, y=labels, batch_size=32, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f8d0579ae30>

### Evaluating

In [None]:
# Encode the test comments with the same function used for the training
# comments, stack the rows and convert them to int32
code_test = tf.cast(
    np.vstack(Dataset_test['comment_text'].apply(encode_code)),
    tf.int32,
)
code_test

In [None]:
# Ground-truth labels of the test sample as a numpy array, followed by their count
labels_test = Dataset_test["target"].to_numpy(copy=True)
labels_test.shape[0]

In [None]:
# Run the model on the encoded test data, then keep for every row the index
# of the largest logit as the predicted label
result = model.predict(code_test)
predicted_labels = np.argmax(result.logits, axis=-1)



In [None]:
# Number of predictions produced for the test sample
len(predicted_labels)

1000

In [None]:
# Score the predictions against the held-out test labels.
# The test-specific names are used directly: rebinding `labels` here would
# replace the one-hot training labels, so re-running the training cell after
# this one would silently train on the wrong data.
# average='macro' gives every class the same weight regardless of its size.
precision, recall, f1score = precision_recall_fscore_support(
    labels_test, predicted_labels, average='macro'
)[:3]
acc = accuracy_score(labels_test, predicted_labels)
print(f'acc: {acc}, precision: {precision}, recall: {recall}, f1score: {f1score}')

acc: 0.713, precision: 0.5526720070229343, recall: 0.6499999999999999, f1score: 0.5328464301817016


### Speech to text input

In [None]:
import getpass
import json
import os

import requests
from transformers import pipeline

# Never store the access token in the notebook. It is taken from the
# HF_API_TOKEN environment variable, with an interactive prompt as fallback.
# NOTE(review): a token was previously committed in this cell; treat it as
# compromised and revoke it in the Hugging Face account settings.
API_TOKEN = os.environ.get("HF_API_TOKEN") or getpass.getpass("Hugging Face API token: ")

headers = {"Authorization": f"Bearer {API_TOKEN}"}

# using pretrained model
API_URL = "https://api-inference.huggingface.co/models/facebook/wav2vec2-base-960h"

def query(filename, timeout=120):
    """Send an audio file to the hosted speech-recognition endpoint.

    Args:
        filename: Path of the audio file; its raw bytes form the request body.
        timeout: Seconds to wait for the server before requests gives up,
            so a stalled connection cannot block the notebook forever.

    Returns:
        The decoded JSON body of the response. Error responses are returned
        the same way; no exception is raised for HTTP error status codes.
    """
    with open(filename, "rb") as f:
        data = f.read()
    response = requests.post(API_URL, headers=headers, data=data, timeout=timeout)
    return json.loads(response.content.decode("utf-8"))


In [None]:
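# Optional one-time download of the sample song (left disabled).
# When enabled, it saves the file as BINGO.mp3 in the current working directory.
# song = requests.get("https://www.macaronisoup.com/songs/mp3/BINGO.mp3")
# file = open("BINGO.mp3", "wb")
# for chunk in song.iter_content(100000):
#   file.write(chunk)
# file.close()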

In [None]:
# Path of the audio file to transcribe.
# NOTE(review): this is a relative path (no leading "/") — confirm it resolves
# to the location where BINGO.mp3 is actually stored.
url = "content/BINGO.mp3"

In [None]:
# Transcribe the audio file whose path is stored in `url`.
# The previous version passed the literal string "URL" everywhere, so the
# `url` variable was never used and a file literally named "URL" was expected.
# NOTE(review): the hosted-API response assigned to `data` is immediately
# overwritten by the raw file bytes, and neither value is used in the cells
# shown here — consider dropping both.
data = query(url)
with open(url, "rb") as f:
  data = f.read()

# Local speech-recognition pipeline using the same wav2vec2 checkpoint
pipe = pipeline("automatic-speech-recognition", "facebook/wav2vec2-base-960h")

# Result of the local transcription; the next cell reads its "text" entry
input = pipe(url)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Pull the transcript out of the pipeline result and wrap it in a one-element list.
# NOTE: this rebinds `input` (also shadowing the builtin), so the cell works only
# once per transcription — on a second run `input` is already a list.
input  = [input["text"]]
input

{'text': 'THERE WAS A FARMER HAD ADON ND BIGOE WAS HIS NAY U B A AN GT U A AN GY U B A ANGY UAN BINGOE WAS HIS AY MO B AY AND G O THERE WAS A BARBER'}

### Result

In [None]:
# Put the transcript into a one-column frame so it goes through the same
# encoding path as the training and test comments
input = pd.DataFrame(input, columns=['comment_text'])
# tokenize the input song, stack the rows and convert them to int32
input = tf.cast(np.vstack(input['comment_text'].apply(encode_code)), tf.int32)
input.shape

In [None]:
# Predict on the encoded song and keep the index of the largest logit per row
output = np.argmax(model.predict(input).logits, axis=-1)



In [None]:
# Index 1 is reported as toxic; every other index is reported as non-toxic
print("toxic" if output[0] == 1 else "Non-toxic")