In [1]:
# Installing pyspark and spark-nlp
! pip install --upgrade -q pyspark==3.3.0 spark-nlp==4.2.8
! pip install Unidecode
! pip install transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.3/281.3 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m453.8/453.8 kB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 kB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Unidecode
  Downloading Unidecode-1.3.6-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.9/235.9 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Unidecode
Successfully installed Unidecode-1.3.6
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Dow

In [2]:
import string
import unidecode
import json
import pandas as pd
import numpy as np

import sparknlp
import pyspark.sql.functions as F

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from sparknlp.annotator import *
from sparknlp.base import *
from sparknlp.pretrained import PretrainedPipeline
from pyspark.sql.types import StringType, IntegerType

In [3]:
spark = sparknlp.start(gpu=True)
model_hatexplain = "bert_sequence_classifier_hatexplain"
model_emotion = "bert_sequence_classifier_emotion"


In [4]:
def run_pipeline(model, text, results):  
  document_assembler = DocumentAssembler() \
      .setInputCol('text') \
      .setOutputCol('document')

  tokenizer = Tokenizer() \
      .setInputCols(['document']) \
      .setOutputCol('token')

  sequenceClassifier = BertForSequenceClassification\
        .pretrained(model, 'en') \
        .setInputCols(['token', 'document']) \
        .setOutputCol('pred_class')

  pipeline = Pipeline(
      stages=[
          document_assembler, 
          tokenizer, 
          sequenceClassifier])

  df = spark.createDataFrame(text, StringType()).toDF("text")
  results[model]=(pipeline.fit(df).transform(df))

def preprocess_text(text):
    text = text.lower()
    text = unidecode.unidecode(text)
    return text

In [26]:
# text_hatexplain = ["""The police and the government protecting muslim gangs makes me mad. They are traitors!"""]
# text_hatexplain = ["""Muslims believe in all teachings of the Koran, even if it has 1400 years. This evil book is the main source of evil in all the world."""]
# text_hatexplain = ["""I think that Muslims can be a problem for our society, because they do not want to conform to our costumes. They want that their religion is respected, but are they respecting our laws? One day they will oppress us with the Sharia and polygamy."""]
text_hatexplain = ["""Muslims separate girls and boys in school from small age even though it is illegal. They are brain washed"""]
# text_hatexplain = ["""Parent are forcing children to religious activities than sports this is reason for sucides in young people."""]

text_hatexplain = [preprocess_text(text) for text in text_hatexplain]
model_dict = {model_hatexplain: text_hatexplain, model_emotion: text_hatexplain}


results = {}
for model, text in zip(model_dict.keys(),model_dict.values()):
  run_pipeline(model, text, results)

bert_sequence_classifier_hatexplain download started this may take some time.
Approximate size to download 391.1 MB
[OK!]
bert_sequence_classifier_emotion download started this may take some time.
Approximate size to download 391.1 MB
[OK!]


In [27]:
model_dfs = {}
for model_name, result in zip(results.keys(),results.values()):  
  res = result.select(F.explode(F.arrays_zip(result.document.result, 
                                             result.pred_class.result,
                                             result.pred_class.metadata)).alias("col"))\
              .select(F.expr("col['1']").alias("prediction"),
                      F.expr("col['2']").alias("confidence"),
                      F.expr("col['0']").alias("sentence"))
  if res.count() > 0:
        udf_func = F.udf(lambda x, y:  x[str(y)])
        print("\n",model_name,"\n") 
        res.withColumn('confidence', udf_func(res.confidence, res.prediction)).show(truncate=False)
        # add the resulting data frame to the dictionary
        model_dfs[model_name + "_df"] = res
        print("\n**********************************\n")


hate_df = model_dfs["bert_sequence_classifier_hatexplain_df"]
emotion_df = model_dfs["bert_sequence_classifier_emotion_df"]

hate_df = hate_df.withColumnRenamed('prediction', 'speech')
hate_df = hate_df.drop("confidence")
emotion_df = emotion_df.withColumnRenamed('prediction', 'emotion')
emotion_df = emotion_df.drop("confidence")

joined_df = hate_df.join(emotion_df, "sentence")
joined_df.show()


 bert_sequence_classifier_hatexplain 

+----------+----------+---------------------------------------------------------------------------------------------------------+
|prediction|confidence|sentence                                                                                                 |
+----------+----------+---------------------------------------------------------------------------------------------------------+
|offensive |0.48284486|muslims separate girls and boys in school from small age even though it is illegal. they are brain washed|
+----------+----------+---------------------------------------------------------------------------------------------------------+


**********************************


 bert_sequence_classifier_emotion 

+----------+----------+---------------------------------------------------------------------------------------------------------+
|prediction|confidence|sentence                                                                          

In [28]:
from pyspark.sql.functions import first, col
input_promt = joined_df.select(first(col(joined_df.columns[0]))).collect()[0][0]
print("input_promt :"+ input_promt)

from pyspark.sql.functions import when, concat_ws

def add_counterspeech_column(df):
    df_with_counterspeech = df.withColumn(
        "counterspeech",
        when((df.speech == "hate speech") & (df.emotion == "surprise"), "I'm surprised to hear that kind of language. Let's strive to use our words to create positive change instead.")
        .when((df.speech == "hate speech") & (df.emotion == "joy"), "There's nothing to celebrate about hate speech or offensive language. Let's focus on kindness and respect instead.")
        .when((df.speech == "hate speech") & (df.emotion == "anger"), "Your anger is misplaced. It's important to focus on the issue, not attack individuals or groups.")
        .when((df.speech == "hate speech") & (df.emotion == "love"), "Love is about acceptance and kindness. Let's work towards building a world where everyone feels valued and respected.")
        .when((df.speech == "hate speech") & (df.emotion == "fear"), "It's important to address real issues, but spreading fear and hate only makes things worse.")
        .when((df.speech == "hate speech") & (df.emotion == "sadness"), "It's sad that we live in a world where hate speech and offensive language exist. Let's work together to change that.")
        .when((df.speech == "offensive") & (df.emotion == "surprise"), "We should use our words to build bridges, not walls.")
        .when((df.speech == "offensive") & (df.emotion == "joy"), "It's important to recognize the harm that words can cause. Let's work towards building a more inclusive and respectful society.")
        .when((df.speech == "offensive") & (df.emotion == "anger"), "I understand you're angry, but your words are hurtful and not productive.")
        .when((df.speech == "offensive") & (df.emotion == "love"), "We should use our words to uplift and support each other, not tear each other down.")
        .when((df.speech == "offensive") & (df.emotion == "fear"), "We should try to understand each other instead of resorting to fear-mongering.")
        .when((df.speech == "offensive") & (df.emotion == "sadness"), "Words can hurt, and it's important to recognize that. Let's work towards creating a society where everyone feels safe and respected.")
        .otherwise("*")
    )
    return df_with_counterspeech

joined_df = add_counterspeech_column(joined_df)
joined_df.show()

joined_df = joined_df.filter(joined_df.counterspeech != '*')
joined_df.show()

input_promt :muslims separate girls and boys in school from small age even though it is illegal. they are brain washed
+--------------------+---------+-------+--------------------+
|            sentence|   speech|emotion|       counterspeech|
+--------------------+---------+-------+--------------------+
|muslims separate ...|offensive|  anger|I understand you'...|
+--------------------+---------+-------+--------------------+

+--------------------+---------+-------+--------------------+
|            sentence|   speech|emotion|       counterspeech|
+--------------------+---------+-------+--------------------+
|muslims separate ...|offensive|  anger|I understand you'...|
+--------------------+---------+-------+--------------------+



In [8]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [11]:
# from transformers output without training

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model_sample = GPT2LMHeadModel.from_pretrained("gpt2")

input_sequence = input_promt

input_ids = tokenizer.encode(input_sequence, return_tensors='pt')

generated_output = model_sample.generate(
    input_ids=input_ids,
    max_length=90,
    pad_token_id=tokenizer.eos_token_id,
)

print(tokenizer.decode(generated_output[0], skip_special_tokens=True))

parent are forcing children to religious activities than sports this is reason for sucides in young people.

The fact that the children are forced to participate in sports is a reason for sucides in young people.

The fact that the children are forced to participate in sports is a reason for sucides in young people.

The fact that the children are forced to participate in sports is a reason for sucides in young people.




In [12]:
from torch.utils.data import Dataset
import json

class ChatData(Dataset):
    def __init__(self, path:str, tokenizer):
        self.data = json.load(open(path, "r"))

        self.X = []
        for i in self.data["conan"]:
                self.X.append("<startofstring> "+i['hateSpeech']+" obsiously wrong because <bot>: " +i['counterSpeech']+" <endofstring>")

        print(self.X[0])

        # self.X = self.X[]  
        self.X_encoded = tokenizer(self.X,max_length=60, truncation=True, padding="max_length", return_tensors="pt")
        self.input_ids = self.X_encoded['input_ids']
        self.attention_mask = self.X_encoded['attention_mask']

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return (self.input_ids[idx], self.attention_mask[idx])

In [13]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.optim import Adam
from torch.utils.data import DataLoader
import tqdm
import torch

def train(chatData, generative_model, optim):

    epochs = 15
    for i in tqdm.tqdm(range(epochs)):
        for X, a in chatData:
            X = X.to(device)
            a = a.to(device)
            optim.zero_grad()
            loss = generative_model(X, attention_mask=a, labels=X).loss
            loss.backward()
            optim.step()
        torch.save(generative_model.state_dict(), "generative_model_state.pt")
        torch.save(generative_model, './generative_model_state1.pt')
        print(infer("We must send Islamics back to their native countries. They are too different from us."))

def infer(inp):
    inp = "<startofstring> "+inp+" obsiously wrong because <bot>: "
    inp = tokenizer(inp, return_tensors="pt")
    X = inp["input_ids"].to(device)
    a = inp["attention_mask"].to(device)
    output = generative_model.generate(X, attention_mask=a, max_new_tokens=60)#, pad_token_id=tokenizer.eos_token_id
    output = tokenizer.decode(output[0])
    return output.split("<pad>")[0]



In [15]:
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({"pad_token": "<pad>", 
                                "bos_token": "<startofstring>",
                                "eos_token": "<endofstring>"})
tokenizer.add_tokens(["<bot>:"])

generative_model = GPT2LMHeadModel.from_pretrained("gpt2")
generative_model.resize_token_embeddings(len(tokenizer))

generative_model = generative_model.to(device)

chatData = ChatData("./hate_speech_english.json", tokenizer)
chatData =  DataLoader(chatData, batch_size=64)

generative_model.train()

optim = Adam(generative_model.parameters(), lr=1e-3)

print("training .... ")
train(chatData, generative_model, optim)

<startofstring> According to a recent OFSTED report, a school in Birmingham is still segregating girls and boys, despite a Hight Court ruling in 2017 that this is unlawful. obsiously wrong because <bot>: To be fair, the OFSTED report is more concerned with lack of enforcement and less about focussing on the practice of any particular faith. <endofstring>
training .... 


  0%|          | 0/15 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  7%|▋         | 1/15 [01:04<14:57, 64.07s/it]

<startofstring> We must send Islamics back to their native countries. They are too different from us. obsiously wrong because <bot>:  wrong because 


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 13%|█▎        | 2/15 [02:04<13:26, 62.02s/it]

<startofstring> We must send Islamics back to their native countries. They are too different from us. obsiously wrong because <bot>: I think that is a fact that is a fact. 


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 20%|██        | 3/15 [03:05<12:19, 61.66s/it]

<startofstring> We must send Islamics back to their native countries. They are too different from us. obsiously wrong because <bot>: The number of Muslims are terrorists. <endofstring> 


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 27%|██▋       | 4/15 [04:06<11:14, 61.33s/it]

<startofstring> We must send Islamics back to their native countries. They are too different from us. obsiously wrong because <bot>: The situation of Muslims is dramatic. <endofstring> 


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 33%|███▎      | 5/15 [05:07<10:11, 61.13s/it]

<startofstring> We must send Islamics back to their native countries. They are too different from us. obsiously wrong because <bot>: The only ones that are Muslims are the ones that are sent back to their home countries. We should send them back to their motherland, where they are not allowed to enter. Try living in a stable, but stable country, and they are not allowed to enter. But please do not. <endofstring>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 40%|████      | 6/15 [06:08<09:10, 61.17s/it]

<startofstring> We must send Islamics back to their native countries. They are too different from us. obsiously wrong because <bot>: The problem is that muslims are not really concerned with Islamic radicalisation and Islamic radicalisation. But it is not the case for Muslims. They are concerned with Islamic radicalisation and Islamic radicalisation. The problem is that they are not really about migration. It is the migration to our own country


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 47%|████▋     | 7/15 [07:09<08:08, 61.02s/it]

<startofstring> We must send Islamics back to their native countries. They are too different from us. obsiously wrong because <bot>: There are too many Muslims in our countries, and we cannot even take the dog for a walk. We could send the dog back to Pakistan. We could send the dog back to Pakistan. <endofstring> 


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 53%|█████▎    | 8/15 [08:10<07:06, 60.94s/it]

<startofstring> We must send Islamics back to their native countries. They are too different from us. obsiously wrong because <bot>: So what have you done to deserve? <endofstring> 


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 60%|██████    | 9/15 [09:11<06:06, 61.05s/it]

<startofstring> We must send Islamics back to their native countries. They are too different from us. obsiously wrong because <bot>: I am not sure what Muslims would say, if they were British citizens, they would disagree. <endofstring> 


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 67%|██████▋   | 10/15 [10:12<05:04, 60.99s/it]

<startofstring> We must send Islamics back to their native countries. They are too different from us. obsiously wrong because <bot>: I am not sure I would use a muslim to describe my opinion. I am not even trying to become a muslim. I am not even trying to become a muslim. I am not even trying to become a realist. I am thinking of how to describe the situation of muslim


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 73%|███████▎  | 11/15 [11:13<04:04, 61.00s/it]

<startofstring> We must send Islamics back to their native countries. They are too different from us. obsiously wrong because <bot>: I am not sure I would use 'them' to describe a 'them and them' would be a better place. <endofstring> 


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 80%|████████  | 12/15 [12:14<03:03, 61.05s/it]

<startofstring> We must send Islamics back to their native countries. They are too different from us. obsiously wrong because <bot>: I am sure Islam is a country where people live and work. <endofstring> 


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 87%|████████▋ | 13/15 [13:15<02:01, 60.98s/it]

<startofstring> We must send Islamics back to their native countries. They are too different from us. obsiously wrong because <bot>: I am not sure I am the best plan. <endofstring> 


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 93%|█████████▎| 14/15 [14:16<01:01, 61.00s/it]

<startofstring> We must send Islamics back to their native countries. They are too different from us. obsiously wrong because <bot>: Why would you say that? I would not say that Muslims are dangerous, violent, intolerant, biased, sexist and male chauvinist. <endofstring> 


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
100%|██████████| 15/15 [15:17<00:00, 61.18s/it]

<startofstring> We must send Islamics back to their native countries. They are too different from us. obsiously wrong because <bot>: Why would you say that? Would you say the same if it were really true? Would you say the same if every other religion, or religion, or state, or nation, was a nation? Would you say the same? <endofstring> 





In [16]:
def get_user_input():
  input_speech = joined_df.select(first(col(joined_df.columns[1]))).collect()[0][0]
  print("INPUT_speech is classified as : " + input_speech)
  if (input_speech == "hate speech" or "offensive"):
    print("Process Counter Speech ... ")
    process_counterspeech(input_promt)
  else:
    print("Speech is normal!")

def process_counterspeech(inp):
  print("INPUT Offensive/Hate speech: " + inp + '\n')
  for i in range(1,4):
    output = ""
    output = infer(inp)
    output = output.split("<endofstring>")[0].split("obsiously wrong because <bot>:")[1]
    if (output == "") :
        output = joined_df.select(first(col(joined_df.columns[3]))).collect()[0][0]    
    print(f"OUTPUT Counter Speech {i}: "+ output + '\n')

In [30]:
print("infer from model : ")
get_user_input()


infer from model : 


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


INPUT_speech is classified as : offensive
Process Counter Speech ... 
INPUT Offensive/Hate speech: muslims separate girls and boys in school from small age even though it is illegal. they are brain washed



Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


OUTPUT Counter Speech 1:  Why do you think it is a matter of fact? 



Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


OUTPUT Counter Speech 2:  Gender segregation in faith is clearly a negative, but if you are serious about this, do not worry about it. 

OUTPUT Counter Speech 3:  Gender segregation in faith schools is clearly a negative, but not where you think it is. 



In [29]:
print(model)
print(generative_model)

bert_sequence_classifier_emotion
GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50261, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50261, bias=False)
)


In [None]:
#loading existing model
generative_model = torch.load('./generative_model_state1.pt')



def infer(inp):
    inp = "<startofstring> "+inp+" obsiously wrong because <bot>: "
    inp = tokenizer(inp, return_tensors="pt")
    X = inp["input_ids"].to(device)
    a = inp["attention_mask"].to(device)
    output = generative_model.generate(X, attention_mask=a, max_new_tokens=60)#, pad_token_id=tokenizer.eos_token_id
    output = tokenizer.decode(output[0])
    return output.split("<pad>")[0]

def process_counterspeech(inp):
  while True:
    print("INPUT : " + inp )
    print("infer from model : ")
    output = ""
    output = infer(inp).split("<endofstring>")[0].split("obsiously wrong because <bot>: ")[1]
    if (output == "") :
        output = joined_df.select(first(col(joined_df.columns[3]))).collect()[0][0]    
    print("OUTPUT : "+ output)


