# **Detect bullying in tweets**

## 1. Colab Setup

In [1]:
# Install PySpark and Spark NLP
!pip install -q pyspark==3.1.2 spark-nlp

In [2]:
import pandas as pd
import numpy as np
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from sparknlp.annotator import *
from sparknlp.base import *
from pyspark.ml.feature import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline

## 2. Start Spark Session

In [3]:
spark = sparknlp.start(gpu = True)

## 3. Select the DL model

## 4. Some sample examples

In [4]:
!pip install demoji

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting demoji
  Downloading demoji-1.1.0-py3-none-any.whl (42 kB)
[K     |████████████████████████████████| 42 kB 1.4 MB/s 
[?25hInstalling collected packages: demoji
Successfully installed demoji-1.1.0


In [100]:
import demoji
import re
import pandas as pd
import string
pd_data = pd.read_csv('/content/tweets.csv')
pd_data.shape

(47692, 2)

In [101]:
def clean(text):
    # pattern = re.compile(r"(#[A-Za-z0-9]+|@[A-Za-z0-9]+|https?:\/\/\S+|www\.\S+|\S+\.[a-z]+|RT @|)")
    # text = pattern.sub('', text)
    text = re.sub(r'[^\w\s]', '', text) #removing punctuations
    # text = re.sub(r'[\s]+', ' ',text) # removing extra spaces
    emoji = demoji.findall(text)
    for emot in emoji:
        text = re.sub(r"(%s)" % (emot), "_".join(emoji[emot].split()), text)
    return text
pd_data['tweet_text'] = pd_data['tweet_text'].apply(lambda text: clean(text))
# pd_data['tweet_text'] = pd_data['tweet_text'].apply(lambda x: ' '.join([w for w in x.spilt() if len(w)>3]))

In [102]:
data = spark.createDataFrame(pd_data)
data.show(1,False)

+---------------------------------------------------------+------------------+
|tweet_text                                               |cyberbullying_type|
+---------------------------------------------------------+------------------+
|In other words katandandre your food was crapilicious mkr|not_cyberbullying |
+---------------------------------------------------------+------------------+
only showing top 1 row



In [103]:
data = data.dropDuplicates(['tweet_text'])
data.count()

45959

In [104]:
train, test = data.randomSplit([0.90,0.10],4192)

In [105]:
train.toPandas()['cyberbullying_type'].value_counts()

age                    7195
religion               7157
ethnicity              7155
not_cyberbullying      7140
gender                 7106
other_cyberbullying    5637
Name: cyberbullying_type, dtype: int64

## 5. Define Spark NLP pipeline

In [106]:
documentAssembler = DocumentAssembler()\
    .setInputCol("tweet_text")\
    .setOutputCol("document")

use = UniversalSentenceEncoder.pretrained(name="tfhub_use_lg", lang="en")\
 .setInputCols("document")\
 .setOutputCol("sentence_embeddings")\

# tokenizer = sparknlp.annotator.token.tokenize.Tokenizer() \
#                 .setInputCols(["document"]) \
#                 .setOutputCol("token")
      
# bert_embeddings = BertEmbeddings().pretrained(name='small_bert_L4_256', lang='en') \
#                 .setInputCols(["document",'token'])\
#                 .setOutputCol("embeddings")


classifierdl = ClassifierDLApproach()\
  .setInputCols(["sentence_embeddings"])\
  .setOutputCol("class")\
  .setLabelColumn("cyberbullying_type")\
  .setBatchSize(16)\
  .setMaxEpochs(42)\
  .setDropout(0.4) \
  .setEnableOutputLogs(True)\
  .setLr(4e-3) 
use_clf_pipeline = Pipeline(
    stages = [documentAssembler,
        use,
        classifierdl])

tfhub_use_lg download started this may take some time.
Approximate size to download 753.3 MB
[OK!]


In [107]:
! rm -r /root/annotator_logs

## 6. Run the pipeline

In [108]:
pipelineModel = use_clf_pipeline.fit(train)

In [109]:
log_files = os.listdir("/root/annotator_logs")
log_files

['ClassifierDLApproach_9d801c5766c0.log']

In [None]:
pipelineModel.stages

[DocumentAssembler_e0cbf13231c2,
 UNIVERSAL_SENTENCE_ENCODER_5e0d8b922c74,
 ClassifierDLModel_8b8dcf894745]

In [61]:
pipelineModel.stages[2].write().overwrite().save("/content/drive/MyDrive/NLP/model")

In [110]:
import os
log_file_name = os.listdir("/root/annotator_logs")[0]

with open("/root/annotator_logs/"+log_file_name, "r") as log_file :
    print(log_file.read())

Training started - epochs: 42 - learning_rate: 0.004 - batch_size: 16 - training_examples: 41389 - classes: 6
Epoch 0/42 - 5.19s - loss: 3252.3345 - acc: 0.7725865 - batches: 2587
Epoch 1/42 - 4.81s - loss: 3131.0315 - acc: 0.81979334 - batches: 2587
Epoch 2/42 - 4.85s - loss: 3095.2124 - acc: 0.8350977 - batches: 2587
Epoch 3/42 - 4.75s - loss: 3070.1501 - acc: 0.84705555 - batches: 2587
Epoch 4/42 - 4.95s - loss: 3053.4421 - acc: 0.85505533 - batches: 2587
Epoch 5/42 - 4.76s - loss: 3037.3408 - acc: 0.86121833 - batches: 2587
Epoch 6/42 - 4.74s - loss: 3019.79 - acc: 0.86733854 - batches: 2587
Epoch 7/42 - 4.74s - loss: 3006.9102 - acc: 0.87125385 - batches: 2587
Epoch 8/42 - 4.83s - loss: 2996.5403 - acc: 0.875 - batches: 2587
Epoch 9/42 - 4.78s - loss: 2988.2385 - acc: 0.8781903 - batches: 2587
Epoch 10/42 - 4.92s - loss: 2980.568 - acc: 0.88121134 - batches: 2587
Epoch 11/42 - 4.76s - loss: 2975.8403 - acc: 0.8830481 - batches: 2587
Epoch 12/42 - 4.75s - loss: 2970.9238 - acc: 0.8

In [111]:
result = pipelineModel.transform(test)

## 7. Visualize results

In [None]:
result.columns

['tweet_text',
 'cyberbullying_type',
 'document',
 'sentence_embeddings',
 'class']

In [112]:
preds_df = result.select('cyberbullying_type','tweet_text',"class.result").toPandas()

In [113]:
preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

In [114]:
from sklearn.metrics import classification_report, accuracy_score
print (accuracy_score(preds_df['cyberbullying_type'], preds_df['result']))

0.8426351499233968


In [None]:
# from sklearn.metrics import classification_report, accuracy_score
# print (accuracy_score(preds_df['cyberbullying_type'], preds_df['result']))

In [115]:
df = pd.read_csv('/content/tweet.csv')
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
sw = set(stopwords.words('english'))
sw.remove('not')
sw.update(['rt', 'mkr', 'didn', 'bc', 'n', 'm', 
                  'im', 'll', 'y', 've', 'u', 'ur', 'don', 
                  'p', 't', 's', 'aren', 'kp', 'o', 'kat', 
                  'de', 're', 'amp', 'will', 'wa', 'e', 'like'])
def clean_new(text):
    # pattern = re.compile(r"(#[A-Za-z0-9]+|@\w+|https?://\S+|www\.\S+|\S+\.[a-z]+|RT @|[^\x00-\x7F]+)")
    # text = pattern.sub('', text)
    text = text.lower()
    remove_punc = re.compile(r"[%s]" % re.escape(string.punctuation))
    text = remove_punc.sub('', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    # text = ' '.join([word for word in text.split() if word not in sw])
    text = text.strip()
    return text
df['tweet_text'] = df['tweet_text'].apply(lambda text: clean_new(text))
new_data = spark.createDataFrame(df)
predictions = pipelineModel.transform(new_data)
pred =predictions.select('tweet_text',"class.result")
pred.show(truncate=False)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------+
|tweet_text                                                                                                                                                                                                                                                                               |result               |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------+
|priyankac19 narendramodi sponsored troll mrsgandhi                               

In [116]:
sent = clean_new('The anonymous user posted a series of inflammatory comments on a forum dedicated to discussing racial issues, using racial slurs and promoting harmful stereotypes about black people.')
new_data = spark.createDataFrame(pd.DataFrame([sent],columns=['tweet_text']))
predictions = pipelineModel.transform(new_data)
predictions.select('tweet_text',"class.result").toPandas()

Unnamed: 0,tweet_text,result
0,the anonymous user posted a series of inflamma...,[ethnicity]


In [None]:
# df = pd.read_csv('../input/testdf/tweet.csv')
# df['tweet_text'] = df['tweet_text'].apply(lambda text: clean(text))
# new_data = spark.createDataFrame(df)
# predictions = pipelineModel.transform(new_data)
# predictions.select('tweet_text',"class.result").toPandas()

In [66]:
clss_model = ClassifierDLModel.load('/content/drive/MyDrive/NLP/model')\
.setInputCols(['sentence_embeddings'])\
.setOutputCol("class")

nlpPipeline = Pipeline(stages=[documentAssembler, use, clss_model])
light_pipeline = LightPipeline(nlpPipeline.fit(spark.createDataFrame([['']]).toDF("text")))
ld_preds = light_pipeline.transform(new_data)


In [67]:
ld_preds_df = ld_preds.select('tweet_text',"class.result").toPandas()

In [68]:
ld_preds_df

Unnamed: 0,tweet_text,result
0,your jewish faith is disgusting and inferior y...,[religion]
