# **Detect bullying in tweets**

## 1. Colab Setup

In [63]:
# Install PySpark and Spark NLP
!pip install -q pyspark==3.1.2 spark-nlp

[0m

In [64]:
import pandas as pd
import numpy as np
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from sparknlp.annotator import *
from sparknlp.base import *
from pyspark.ml.feature import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline

## 2. Start Spark Session

In [65]:
# Start the Spark session through Spark NLP's helper, requesting the GPU build.
# NOTE(review): presumably this expects a GPU runtime — confirm before running
# on a CPU-only host.
spark = sparknlp.start(gpu = True)

## 3. Select the DL model

## 4. Load and clean the training data

In [66]:
!pip install demoji

[0m

In [67]:
import demoji
import re
import pandas as pd
import string
# Read the labelled tweets into pandas and display (rows, columns).
pd_data = pd.read_csv(
    '../input/cyberbullying-classification/cyberbullying_tweets.csv'
)
pd_data.shape

(47692, 2)

In [68]:
def clean(text):
    """Strip ASCII punctuation from a tweet and spell out its emojis.

    Every character in ``string.punctuation`` is removed first. Each emoji
    reported by ``demoji.findall`` is then replaced by its description with
    whitespace turned into underscores (e.g. "grinning face" becomes
    "grinning_face"). Hashtags, mentions and URLs are not removed as tokens;
    only their punctuation characters are stripped.

    Args:
        text: Raw tweet text (a ``str``).

    Returns:
        The cleaned tweet as a ``str``.
    """
    text = re.sub(r"[%s]" % re.escape(string.punctuation), '', text)
    emoji = demoji.findall(text)
    # Replace the longest emoji sequences first. The mapping's iteration order
    # is not controlled here, and replacing a short emoji before a longer
    # sequence that contains it (e.g. a thumbs-up before a thumbs-up with a
    # skin-tone modifier) would leave a dangling modifier in the text.
    for emot in sorted(emoji, key=len, reverse=True):
        # Literal replacement: the emoji is data, not a regex pattern.
        text = text.replace(emot, "_".join(emoji[emot].split()))
    return text
# Run every tweet through `clean`, overwriting the raw text column.
pd_data['tweet_text'] = pd_data['tweet_text'].apply(clean)
# Disabled experiment: keep only words longer than three characters.
# pd_data['tweet_text'] = pd_data['tweet_text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

In [69]:
# Hand the cleaned pandas frame to Spark and show the first row untruncated.
data = spark.createDataFrame(pd_data)
data.show(n=1, truncate=False)

+---------------------------------------------------------+------------------+
|tweet_text                                               |cyberbullying_type|
+---------------------------------------------------------+------------------+
|In other words katandandre your food was crapilicious mkr|not_cyberbullying |
+---------------------------------------------------------+------------------+
only showing top 1 row



22/11/03 10:25:23 WARN TaskSetManager: Stage 62 contains a task of very large size (1123 KiB). The maximum recommended task size is 1000 KiB.


In [70]:
# Keep one row per distinct cleaned tweet, then report how many rows remain.
data = data.dropDuplicates(subset=['tweet_text'])
data.count()

22/11/03 10:25:23 WARN TaskSetManager: Stage 63 contains a task of very large size (1123 KiB). The maximum recommended task size is 1000 KiB.


45979

In [71]:
# Cache `data` before splitting. `train` and `test` are separate lazy
# DataFrames that each re-evaluate the upstream plan, and `dropDuplicates` on a
# column subset keeps an arbitrary row per tweet; caching pins one
# materialisation so both splits are drawn from the same rows.
data = data.cache()
# 90/10 split with a fixed seed for repeatability.
train, test = data.randomSplit([0.90, 0.10], seed=4192)

In [72]:
# Class balance of the training split. Only the label column is collected to
# the driver; the tweet text is not needed for counting.
train.select('cyberbullying_type').toPandas()['cyberbullying_type'].value_counts()

22/11/03 10:25:24 WARN TaskSetManager: Stage 66 contains a task of very large size (1123 KiB). The maximum recommended task size is 1000 KiB.


religion               7195
age                    7176
not_cyberbullying      7140
ethnicity              7136
gender                 7118
other_cyberbullying    5655
Name: cyberbullying_type, dtype: int64

## 5. Define Spark NLP pipeline

In [73]:
# Stage 1: wrap the raw tweet text column into a Spark NLP "document" column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("tweet_text")
    .setOutputCol("document")
)

# Stage 2: pretrained Universal Sentence Encoder ("tfhub_use_lg", English)
# producing one embedding per document.
use = (
    UniversalSentenceEncoder.pretrained(name="tfhub_use_lg", lang="en")
    .setInputCols("document")
    .setOutputCol("sentence_embeddings")
)

# Stage 3: trainable ClassifierDL head on top of the sentence embeddings,
# predicting the `cyberbullying_type` label.
classifierdl = (
    ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("class")
    .setLabelColumn("cyberbullying_type")
    .setBatchSize(32)
    .setMaxEpochs(40)
    .setDropout(0.4)
    .setRandomSeed(4103)
    .setLr(4e-3)
)

# Full pipeline: assembler -> encoder -> classifier.
use_clf_pipeline = Pipeline(stages=[documentAssembler, use, classifierdl])

tfhub_use_lg download started this may take some time.
Approximate size to download 753.3 MB
[OK!]


## 6. Run the pipeline

In [74]:
# Fit the whole pipeline (assembler, encoder, classifier) on the training split.
pipelineModel = use_clf_pipeline.fit(train)

22/11/03 10:25:29 WARN TaskSetManager: Stage 68 contains a task of very large size (1123 KiB). The maximum recommended task size is 1000 KiB.
22/11/03 10:32:27 WARN TaskSetManager: Stage 71 contains a task of very large size (1123 KiB). The maximum recommended task size is 1000 KiB.
22/11/03 10:32:27 WARN TaskSetManager: Stage 73 contains a task of very large size (1123 KiB). The maximum recommended task size is 1000 KiB.
2022-11-03 10:39:26.435703: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/d0c9e9b6a0e5_classifier_dl15572764394216352173
2022-11-03 10:39:26.535994: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2022-11-03 10:39:26.536068: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/d0c9e9b6a0e5_classifier_dl15572764394216352173
2022-11-03 10:39:27.091199: I external/org_tensorflow/tensorflow/cc/saved

Training started - epochs: 40 - learning_rate: 0.004 - batch_size: 32 - training_examples: 41419 - classes: 6
Epoch 1/40 - 9.93s - loss: 1623.983 - acc: 0.78055406 - batches: 1295
Epoch 2/40 - 9.39s - loss: 1577.5076 - acc: 0.8189019 - batches: 1295
Epoch 3/40 - 9.70s - loss: 1559.5613 - acc: 0.833585 - batches: 1295
Epoch 4/40 - 9.78s - loss: 1549.4077 - acc: 0.84222853 - batches: 1295
Epoch 5/40 - 9.51s - loss: 1543.3098 - acc: 0.8488698 - batches: 1295
Epoch 6/40 - 9.51s - loss: 1536.8386 - acc: 0.85452086 - batches: 1295
Epoch 7/40 - 9.50s - loss: 1530.629 - acc: 0.8591576 - batches: 1295
Epoch 8/40 - 9.60s - loss: 1524.9606 - acc: 0.863408 - batches: 1295
Epoch 9/40 - 9.31s - loss: 1519.0955 - acc: 0.8665475 - batches: 1295
Epoch 10/40 - 9.37s - loss: 1515.7308 - acc: 0.87007344 - batches: 1295
Epoch 11/40 - 9.52s - loss: 1512.4517 - acc: 0.8726816 - batches: 1295
Epoch 12/40 - 9.36s - loss: 1509.8987 - acc: 0.87468606 - batches: 1295
Epoch 13/40 - 9.51s - loss: 1508.3595 - acc: 0

In [75]:
# pipelineModel.save("/content/drive/MyDrive/NLP/model")

In [76]:
# pipelineModel = PipelineModel.load('/content/drive/MyDrive/NLP/model')  # needs: from pyspark.ml import PipelineModel

In [77]:
# Run the fitted pipeline over the held-out split to obtain predictions.
result = pipelineModel.transform(test)

## 7. Visualize results

In [78]:
# List the columns of the transformed frame (inputs plus pipeline outputs).
result.columns

['tweet_text',
 'cyberbullying_type',
 'document',
 'sentence_embeddings',
 'class']

In [79]:
# Collect the true label, the tweet and the predicted label(s) into pandas.
preds_df = (
    result
    .select('cyberbullying_type', 'tweet_text', "class.result")
    .toPandas()
)

22/11/03 10:45:51 WARN TaskSetManager: Stage 75 contains a task of very large size (1123 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [80]:
def _first_label(labels):
    """Return the first predicted label from a `class.result` cell.

    A value that is already a plain string (or None) is returned unchanged, so
    re-running this cell does not truncate labels to their first character. An
    empty prediction sequence yields None instead of raising IndexError.
    """
    if labels is None or isinstance(labels, str):
        return labels
    return labels[0] if len(labels) else None


# `class.result` arrives as a sequence of labels per row; keep the first one.
preds_df['result'] = preds_df['result'].apply(_first_label)

In [81]:
from sklearn.metrics import classification_report, accuracy_score
# Overall accuracy of the predicted labels against the true labels.
print(accuracy_score(y_true=preds_df['cyberbullying_type'], y_pred=preds_df['result']))

0.8409738977846019


In [82]:
# from sklearn.metrics import classification_report, accuracy_score
# print (accuracy_score(preds_df['cyberbullying_type'], preds_df['result']))

In [98]:
# Load the tweets to classify.
df = pd.read_csv('../input/testdf/tweet.csv')
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Stop-word set used by `clean_new`: NLTK's English list with 'not' taken out
# (presumably so negation is kept — confirm), plus extra tokens to drop.
sw = set(stopwords.words('english'))
sw.remove('not')
sw |= {
    'rt', 'mkr', 'didn', 'bc', 'n', 'm',
    'im', 'll', 'y', 've', 'u', 'ur', 'don',
    'p', 't', 's', 'aren', 'kp', 'o', 'kat',
    'de', 're', 'amp', 'will', 'wa', 'e', 'like',
}
def clean_new(text):
    """Normalise a tweet for inference.

    Steps, in order:
      1. Remove hashtags, @mentions (together with a preceding "RT"),
         http(s)/www URLs, tokens of the form ``something.lowercase`` and
         runs of non-ASCII characters.
      2. Lower-case the text.
      3. Remove every character in ``string.punctuation``.
      4. Drop words found in the module-level stop-word set ``sw`` and
         re-join the rest with single spaces.

    Args:
        text: Raw tweet text (a ``str``).

    Returns:
        The cleaned tweet as a ``str``.
    """
    # The retweet prefix is matched together with its mention. A separate
    # "RT @" alternative would consume the "@" and leave the username behind
    # ("RT @user hi" -> "user hi"). `\b` keeps words that merely end in "RT"
    # from being clipped.
    pattern = re.compile(
        r"(#[A-Za-z0-9]+|(?:\bRT\s+)?@\w+|https?://\S+|www\.\S+|\S+\.[a-z]+|[^\x00-\x7F]+)"
    )
    text = pattern.sub('', text)
    text = text.lower()
    remove_punc = re.compile(r"[%s]" % re.escape(string.punctuation))
    text = remove_punc.sub('', text)
    text = ' '.join(word for word in text.split() if word not in sw)
    return text
# Clean the new tweets, move them to Spark, and classify them with the fitted
# pipeline.
# NOTE(review): the training tweets were cleaned with `clean`, while these are
# cleaned with `clean_new` (lower-casing, stop-word and mention/URL removal),
# so the model sees differently preprocessed text at inference — confirm this
# mismatch is intended.
df['tweet_text'] = df['tweet_text'].apply(clean_new)
new_data = spark.createDataFrame(df)
predictions = pipelineModel.transform(new_data)
predictions.select('tweet_text', "class.result").toPandas()

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,tweet_text,result
0,sponsored troll,[other_cyberbullying]
1,height corruption mr,[other_cyberbullying]
2,wish great birthday dear mayank,[not_cyberbullying]
3,party bjp great messagelearning education grea...,[religion]
4,narendra modi stroked azzyland,[not_cyberbullying]
5,working rejected payment 21 22 also need 2021 ...,[not_cyberbullying]
6,compensation paid recovered bridge contractors,[not_cyberbullying]
7,get bothered facts modinomics modi really emba...,[not_cyberbullying]
8,tanks behind fcuk liar visible foam yamuna par...,[not_cyberbullying]
9,watch rishi sunak becomes first person indian ...,[not_cyberbullying]


In [97]:
# Classify one hand-written sentence: clean it, wrap it in a one-row frame
# with the column name the pipeline expects, and run the fitted pipeline.
sent = clean_new('height of corruption mr narendramodi')
new_data = spark.createDataFrame(pd.DataFrame({'tweet_text': [sent]}))
predictions = pipelineModel.transform(new_data)
predictions.select('tweet_text', "class.result").toPandas()

Unnamed: 0,tweet_text,result
0,height corruption mr narendramodi,[other_cyberbullying]


In [85]:
# df = pd.read_csv('../input/testdf/tweet.csv')
# df['tweet_text'] = df['tweet_text'].apply(lambda text: clean(text))
# new_data = spark.createDataFrame(df)
# predictions = pipelineModel.transform(new_data)
# predictions.select('tweet_text',"class.result").toPandas()