# Prediction pipeline

In [208]:
import numpy as np
import pandas as pd
import re
import string
import random
import time
import gc
import pickle

import nltk
nltk.download('stopwords')
import emoji

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn.functional as F
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup, AutoModel, AutoTokenizer, BertTokenizerFast
import pytorch_lightning as pl

from sklearn.metrics import accuracy_score, roc_curve, auc
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\makch\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Data cleaning

In [209]:
# Arabic stop words from NLTK, held in a set for fast membership tests.
arabic_stopwords = {*nltk.corpus.stopwords.words("arabic")}

# Matches a single Arabic diacritic or the tatweel (kashida) elongation mark.
# Code points are written as escapes so the pattern survives editors and
# encodings that mangle bare combining characters.
arabic_diacritics = re.compile(
    "["
    "\u0651"  # Tashdid (shadda)
    "\u064E"  # Fatha
    "\u064B"  # Tanwin Fath
    "\u064F"  # Damma
    "\u064C"  # Tanwin Damm
    "\u0650"  # Kasra
    "\u064D"  # Tanwin Kasr
    "\u0652"  # Sukun
    "\u0640"  # Tatwil/Kashida
    "]"
)

# Characters treated as punctuation by cleaning_content: Arabic and
# typographic marks first, followed by the ASCII set from the stdlib.
arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
english_punctuations = string.punctuation
punctuations = "".join((arabic_punctuations, english_punctuations))


def remove_urls(text):
    """Delete URL-like substrings from `text`.

    A match is an optional http/https scheme, the literal '://', and any run
    of word characters or . / ? = & % that ends on a word boundary.

    @param    text (str): input text
    @return   text with every match removed
    """
    url_pattern = re.compile(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', flags=re.MULTILINE)
    return url_pattern.sub('', text)


def remove_emails(text):
    """Delete e-mail addresses that occupy an entire line of `text`.

    The pattern is anchored with ^ and $ in MULTILINE mode, so an address
    surrounded by other words on the same line is left in place.

    @param    text (str): input text
    @return   text with whole-line addresses removed
    """
    email_pattern = re.compile(
        r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)",
        flags=re.MULTILINE,
    )
    return email_pattern.sub("", text)

# def remove_emoji(text):
#     return emoji.get_emoji_regexp().sub(u'', text)

def remove_emoji(data):
    """Strip emoji and related symbol characters from `data`.

    Every run of characters falling in one of the listed ranges is removed.
    Several ranges overlap, and the wide ones cover far more than emoji.

    @param    data (str): input text
    @return   text without the matched characters
    """
    symbol_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
        "\U00002500-\U00002BEF",
        "\U00002702-\U000027B0",
        "\U00002702-\U000027B0",
        "\U000024C2-\U0001F251",
        "\U0001f926-\U0001f937",
        "\U00010000-\U0010ffff",
        "\u2640-\u2642",
        "\u2600-\u2B55",
        "\u200d",
        "\u23cf",
        "\u23e9",
        "\u231a",
        "\ufe0f",
        "\u3030",
    )
    emoji_pattern = re.compile("[" + "".join(symbol_ranges) + "]+", re.UNICODE)
    return emoji_pattern.sub('', data)

def normalization(text):
    """Fold Arabic letter variants onto one canonical letter each.

    The substitutions are applied in the listed order; none of them produces
    a character that a later substitution rewrites.

    @param    text (str): input text
    @return   text with the variants replaced
    """
    substitutions = (
        ("[إأآا]", "ا"),
        ("ى", "ي"),
        ("ؤ", "ء"),
        ("ئ", "ء"),
        ("ة", "ه"),
        ("گ", "ك"),
    )
    for variant_pattern, canonical in substitutions:
        text = re.sub(variant_pattern, canonical, text)
    return text

def remove_diacritics(text):
    """Return `text` with every match of the `arabic_diacritics` pattern removed."""
    return arabic_diacritics.sub('', text)

def remove_stopwords(text):
    """Drop the whitespace-separated tokens of `text` found in `arabic_stopwords`.

    @param    text (str): input text
    @return   the remaining tokens joined by single spaces
    """
    kept_tokens = []
    for token in text.split():
        if token in arabic_stopwords:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

def cleaning_content(line):
    """Clean one raw text sample before it is tokenised for the classifiers.

    Steps, in order: newlines become spaces; e-mail addresses, URLs and emoji
    are removed; every whitespace-separated token containing '@' is dropped;
    the markers 'RT', '<LF>', '<br />', '&quot;' and '<url>' are deleted;
    punctuation is removed; Arabic stop words are removed; letters are
    normalised and diacritics stripped.

    @param    line: raw text
    @return   the cleaned string, or None when `line` is not a string
              (for example a NaN cell of a DataFrame)
    """
    if not isinstance(line, str):
        return None

    # str.replace returns a new string; the result has to be kept.
    line = line.replace('\n', ' ')
    line = remove_emails(line)
    line = remove_urls(line)
    line = remove_emoji(line)

    # Tokens containing '@' are swapped for a placeholder first and the
    # placeholder is deleted together with the other markers just below.
    line = ' '.join('USERID' if '@' in token else token for token in line.split())

    # NOTE(review): these are plain substring deletions, so 'RT' is also
    # removed from inside longer Latin-script words - confirm this is intended.
    for marker in ('RT', '<LF>', '<br />', '&quot;', '<url>', 'USERID'):
        line = line.replace(marker, '')

    # Each punctuation mark becomes a space, so the words on either side stay
    # separate; remove_stopwords re-joins the tokens with single spaces.
    line = line.translate(str.maketrans({mark: ' ' for mark in punctuations}))

    line = remove_stopwords(line)
    line = remove_diacritics(normalization(line))

    return line.strip()

def hasDigits(s):
    """Tell whether `s` contains an ASCII digit (0-9) or an Arabic-Indic digit (U+0660-U+0669).

    @param    s (str): text to inspect
    @return   True if at least one such digit is present, else False
    """
    for char in s:
        code_point = ord(char)
        if code_point in range(48, 58) or code_point in range(1632, 1642):
            return True
    return False

In [210]:
import torch

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# torch.cuda.get_device_name only accepts CUDA devices, so it is guarded to
# keep this cell runnable on CPU-only machines.
if device.type == "cuda":
    print(torch.cuda.get_device_name(device))
else:
    print("CPU (no CUDA device available)")

NVIDIA GeForce GTX 1650


# Offensive detection model

In [211]:
# Tokenizer of the same AraBERT checkpoint that BertClassifier loads.
tokenizer = BertTokenizer.from_pretrained(
    'aubmindlab/bert-base-arabertv02',
    do_lower_case=True,
)

In [212]:
class BertClassifier(nn.Module):
    """AraBERT encoder followed by a small feed-forward classification head."""

    def __init__(self, freeze_bert=False):
        """
        @param    freeze_bert (bool): when True, gradients are disabled for all
                      encoder parameters; leave False to fine-tune the encoder
        """
        super().__init__()
        bert_hidden_size = 768  # size of the encoder output vectors
        head_hidden_size = 50   # width of the head's hidden layer
        num_labels = 2          # number of output scores

        # Pre-trained encoder
        self.bert = BertModel.from_pretrained('aubmindlab/bert-base-arabertv02')

        # Head: linear -> ReLU -> dropout -> linear
        self.classifier = nn.Sequential(
            nn.Linear(bert_hidden_size, head_hidden_size),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(head_hidden_size, num_labels)
        )

        if freeze_bert:
            for encoder_weight in self.bert.parameters():
                encoder_weight.requires_grad = False

    def forward(self, input_ids, attention_mask):
        """
        Run the encoder and the head on one batch.
        @param    input_ids (torch.Tensor): token ids, shape (batch_size,
                        max_length)
        @param    attention_mask (torch.Tensor): attention mask, shape
                        (batch_size, max_length)
        @return   logits (torch.Tensor): shape (batch_size, num_labels)
        """
        encoder_output = self.bert(input_ids=input_ids,
                                   attention_mask=attention_mask)

        # The vector at position 0 (the `[CLS]` token) of the last hidden
        # state stands for the whole sequence.
        cls_embedding = encoder_output[0][:, 0, :]
        return self.classifier(cls_embedding)

In [213]:
# Load the index -> label dictionary used to decode the offensive model output.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('../models/offensive_dict.pkl', 'rb') as handle:
    off_dictionary = pickle.load(handle)

# The model is only used for inference: eval() switches the dropout layers
# off so the same text always receives the same label.
off_model = BertClassifier().eval()
# map_location lets a checkpoint that was saved on a GPU load on a CPU-only machine.
off_model.load_state_dict(torch.load('../models/modelv3.pt', map_location='cpu'))

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv02 were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [214]:
# Placeholder value; it is replaced just below by the length read from disk.
MAX_LEN = 256

with open('../models/offensive_max_len.pkl', 'rb') as max_len_file:
    MAX_LEN = pickle.load(max_len_file)


def predict_off(review_text,model,device,tokenizer):
    """Classify one text with the offensive-language model.

    @param    review_text (str): text to classify
    @param    model (nn.Module): classifier called as model(input_ids, attention_mask)
    @param    device (torch.device): device on which inference runs
    @param    tokenizer: tokenizer that matches `model`
    @return   the `off_dictionary` entry for the highest-scoring output
    """
    model.to(device)
    # Dropout has to be off at inference time, otherwise the same text can
    # receive different labels from one call to the next.
    model.eval()

    encoded_review = tokenizer.encode_plus(
        review_text,
        max_length=MAX_LEN,
        truncation=True,  # without this flag max_length is not enforced
        add_special_tokens=True,
        return_token_type_ids=False,
        padding='longest',
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoded_review['input_ids'].to(device)
    attention_mask = encoded_review['attention_mask'].to(device)

    # Gradients are not needed for prediction; skipping them saves memory.
    with torch.no_grad():
        output = model(input_ids, attention_mask)

    # One text per call, so row 0 is the only row of the batch.
    index = int(torch.argmax(output, dim=1)[0])

    # decode the output of the model to get the predicted label
    return off_dictionary[index]

# Racism detection model

In [215]:
# Tokenizer for the racism classifier (medium-size asafaya Arabic BERT).
racism_tokenizer = AutoTokenizer.from_pretrained(
    "asafaya/bert-medium-arabic",
)

In [216]:
class Racism_Model(pl.LightningModule):
    """asafaya Arabic BERT encoder with one linear layer on its pooled output."""

    def __init__(self,model_type="Mini"):
        """
        @param    model_type (str): "Mini", "Medium", "Base" or "Large"; selects
                      the checkpoint and the matching hidden size
        """
        super().__init__()
        checkpoints = {
            "Mini": ("asafaya/bert-mini-arabic", 256),
            "Medium": ("asafaya/bert-medium-arabic", 512),
            "Base": ("asafaya/bert-base-arabic", 768),
            "Large": ("asafaya/bert-large-arabic", 1024),
        }
        checkpoint_name, hidden_size = checkpoints[model_type]
        self.bert_model = AutoModel.from_pretrained(checkpoint_name)
        # The layer always has 18 outputs.
        # NOTE(review): presumably fixed by the saved weights - confirm.
        self.fc = nn.Linear(hidden_size, 18)

    def forward(self,input_ids,attention_mask):
        """Return the output of the linear layer for one batch.

        @param    input_ids (torch.Tensor): token ids
        @param    attention_mask (torch.Tensor): attention mask for `input_ids`
        @return   torch.Tensor produced by `self.fc`
        """
        encoded = self.bert_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled = encoded[1]  # second element of the encoder output
        return self.fc(pooled)

In [217]:
# Load the model for inference: eval() switches dropout off so predictions
# are deterministic, and map_location lets a checkpoint saved on a GPU load
# on a CPU-only machine.
racism_model = Racism_Model('Medium').eval()
racism_model.load_state_dict(torch.load('../models/racism/racism_arabert.pt', map_location='cpu'))

# Index -> label dictionary.
# NOTE(review): index 0 is the negative class here, while it is the positive
# class in the other label dictionaries of this notebook - confirm against
# the training labels.
racism_dict = {0: 'Not_Racism', 1: 'Racism'}

Some weights of the model checkpoint at asafaya/bert-medium-arabic were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [218]:
# Maximum sequence length for the racism model, read from its model folder.
with open('../models/racism/racism_arabert_maxlen.pickle', 'rb') as max_len_file:
    racism_max_len = pickle.load(max_len_file)

def predict_racism(review_text,model,device,tokenizer):
    """Classify one text with the racism model.

    @param    review_text (str): text to classify
    @param    model (nn.Module): classifier called as model(input_ids, attention_mask)
    @param    device (torch.device): device on which inference runs
    @param    tokenizer: tokenizer that matches `model`
    @return   the `racism_dict` label with the highest score
    """
    model.to(device)
    # Dropout has to be off at inference time, otherwise the same text can
    # receive different labels from one call to the next.
    model.eval()

    encoded_review = tokenizer.encode_plus(
        review_text,
        max_length=racism_max_len,
        truncation=True,  # without this flag max_length is not enforced
        add_special_tokens=True,
        return_token_type_ids=False,
        padding='longest',
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoded_review['input_ids'].to(device)
    attention_mask = encoded_review['attention_mask'].to(device)

    # Gradients are not needed for prediction; skipping them saves memory.
    with torch.no_grad():
        output = model(input_ids, attention_mask)

    # The model may emit more scores than there are labels (Racism_Model has
    # 18 outputs). Only indices present in racism_dict can be decoded, so the
    # best score is picked among those instead of risking a KeyError.
    known_scores = output[0, :len(racism_dict)]
    index = int(torch.argmax(known_scores))

    # decode the output of the model to get the predicted label
    return racism_dict[index]

# Religion Hate detection model

In [219]:
# Tokenizer for the religion-hate classifier (medium-size asafaya Arabic BERT).
Religion_Hate_tokenizer = AutoTokenizer.from_pretrained(
    "asafaya/bert-medium-arabic",
)

In [220]:
class ReligionHateModel(pl.LightningModule):
    """asafaya Arabic BERT encoder with one linear layer on its pooled output."""

    def __init__(self,model_type="Mini"):
        """
        @param    model_type (str): "Mini", "Medium", "Base" or "Large"; selects
                      the checkpoint and the matching hidden size
        """
        super().__init__()
        checkpoints = {
            "Mini": ("asafaya/bert-mini-arabic", 256),
            "Medium": ("asafaya/bert-medium-arabic", 512),
            "Base": ("asafaya/bert-base-arabic", 768),
            "Large": ("asafaya/bert-large-arabic", 1024),
        }
        checkpoint_name, hidden_size = checkpoints[model_type]
        self.bert_model = AutoModel.from_pretrained(checkpoint_name)
        # The layer always has 18 outputs.
        # NOTE(review): presumably fixed by the saved weights - confirm.
        self.fc = nn.Linear(hidden_size, 18)

    def forward(self,input_ids,attention_mask):
        """Return the output of the linear layer for one batch.

        @param    input_ids (torch.Tensor): token ids
        @param    attention_mask (torch.Tensor): attention mask for `input_ids`
        @return   torch.Tensor produced by `self.fc`
        """
        encoded = self.bert_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled = encoded[1]  # second element of the encoder output
        return self.fc(pooled)

In [221]:
# Load the model for inference: eval() switches dropout off so predictions
# are deterministic, and map_location lets a checkpoint saved on a GPU load
# on a CPU-only machine.
Religion_Hate_model = ReligionHateModel('Medium').eval()
Religion_Hate_model.load_state_dict(torch.load('../models/religion_hate/religion_hate_params.pt', map_location='cpu'))

# Index -> label dictionary
Religion_Hate_dict = {0: 'Religion Hate', 1: 'Not Religion Hate'}

Some weights of the model checkpoint at asafaya/bert-medium-arabic were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [222]:
def Religion_Hate_predict(review_text,model,device,tokenizer):
    """Classify one text with the religion-hate model.

    @param    review_text (str): text to classify
    @param    model (nn.Module): classifier called as model(input_ids, attention_mask)
    @param    device (torch.device): device on which inference runs
    @param    tokenizer: tokenizer that matches `model`
    @return   the `Religion_Hate_dict` label with the highest score
    """
    model.to(device)
    # Dropout has to be off at inference time, otherwise the same text can
    # receive different labels from one call to the next.
    model.eval()

    encoded_review = tokenizer.encode_plus(
        review_text,
        max_length=60,
        truncation=True,  # without this flag max_length is not enforced
        add_special_tokens=True,
        return_token_type_ids=False,
        padding='longest',
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoded_review['input_ids'].to(device)
    attention_mask = encoded_review['attention_mask'].to(device)

    # Gradients are not needed for prediction; skipping them saves memory.
    with torch.no_grad():
        output = model(input_ids, attention_mask)

    # The model may emit more scores than there are labels (ReligionHateModel
    # has 18 outputs). Only indices present in Religion_Hate_dict can be
    # decoded, so the best score is picked among those instead of risking a
    # KeyError.
    known_scores = output[0, :len(Religion_Hate_dict)]
    index = int(torch.argmax(known_scores))

    # decode the output of the model to get the predicted label
    return Religion_Hate_dict[index]

# Verbal Abuse detection model

In [223]:
# Tokenizer for the verbal-abuse classifier (medium-size asafaya Arabic BERT).
verbal_abuse_tokenizer = AutoTokenizer.from_pretrained(
    "asafaya/bert-medium-arabic",
)

In [224]:
class verbalabuseModel(pl.LightningModule):
    """asafaya Arabic BERT encoder with one linear layer on its pooled output."""

    def __init__(self,model_type="Mini"):
        """
        @param    model_type (str): "Mini", "Medium", "Base" or "Large"; selects
                      the checkpoint and the matching hidden size
        """
        super().__init__()
        checkpoints = {
            "Mini": ("asafaya/bert-mini-arabic", 256),
            "Medium": ("asafaya/bert-medium-arabic", 512),
            "Base": ("asafaya/bert-base-arabic", 768),
            "Large": ("asafaya/bert-large-arabic", 1024),
        }
        checkpoint_name, hidden_size = checkpoints[model_type]
        self.bert_model = AutoModel.from_pretrained(checkpoint_name)
        # The layer always has 18 outputs.
        # NOTE(review): presumably fixed by the saved weights - confirm.
        self.fc = nn.Linear(hidden_size, 18)

    def forward(self,input_ids,attention_mask):
        """Return the output of the linear layer for one batch.

        @param    input_ids (torch.Tensor): token ids
        @param    attention_mask (torch.Tensor): attention mask for `input_ids`
        @return   torch.Tensor produced by `self.fc`
        """
        encoded = self.bert_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled = encoded[1]  # second element of the encoder output
        return self.fc(pooled)

In [225]:
# Load the model for inference: eval() switches dropout off so predictions
# are deterministic, and map_location lets a checkpoint saved on a GPU load
# on a CPU-only machine.
verbal_abuse_model = verbalabuseModel('Medium').eval()
verbal_abuse_model.load_state_dict(torch.load('../models/verbal_abuse/verbal_abuse_arabert.pt', map_location='cpu'))

# Index -> label dictionary
verbal_abuse_dict = {0: 'Verbal Abuse', 1: 'Not Verbal Abuse'}

Some weights of the model checkpoint at asafaya/bert-medium-arabic were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [226]:
def predict_verbal_abuse(review_text,model,device,tokenizer):
    """Classify one text with the verbal-abuse model.

    @param    review_text (str): text to classify
    @param    model (nn.Module): classifier called as model(input_ids, attention_mask)
    @param    device (torch.device): device on which inference runs
    @param    tokenizer: tokenizer that matches `model`
    @return   the `verbal_abuse_dict` label with the highest score
    """
    model.to(device)
    # Dropout has to be off at inference time, otherwise the same text can
    # receive different labels from one call to the next.
    model.eval()

    encoded_review = tokenizer.encode_plus(
        review_text,
        max_length=60,
        truncation=True,  # without this flag max_length is not enforced
        add_special_tokens=True,
        return_token_type_ids=False,
        padding='longest',
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoded_review['input_ids'].to(device)
    attention_mask = encoded_review['attention_mask'].to(device)

    # Gradients are not needed for prediction; skipping them saves memory.
    with torch.no_grad():
        output = model(input_ids, attention_mask)

    # The model may emit more scores than there are labels (verbalabuseModel
    # has 18 outputs). Only indices present in verbal_abuse_dict can be
    # decoded, so the best score is picked among those instead of risking a
    # KeyError.
    known_scores = output[0, :len(verbal_abuse_dict)]
    index = int(torch.argmax(known_scores))

    # decode the output of the model to get the predicted label
    return verbal_abuse_dict[index]

# Misogyny detection model

In [227]:
# Tokenizer for the misogyny classifier (medium-size asafaya Arabic BERT).
misogyny_tokenizer = AutoTokenizer.from_pretrained(
    "asafaya/bert-medium-arabic",
)

In [228]:
class ArabicBertModel(pl.LightningModule):
    """asafaya Arabic BERT encoder with one linear layer on its pooled output."""

    def __init__(self,model_type="Mini"):
        """
        @param    model_type (str): "Mini", "Medium", "Base" or "Large"; selects
                      the checkpoint and the matching hidden size
        """
        super().__init__()
        checkpoints = {
            "Mini": ("asafaya/bert-mini-arabic", 256),
            "Medium": ("asafaya/bert-medium-arabic", 512),
            "Base": ("asafaya/bert-base-arabic", 768),
            "Large": ("asafaya/bert-large-arabic", 1024),
        }
        checkpoint_name, hidden_size = checkpoints[model_type]
        self.bert_model = AutoModel.from_pretrained(checkpoint_name)
        # The layer always has 18 outputs.
        # NOTE(review): presumably fixed by the saved weights - confirm.
        self.fc = nn.Linear(hidden_size, 18)

    def forward(self,input_ids,attention_mask):
        """Return the output of the linear layer for one batch.

        @param    input_ids (torch.Tensor): token ids
        @param    attention_mask (torch.Tensor): attention mask for `input_ids`
        @return   torch.Tensor produced by `self.fc`
        """
        encoded = self.bert_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled = encoded[1]  # second element of the encoder output
        return self.fc(pooled)


In [229]:
# Index -> label dictionary used to decode the misogyny model output.
misogyny_dict = {
    0: 'misogyny',
    1: 'non_misogyny',
}

In [230]:
# Load the model for inference: eval() switches dropout off so predictions
# are deterministic, and map_location lets a checkpoint saved on a GPU load
# on a CPU-only machine.
misogyny_model = ArabicBertModel('Medium').eval()
misogyny_model.load_state_dict(torch.load('../models/misogyny/misogyny.pt', map_location='cpu'))

Some weights of the model checkpoint at asafaya/bert-medium-arabic were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [231]:
def predict_misogyny(review_text,model,device,tokenizer):
    """Classify one text with the misogyny model.

    @param    review_text (str): text to classify
    @param    model (nn.Module): classifier called as model(input_ids, attention_mask)
    @param    device (torch.device): device on which inference runs
    @param    tokenizer: tokenizer that matches `model`
    @return   the `misogyny_dict` label with the highest score
    """
    model.to(device)
    # Dropout has to be off at inference time, otherwise the same text can
    # receive different labels from one call to the next.
    model.eval()

    encoded_review = tokenizer.encode_plus(
        review_text,
        max_length=60,
        truncation=True,  # without this flag max_length is not enforced
        add_special_tokens=True,
        return_token_type_ids=False,
        padding='longest',
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoded_review['input_ids'].to(device)
    attention_mask = encoded_review['attention_mask'].to(device)

    # Gradients are not needed for prediction; skipping them saves memory.
    with torch.no_grad():
        output = model(input_ids, attention_mask)

    # The model may emit more scores than there are labels (ArabicBertModel
    # has 18 outputs). Only indices present in misogyny_dict can be decoded,
    # so the best score is picked among those instead of risking a KeyError.
    known_scores = output[0, :len(misogyny_dict)]
    index = int(torch.argmax(known_scores))

    # decode the output of the model to get the predicted label
    return misogyny_dict[index]

# Dialect detection model

In [232]:
BERT_MODEL_NAME = 'alger-ia/dziribert'
class TweetModule(pl.LightningModule):
    """BERT encoder (`BERT_MODEL_NAME`) with a linear classifier on its pooled output."""

    def __init__(self, n_classes):
        """
        @param    n_classes (int): number of output scores of the classifier
        """
        super().__init__()
        self.bert = BertModel.from_pretrained(BERT_MODEL_NAME)
        self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Score one batch, optionally computing the loss as well.

        @param    input_ids (torch.Tensor): token ids
        @param    attention_mask (torch.Tensor): attention mask for `input_ids`
        @param    labels (torch.Tensor, optional): target classes
        @return   the classifier output when `labels` is None, otherwise the
                  tuple (loss, classifier output)
        """
        encoded = self.bert(input_ids, attention_mask)
        scores = self.classifier(encoded.pooler_output)
        if labels is None:
            return scores
        # Labels were given: return the cross-entropy loss with the scores.
        return self.criterion(scores, labels), scores

In [233]:
tokenizer_dialect = BertTokenizerFast.from_pretrained(BERT_MODEL_NAME)

# Index -> label dictionary used to decode the dialect model output.
dialect_dict = {0: 'lebanon', 1: 'egypt', 2: 'morocco', 3: 'tunisia', 4: 'algeria', 5: 'qatar', 6: 'iraq', 7: 'saudi arabia', 8: 'libya', 9: 'jordan'}

# One output per label. The model is only used for inference: eval() switches
# dropout off so predictions are deterministic, and map_location lets a
# checkpoint saved on a GPU load on a CPU-only machine.
dialect_model = TweetModule(len(dialect_dict)).eval()
dialect_model.load_state_dict(torch.load('../models/dialect_classifier.pt', map_location='cpu'))

Some weights of the model checkpoint at alger-ia/dziribert were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at alger-ia/dziribert and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You s

<All keys matched successfully>

In [234]:
def predict_dialect(review_text,model,device,tokenizer):
    """Classify the dialect of one text.

    @param    review_text (str): text to classify
    @param    model (nn.Module): classifier called as model(input_ids, attention_mask)
    @param    device (torch.device): device on which inference runs
    @param    tokenizer: tokenizer that matches `model`
    @return   the `dialect_dict` entry for the highest-scoring output
    """
    model.to(device)
    # Dropout has to be off at inference time, otherwise the same text can
    # receive different labels from one call to the next.
    model.eval()

    encoded_review = tokenizer.encode_plus(
        review_text,
        max_length=123,
        truncation=True,  # without this flag max_length is not enforced
        add_special_tokens=True,
        return_token_type_ids=False,
        padding='longest',
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoded_review['input_ids'].to(device)
    attention_mask = encoded_review['attention_mask'].to(device)

    # Gradients are not needed for prediction; skipping them saves memory.
    with torch.no_grad():
        output = model(input_ids, attention_mask)

    # One text per call, so row 0 is the only row of the batch.
    index = int(torch.argmax(output, dim=1)[0])
    return dialect_dict[index]

# Main Prediction

In [235]:
# Main prediction function

def predict(text):
    """Run the whole pipeline on one raw text and return its labels.

    The text is cleaned and scored by the offensive-language, misogyny and
    dialect models. The racism, verbal-abuse and religion-hate models are
    only consulted when the text is labelled 'offensive'; otherwise those
    three fields receive their negative label.

    @param    text (str): raw input text
    @return   dict with the keys "Offensiveness", "Dialect", "Misogyny",
              "Racism", "Verbal Abuse" and "Religion Hate"
    @raise    TypeError: if `text` is not a string (e.g. None or NaN)
    """
    # Fail early with a clear message instead of an opaque error from the
    # cleaning code or the tokenizer.
    if not isinstance(text, str):
        raise TypeError(f"predict() expects a string, got {type(text).__name__}")

    # clean text
    text = cleaning_content(text)

    # These three models run for every text.
    off_pred = predict_off(text,off_model,device,tokenizer)
    misog_pred = predict_misogyny(text,misogyny_model,device,misogyny_tokenizer)
    dialect_pred = predict_dialect(text,dialect_model,device,tokenizer_dialect)

    if off_pred == 'offensive':
        # The finer-grained models are only worth running on offensive text.
        rac_pred = predict_racism(text,racism_model,device,racism_tokenizer)
        ver_pred = predict_verbal_abuse(text,verbal_abuse_model,device,verbal_abuse_tokenizer)
        Religion_Hate_pred = Religion_Hate_predict(text,Religion_Hate_model,device,Religion_Hate_tokenizer)
    else:
        rac_pred = "Not_Racism"
        ver_pred = "Not Verbal Abuse"
        Religion_Hate_pred = "Not Religion Hate"

    return {"Offensiveness": off_pred, "Dialect": dialect_pred, "Misogyny": misog_pred, "Racism": rac_pred, "Verbal Abuse": ver_pred, "Religion Hate": Religion_Hate_pred}

In [236]:
# Smoke test: run the full pipeline on one sentence; the cell output is the
# label dict returned by predict().
text = 'كل العرب يحبون السعودية'

predict(text)


{'Offensiveness': 'non_offensive',
 'Dialect': 'tunisia',
 'Misogyny': 'non_misogyny',
 'Racism': 'Not_Racism',
 'Verbal Abuse': 'Not Verbal Abuse',
 'Religion Hate': 'Not Religion Hate'}

In [237]:
# Example input: run the full pipeline and display the returned label dict.
text = 'اذهب لبلدك يا اسود أنت و شعبك المعفن'

predict(text)

{'Offensiveness': 'offensive',
 'Dialect': 'algeria',
 'Misogyny': 'misogyny',
 'Racism': 'Racism',
 'Verbal Abuse': 'Verbal Abuse',
 'Religion Hate': 'Not Religion Hate'}

In [238]:
# Example input for the full pipeline.
text = 'هنالك قط اسود في المنزل'

# Run predict() and display the returned label dict.
predict(text)

{'Offensiveness': 'non_offensive',
 'Dialect': 'tunisia',
 'Misogyny': 'non_misogyny',
 'Racism': 'Not_Racism',
 'Verbal Abuse': 'Not Verbal Abuse',
 'Religion Hate': 'Not Religion Hate'}

In [239]:
# Example input: run the full pipeline and display the returned label dict.
text = 'موتي يا حمارة'

predict(text)

{'Offensiveness': 'offensive',
 'Dialect': 'tunisia',
 'Misogyny': 'misogyny',
 'Racism': 'Not_Racism',
 'Verbal Abuse': 'Verbal Abuse',
 'Religion Hate': 'Not Religion Hate'}

In [240]:
# Example input: run the full pipeline and display the returned label dict.
text = 'نيك امك يا ساقطة'

predict(text)

{'Offensiveness': 'offensive',
 'Dialect': 'egypt',
 'Misogyny': 'misogyny',
 'Racism': 'Not_Racism',
 'Verbal Abuse': 'Verbal Abuse',
 'Religion Hate': 'Not Religion Hate'}

In [241]:
# Example input: run the full pipeline and display the returned label dict.
text = 'انتي مستواك لا يسمح لك ان تكوني في هادا المكان'

predict(text)

{'Offensiveness': 'non_offensive',
 'Dialect': 'algeria',
 'Misogyny': 'non_misogyny',
 'Racism': 'Not_Racism',
 'Verbal Abuse': 'Not Verbal Abuse',
 'Religion Hate': 'Not Religion Hate'}

In [242]:
# Example input: run the full pipeline and display the returned label dict.
text = 'اذا عربت خربت اينما حل الاعراب حل الحراب'

predict(text)

{'Offensiveness': 'offensive',
 'Dialect': 'tunisia',
 'Misogyny': 'misogyny',
 'Racism': 'Racism',
 'Verbal Abuse': 'Verbal Abuse',
 'Religion Hate': 'Religion Hate'}

In [243]:
# Example input: run the full pipeline and display the returned label dict.
text = 'السعوديه هي اطهر ارض وشعب واحقر شعب'

predict(text)

{'Offensiveness': 'offensive',
 'Dialect': 'iraq',
 'Misogyny': 'non_misogyny',
 'Racism': 'Racism',
 'Verbal Abuse': 'Verbal Abuse',
 'Religion Hate': 'Not Religion Hate'}

In [244]:
# Example input: run the full pipeline and display the returned label dict.
text = 'سوريا اجمل بلد في العالم'

predict(text)

{'Offensiveness': 'non_offensive',
 'Dialect': 'iraq',
 'Misogyny': 'non_misogyny',
 'Racism': 'Not_Racism',
 'Verbal Abuse': 'Not Verbal Abuse',
 'Religion Hate': 'Not Religion Hate'}

In [245]:
# Example input: run the full pipeline and display the returned label dict.
text = 'ان الله غفور رحيم كلنا عندنا عيوبنا داك الشماته اللي صور الفيديو سير الله يفضحك دنيا واخيره'

predict(text)

{'Offensiveness': 'offensive',
 'Dialect': 'morocco',
 'Misogyny': 'non_misogyny',
 'Racism': 'Not_Racism',
 'Verbal Abuse': 'Verbal Abuse',
 'Religion Hate': 'Religion Hate'}

In [246]:
# Example input: run the full pipeline and display the returned label dict.
text = 'ام نص لسان ابكي بترتاحي'

predict(text)

{'Offensiveness': 'offensive',
 'Dialect': 'saudi arabia',
 'Misogyny': 'misogyny',
 'Racism': 'Not_Racism',
 'Verbal Abuse': 'Not Verbal Abuse',
 'Religion Hate': 'Religion Hate'}

In [247]:
# Example input: run the full pipeline and display the returned label dict.
text = 'خسات نجس المملكه العربيه السعوديه الاسلاميه قويه بتوحيدها وقادهعلي ردع تسول نفسه اللعب امنها والسعوديه ايران الرخوه لتطلب مسانده احد'

predict(text)

{'Offensiveness': 'offensive',
 'Dialect': 'iraq',
 'Misogyny': 'non_misogyny',
 'Racism': 'Not_Racism',
 'Verbal Abuse': 'Not Verbal Abuse',
 'Religion Hate': 'Not Religion Hate'}

In [248]:
# Example input: run the full pipeline and display the returned label dict.
text = 'اساسا انتم تدرون ان سلاطين اراضيكم الجاهليه ماضغطونا اظن والله اعلم ان كلمه ساضغط مشتقه ضغط الجزيره مرتفع وظهرت بسببه الكلمه'

predict(text)

{'Offensiveness': 'offensive',
 'Dialect': 'egypt',
 'Misogyny': 'non_misogyny',
 'Racism': 'Racism',
 'Verbal Abuse': 'Not Verbal Abuse',
 'Religion Hate': 'Not Religion Hate'}

# Visualization

In [249]:
# Read the test set and replace its 'Text' column with the cleaned text.
test_data = pd.read_csv('../DataSet/test-data.csv').assign(
    Text=lambda frame: frame['Text'].apply(cleaning_content)
)

In [250]:
# Preview the first rows after cleaning.
test_data.head()

Unnamed: 0,Text,Label
0,بالله ريحي جمالك وخطي الخوض الشرع جو يهله واله...,
1,تريدين اخذ حقوق المراه خوذيهم بطريقه الصح مش ب...,
2,ناقصه عقل ودين صاحبه المنشور,
3,اي عمل ممكن للمراه اتقانه باكثر كفاءه دقه جوده...,
4,الشي الوحيد تمهر المرٱه الرجل العمل الكوجينه ت...,


In [251]:
# Predict every row of the test set.
# The label dicts are collected in a list and the frame is built once;
# concatenating inside the loop copies the whole frame on every iteration.
# NOTE(review): 'Text' was already cleaned when the data was loaded and
# predict() cleans it a second time - confirm the second pass is intended.
prediction_rows = [predict(sample_text) for sample_text in test_data['Text']]

df = pd.DataFrame(
    prediction_rows,
    columns=['Offensiveness', 'Dialect', 'Misogyny', 'Racism', 'Verbal Abuse', 'Religion Hate'],
)


df.head()


Unnamed: 0,Offensiveness,Dialect,Misogyny,Racism,Verbal Abuse,Religion Hate
0,non_offensive,tunisia,non_misogyny,Not_Racism,Not Verbal Abuse,Not Religion Hate
1,offensive,tunisia,misogyny,Not_Racism,Not Verbal Abuse,Not Religion Hate
2,offensive,tunisia,misogyny,Not_Racism,Verbal Abuse,Not Religion Hate
3,non_offensive,saudi arabia,non_misogyny,Not_Racism,Not Verbal Abuse,Not Religion Hate
4,non_offensive,iraq,non_misogyny,Not_Racism,Not Verbal Abuse,Not Religion Hate


In [252]:
# Put the prediction columns next to the test rows; axis=1 aligns the two
# frames on their index.
# NOTE: this rebinds `test_data`, so running the cell a second time appends
# the same columns again.
test_data = pd.concat([test_data, df], axis=1)

test_data.head()

Unnamed: 0,Text,Label,Offensiveness,Dialect,Misogyny,Racism,Verbal Abuse,Religion Hate
0,بالله ريحي جمالك وخطي الخوض الشرع جو يهله واله...,,non_offensive,tunisia,non_misogyny,Not_Racism,Not Verbal Abuse,Not Religion Hate
1,تريدين اخذ حقوق المراه خوذيهم بطريقه الصح مش ب...,,offensive,tunisia,misogyny,Not_Racism,Not Verbal Abuse,Not Religion Hate
2,ناقصه عقل ودين صاحبه المنشور,,offensive,tunisia,misogyny,Not_Racism,Verbal Abuse,Not Religion Hate
3,اي عمل ممكن للمراه اتقانه باكثر كفاءه دقه جوده...,,non_offensive,saudi arabia,non_misogyny,Not_Racism,Not Verbal Abuse,Not Religion Hate
4,الشي الوحيد تمهر المرٱه الرجل العمل الكوجينه ت...,,non_offensive,iraq,non_misogyny,Not_Racism,Not Verbal Abuse,Not Religion Hate


In [253]:
from geopy.geocoders import Nominatim
import folium

geolocator = Nominatim(user_agent="NLP")

def geolocate(country):
    """Look a country name up with Nominatim and return its coordinates.

    @param    country (str): name to geocode
    @return   (latitude, longitude) tuple, or np.nan when nothing was found
              or the lookup failed
    """
    try:
        loc = geolocator.geocode(country)
    except Exception:
        # Best effort: a failed lookup must not stop the notebook. Catching
        # Exception rather than using a bare `except:` still lets
        # KeyboardInterrupt and SystemExit through.
        return np.nan

    # geocode() returns None when the query has no result.
    if loc is None:
        return np.nan
    return (loc.latitude, loc.longitude)

In [254]:
# Get the location of each country.
# Every lookup is a request to the geocoding service, so each distinct
# dialect is geocoded once and the result is reused for all of its rows.
dialect_locations = {name: geolocate(name) for name in test_data['Dialect'].dropna().unique()}
test_data['Location'] = test_data['Dialect'].apply(lambda name: dialect_locations.get(name, np.nan))

In [255]:
# Preview the first rows, now including the 'Location' column.
test_data.head()

Unnamed: 0,Text,Label,Offensiveness,Dialect,Misogyny,Racism,Verbal Abuse,Religion Hate,Location
0,بالله ريحي جمالك وخطي الخوض الشرع جو يهله واله...,,non_offensive,tunisia,non_misogyny,Not_Racism,Not Verbal Abuse,Not Religion Hate,"(33.8439408, 9.400138)"
1,تريدين اخذ حقوق المراه خوذيهم بطريقه الصح مش ب...,,offensive,tunisia,misogyny,Not_Racism,Not Verbal Abuse,Not Religion Hate,"(33.8439408, 9.400138)"
2,ناقصه عقل ودين صاحبه المنشور,,offensive,tunisia,misogyny,Not_Racism,Verbal Abuse,Not Religion Hate,"(33.8439408, 9.400138)"
3,اي عمل ممكن للمراه اتقانه باكثر كفاءه دقه جوده...,,non_offensive,saudi arabia,non_misogyny,Not_Racism,Not Verbal Abuse,Not Religion Hate,"(25.6242618, 42.3528328)"
4,الشي الوحيد تمهر المرٱه الرجل العمل الكوجينه ت...,,non_offensive,iraq,non_misogyny,Not_Racism,Not Verbal Abuse,Not Religion Hate,"(33.0955793, 44.1749775)"


In [256]:
def _count_by_dialect(column, positive_label):
    """Count, per dialect, the rows of `test_data` whose `column` equals `positive_label`.

    @param    column (str): name of a prediction column of `test_data`
    @param    positive_label (str): value of `column` to count
    @return   Series indexed by dialect (0 where a dialect has no such row),
              or the scalar 0 when `positive_label` never occurs in `column`
    """
    counts = test_data.groupby('Dialect')[column].value_counts().unstack()
    if positive_label not in counts.columns:
        return 0
    return counts.fillna(0)[positive_label]


# Group by dialect and count the number of tweets flagged in each category
dialect_off = _count_by_dialect('Offensiveness', 'offensive')

misogyny_off = _count_by_dialect('Misogyny', 'misogyny')

racism_off = _count_by_dialect('Racism', 'Racism')

verbal_off = _count_by_dialect('Verbal Abuse', 'Verbal Abuse')

religion_off = _count_by_dialect('Religion Hate', 'Religion Hate')

# number of tweets for each dialect
dialect_sym = test_data.groupby('Dialect')['Text'].count()

# create a dataframe
dialect_df = pd.DataFrame({'Total text' : dialect_sym,'Offensiveness':dialect_off, 'Racism':racism_off, 'Verbal Abuse':verbal_off, 'Religion Hate':religion_off, 'Misogyny':misogyny_off, 'Location':test_data.groupby('Dialect')['Location'].first()})

dialect_df

Unnamed: 0_level_0,Total text,Offensiveness,Racism,Verbal Abuse,Religion Hate,Misogyny,Location
Dialect,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
algeria,2,0.0,0.0,0.0,0,1.0,"(28.0000272, 2.9999825)"
egypt,4,3.0,0.0,2.0,0,2.0,"(26.2540493, 29.2675469)"
iraq,2,0.0,0.0,0.0,0,0.0,"(33.0955793, 44.1749775)"
libya,16,5.0,2.0,3.0,0,6.0,"(26.8234472, 18.1236723)"
morocco,1,1.0,0.0,0.0,0,0.0,"(31.1728205, -7.3362482)"
saudi arabia,3,0.0,0.0,0.0,0,0.0,"(25.6242618, 42.3528328)"
tunisia,9,3.0,0.0,1.0,0,5.0,"(33.8439408, 9.400138)"


In [257]:
# Number of texts in the first row. The index holds dialect names, so the
# positional lookup is made explicit with .iloc; a plain [0] on a
# label-indexed Series is deprecated.
dialect_df['Total text'].iloc[0]

2

In [258]:
# Create map zoomed in on northern Africa and the Middle East
m = folium.Map(location=[27, 48], zoom_start=4, tiles='OpenStreetMap')

# Add one marker per dialect; its popup lists the per-category tweet counts.
# Iterating over the rows themselves covers every dialect, including the
# last one, and does not rely on positional `[]` indexing.
for _, stats in dialect_df.iterrows():
    location = stats['Location']
    # geolocate() yields NaN instead of a (lat, lon) tuple when a lookup
    # fails; such a row cannot be placed on the map.
    if not isinstance(location, tuple):
        continue

    popup_html = (
        f'<b> Number of tweets: </b> {stats["Total text"]},  <br> '
        f'<b>The number of offensive tweets:</b> {stats["Offensiveness"]}, <br> '
        f'<b>The number of racism tweets:</b> {stats["Racism"]}, <br> '
        f'<b>The number of verbal abuse tweets:</b> {stats["Verbal Abuse"]}, <br> '
        f'<b>The number of religion hate tweets:</b> {stats["Religion Hate"]}, <br> '
        f'<b> The number of misogyny tweets:</b> {stats["Misogyny"]}'
    )
    Iframe = folium.IFrame(html=popup_html, width=300, height=150)
    popup = folium.Popup(Iframe, max_width=300)

    # add a Marker with a popup object
    folium.Marker(location= location, popup= popup).add_to(m)


# show the map
m