# Visualisation using map

## Visualise each text's hate-speech label on a map, placed at the country of the text's predicted dialect

In [1]:
import torch
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# torch.cuda.get_device_name() only accepts CUDA devices, so query it on GPU only
# and report the CPU fallback explicitly instead of raising.
if device.type == "cuda":
    print(torch.cuda.get_device_name(device))
else:
    print("CUDA not available; running on CPU")

NVIDIA GeForce GTX 1650


In [2]:
import numpy as np
import pandas as pd
# pandas display: show full cell contents instead of truncating long text columns
pd.set_option('display.max_colwidth', None)
import re
import string
import random
import time
import gc
import pickle

import nltk
nltk.download('stopwords')
import emoji

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn.functional as F

import pytorch_lightning as pl

from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup, BertTokenizerFast

from sklearn.metrics import accuracy_score, roc_curve, auc
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\makch\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Arabic stopword list from the NLTK 'stopwords' corpus, stored as a set so
# that remove_stopwords() can do fast membership tests.
arabic_stopwords = set(nltk.corpus.stopwords.words("arabic"))

# Verbose-mode regex (whitespace and '#' comments inside the pattern are
# ignored) matching any one of the listed Arabic diacritic marks or the
# tatweel/kashida character. Used by remove_diacritics().
arabic_diacritics = re.compile("""
                             ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE)

# Characters treated as punctuation by cleaning_content(): an Arabic-oriented
# list plus Python's ASCII punctuation. The two lists overlap; duplicates are
# harmless because the combined string is only used as a set of characters.
arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
english_punctuations = string.punctuation
punctuations = arabic_punctuations + english_punctuations


def remove_urls(text):
    """Strip URL-like substrings from ``text`` and return the result.

    A match is an optional ``http``/``https`` scheme, the literal ``://``,
    and a run of word characters or any of ``. / ? = & %`` that ends on a
    word boundary. Characters outside that set (for example ``-``) end the
    match, so the remainder of such a URL is left in the text.
    """
    url_pattern = re.compile(
        r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b',
        flags=re.MULTILINE,
    )
    return url_pattern.sub('', text)


def remove_emails(text):
    """Blank out every line of ``text`` that consists solely of an e-mail address.

    The pattern is anchored with ``^`` and ``$`` in multi-line mode, so an
    address embedded in a longer line is left untouched.
    """
    email_only_line = re.compile(
        r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)",
        re.MULTILINE,
    )
    return email_only_line.sub("", text)

# def remove_emoji(text):
#     return emoji.get_emoji_regexp().sub(u'', text)

def remove_emoji(data):
    """Delete emoji and other symbol characters from ``data``.

    Every run of characters that falls inside one of the code-point ranges
    below is removed.
    NOTE(review): the U+24C2-U+1F251 and U+10000-U+10FFFF ranges are very
    broad and also cover non-emoji characters in those code-point spans.
    """
    symbol_ranges = (
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # regional-indicator flags
        "\U00002500-\U00002BEF"
        "\U00002702-\U000027B0"
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "\U0001f926-\U0001f937"
        "\U00010000-\U0010ffff"
        "\u2640-\u2642"
        "\u2600-\u2B55"
        "\u200d"  # zero-width joiner
        "\u23cf"
        "\u23e9"
        "\u231a"
        "\ufe0f"  # variation selector-16
        "\u3030"
    )
    symbol_pattern = re.compile("[" + symbol_ranges + "]+", re.UNICODE)
    return symbol_pattern.sub('', data)

def normalization(text):
    """Collapse Arabic letter variants in ``text`` to a single canonical letter.

    The substitutions are applied in the listed order; each pair is
    (regex pattern, replacement).
    """
    substitutions = (
        ("[إأآا]", "ا"),
        ("ى", "ي"),
        ("ؤ", "ء"),
        ("ئ", "ء"),
        ("ة", "ه"),
        ("گ", "ك"),
    )
    for variant_pattern, canonical in substitutions:
        text = re.sub(variant_pattern, canonical, text)
    return text

def remove_diacritics(text):
    """Return ``text`` with every match of the module-level ``arabic_diacritics`` pattern removed."""
    return arabic_diacritics.sub('', text)

def remove_stopwords(text):
    """Drop whitespace-separated tokens of ``text`` found in ``arabic_stopwords``.

    The surviving tokens are re-joined with single spaces, which also
    collapses any run of whitespace in the input.
    """
    kept_tokens = []
    for token in text.split():
        if token in arabic_stopwords:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

def cleaning_content(line):
    """Clean one raw text sample before it is passed to the classifiers.

    Steps, in order: flatten newlines, remove e-mail-only text and URLs,
    strip emoji, drop tokens containing '@' and markup residue ('RT',
    '<LF>', '<br />', '&quot;', '<url>'), replace punctuation with
    whitespace, remove Arabic stopwords, normalise letter variants and
    strip diacritics.

    Args:
        line: the raw text. Any non-string value (for example the float NaN
            that pandas uses for an empty cell) is treated as missing.

    Returns:
        The cleaned text with tokens separated by single spaces, or None
        when ``line`` is not a string.
    """
    if not isinstance(line, str):
        return None
    # str.replace returns a new string; previously the result was discarded,
    # so the newline flattening never took effect.
    line = line.replace('\n', ' ')
    line = remove_emails(line)
    line = remove_urls(line)
    line = remove_emoji(line)
    # Tokens containing '@' (mentions, inline e-mail addresses) are swapped for
    # the 'USERID' placeholder, which is deleted again by the replace chain below.
    nline = [w if '@' not in w else 'USERID' for w in line.split()]
    line = ' '.join(nline)
    line = line.replace('RT', '').replace('<LF>', '').replace('<br />','').replace('&quot;', '').replace('<url>', '').replace('USERID', '')

    # Turn every punctuation character into a space so neighbouring words stay
    # separated. remove_stopwords() splits on whitespace and re-joins with single
    # spaces, so this produces the same tokens as padding each character with
    # spaces and then deleting it.
    # NOTE(review): tatweel is part of `punctuations`, so elongated words are
    # split at this point, before remove_diacritics() runs.
    line = line.translate(str.maketrans(dict.fromkeys(punctuations, ' ')))

    line = remove_stopwords(line)
    line = remove_diacritics(normalization(line))
    return line

def hasDigits(s):
    """Return True if ``s`` contains an ASCII digit or an Arabic-Indic digit.

    ASCII digits are code points 48-57 ('0'-'9'); Arabic-Indic digits are
    code points 1632-1641 (U+0660-U+0669).
    """
    for char in s:
        code_point = ord(char)
        if 48 <= code_point <= 57:
            return True
        if 1632 <= code_point <= 1641:
            return True
    return False


In [4]:
# Hugging Face checkpoint used for both the dialect encoder and its tokenizer.
BERT_MODEL_NAME = 'alger-ia/dziribert'
class TweetModule(pl.LightningModule):
  """BERT encoder followed by one linear layer that maps the pooled output to class logits."""

  def __init__(self, n_classes):
    """Build the encoder from ``BERT_MODEL_NAME`` and a linear head with ``n_classes`` outputs."""
    super().__init__()
    self.bert = BertModel.from_pretrained(BERT_MODEL_NAME)
    hidden_size = self.bert.config.hidden_size
    self.classifier = nn.Linear(hidden_size, n_classes)
    self.criterion = nn.CrossEntropyLoss()

  def forward(self, input_ids, attention_mask, labels=None):
    """Return the class logits, or ``(loss, logits)`` when ``labels`` is given."""
    encoded = self.bert(input_ids, attention_mask)
    logits = self.classifier(encoded.pooler_output)
    # Without labels there is nothing to score: hand back the logits alone.
    if labels is None:
      return logits
    loss = self.criterion(logits, labels)
    return loss, logits


In [5]:
# load the model
# Class index -> country name for the dialect classifier.
# NOTE(review): presumably mirrors the label encoding used at training time - confirm.
dialect_dict = {0: 'lebanon', 1: 'egypt', 2: 'morocco', 3: 'tunisia', 4: 'algeria', 5: 'qatar', 6: 'iraq', 7: 'saudi arabia', 8: 'libya', 9: 'jordan'}

# Size the output layer from the label map instead of repeating the class count.
dialect_model = TweetModule(len(dialect_dict))
# The notebook only runs inference: eval mode switches off dropout so repeated
# predictions on the same text are deterministic.
dialect_model.eval()
# map_location lets a checkpoint that was saved on a GPU load on whichever device
# was selected (including the CPU fallback).
# NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.
dialect_model.load_state_dict(torch.load('../models/dialect_classifier.pt', map_location=device))

Some weights of the model checkpoint at alger-ia/dziribert were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at alger-ia/dziribert and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You s

<All keys matched successfully>

In [6]:
class BertClassifier(nn.Module):
    """AraBERT encoder with a small feed-forward head producing two-class logits.
    """
    def __init__(self, freeze_bert=False):
        """
        @param    freeze_bert (bool): when True, gradients are disabled for every
                      BERT parameter; leave `False` to fine-tune the BERT model
        """
        super().__init__()
        # BERT hidden size, width of the head's hidden layer, number of labels
        encoder_dim, head_dim, n_labels = 768, 50, 2

        # Pretrained encoder
        self.bert = BertModel.from_pretrained('aubmindlab/bert-base-arabertv02')

        # Feed-forward head: Linear -> ReLU -> Dropout -> Linear
        head_layers = [
            nn.Linear(encoder_dim, head_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(head_dim, n_labels),
        ]
        self.classifier = nn.Sequential(*head_layers)

        # Optionally exclude the encoder from gradient updates
        if freeze_bert:
            for weight in self.bert.parameters():
                weight.requires_grad = False

    def forward(self, input_ids, attention_mask):
        """
        Run the encoder and the head to obtain logits.
        @param    input_ids (torch.Tensor): token ids with shape (batch_size,
                      max_length)
        @param    attention_mask (torch.Tensor): attention mask with shape
                      (batch_size, max_length)
        @return   logits (torch.Tensor): tensor with shape (batch_size,
                      num_labels)
        """
        encoder_out = self.bert(input_ids=input_ids,
                                attention_mask=attention_mask)

        # The first element of the encoder output is the last hidden state; its
        # position-0 vector (the `[CLS]` token) is what the head classifies.
        cls_embedding = encoder_out[0][:, 0, :]
        return self.classifier(cls_embedding)

In [7]:
# load the model
# Class index -> label name for the offensive-language classifier.
# NOTE(review): the prediction cell below writes 'not_offensive' rather than this
# dict's 'non_offensive' spelling - confirm which one is intended before reusing it.
off_dict = {1: 'offensive', 0: 'non_offensive'}

off_model = BertClassifier()
# The notebook only runs inference: eval mode switches off dropout so repeated
# predictions on the same text are deterministic.
off_model.eval()
# map_location lets a checkpoint that was saved on a GPU load on whichever device
# was selected (including the CPU fallback).
# NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.
off_model.load_state_dict(torch.load('../models/modelv3.pt', map_location=device))

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv02 were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [8]:
# Maximum token length passed to the tokenizer in predict_off().
MAX_LEN = 256

# Each classifier uses the tokenizer of the checkpoint its encoder was loaded from.
tokenizer_dialect = BertTokenizerFast.from_pretrained(BERT_MODEL_NAME)
tokenizer_off = BertTokenizer.from_pretrained('aubmindlab/bert-base-arabertv02', do_lower_case=True)

def predict_off(review_text,model,device,tokenizer):
    """Predict the offensive-language class index for a single text.

    Args:
        review_text: the (already cleaned) text to classify.
        model: callable module returning logits for ``(input_ids, attention_mask)``.
        device: torch device the input tensors are moved to; the model is
            expected to already live on it.
        tokenizer: tokenizer exposing ``encode_plus``.

    Returns:
        The index of the largest logit (a numpy integer).
    """
    encoded_review = tokenizer.encode_plus(
        review_text,
        max_length=MAX_LEN,
        # Without truncation the tokenizer does not enforce max_length, so an
        # over-long text would be passed to the model at full length.
        truncation=True,
        add_special_tokens=True,
        return_token_type_ids=False,
        padding='longest',
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoded_review['input_ids'].to(device)
    attention_mask = encoded_review['attention_mask'].to(device)

    # Inference only: disable dropout and skip building the autograd graph.
    model.eval()
    with torch.no_grad():
        output = model(input_ids, attention_mask)

    # argmax over the logits of the single input row
    return output.detach().cpu().numpy().argmax()

In [9]:
def predict_dialect(review_text,model,device,tokenizer,max_length=123):
    """Predict the dialect class index for a single text.

    Args:
        review_text: the (already cleaned) text to classify.
        model: callable module returning logits for ``(input_ids, attention_mask)``.
        device: torch device the input tensors are moved to; the model is
            expected to already live on it.
        tokenizer: tokenizer exposing ``encode_plus``.
        max_length: maximum number of tokens kept per text (default 123, the
            value previously hard-coded here).

    Returns:
        The index of the largest logit (a numpy integer).
    """
    encoded_review = tokenizer.encode_plus(
        review_text,
        max_length=max_length,
        # Without truncation the tokenizer does not enforce max_length, so an
        # over-long text would be passed to the model at full length.
        truncation=True,
        add_special_tokens=True,
        return_token_type_ids=False,
        padding='longest',
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoded_review['input_ids'].to(device)
    attention_mask = encoded_review['attention_mask'].to(device)

    # Inference only: disable dropout and skip building the autograd graph.
    model.eval()
    with torch.no_grad():
        output = model(input_ids, attention_mask)

    # argmax over the logits of the single input row
    return output.detach().cpu().numpy().argmax()

In [10]:
# Load the test set and replace its 'Text' column with the cleaned text.
test_data = (
    pd.read_csv('../DataSet/test-data.csv')
    .assign(Text=lambda frame: frame['Text'].apply(cleaning_content))
)

In [11]:
# Move the offensive-language classifier to the selected device.
model = off_model.to(device)

# One predicted class index per cleaned text, in row order.
y_pred = [
    predict_off(text, model, device, tokenizer_off)
    for text in test_data['Text']
]

# Store the predictions as readable labels: index 1 is 'offensive',
# every other index is 'not_offensive'.
test_data['Label'] = [
    'offensive' if pred == 1 else 'not_offensive'
    for pred in y_pred
]

test_data.head()

Unnamed: 0,Text,Label
0,بالله ريحي جمالك وخطي الخوض الشرع جو يهله والهي بمجتمعكن الماساوي اللي مسمياته نسوي,not_offensive
1,تريدين اخذ حقوق المراه خوذيهم بطريقه الصح مش بطريقه الجاهليه المتخلفه الي مافيهاش منطق صح نساء مظلومات وهك وسح مجتمعنا اليبي متشدد بلنسبه صح ميصيرش المرا الراجل الرجل ابمراه وهذي حقيقه مش هذاك الرجال جيتي ودرتي جو ناخذ حقي بكلام مش منطقي تاخذي حقك وحق النساء خوذيه بطريقه منطقيه وبما الله وانتي تقولي النساء الرجال والخخخ المهم فكري بتاخذي حقك مش تقعدي مواقع تواصل تقولي كلام ملاش معني والناس يضحكو وعلي كلامك وعلي تفكيرك المعكوس والغريب,offensive
2,ناقصه عقل ودين صاحبه المنشور,offensive
3,اي عمل ممكن للمراه اتقانه باكثر كفاءه دقه جوده الشي عمره صعب عليها ابدا,not_offensive
4,الشي الوحيد تمهر المرٱه الرجل العمل الكوجينه تنظيف البيوت اما الباقي تصل لظفر الرجل الوقت الحالي,not_offensive


In [12]:
# Move the dialect classifier to the selected device.
model2 = dialect_model.to(device)

# One predicted dialect index per cleaned text, in row order.
y_pred2 = [
    predict_dialect(text, model2, device, tokenizer_dialect)
    for text in test_data['Text']
]

# Store the predictions as country names looked up in dialect_dict.
test_data['Dialect'] = [dialect_dict[pred] for pred in y_pred2]

test_data.head()


Unnamed: 0,Text,Label,Dialect
0,بالله ريحي جمالك وخطي الخوض الشرع جو يهله والهي بمجتمعكن الماساوي اللي مسمياته نسوي,not_offensive,tunisia
1,تريدين اخذ حقوق المراه خوذيهم بطريقه الصح مش بطريقه الجاهليه المتخلفه الي مافيهاش منطق صح نساء مظلومات وهك وسح مجتمعنا اليبي متشدد بلنسبه صح ميصيرش المرا الراجل الرجل ابمراه وهذي حقيقه مش هذاك الرجال جيتي ودرتي جو ناخذ حقي بكلام مش منطقي تاخذي حقك وحق النساء خوذيه بطريقه منطقيه وبما الله وانتي تقولي النساء الرجال والخخخ المهم فكري بتاخذي حقك مش تقعدي مواقع تواصل تقولي كلام ملاش معني والناس يضحكو وعلي كلامك وعلي تفكيرك المعكوس والغريب,offensive,tunisia
2,ناقصه عقل ودين صاحبه المنشور,offensive,tunisia
3,اي عمل ممكن للمراه اتقانه باكثر كفاءه دقه جوده الشي عمره صعب عليها ابدا,not_offensive,saudi arabia
4,الشي الوحيد تمهر المرٱه الرجل العمل الكوجينه تنظيف البيوت اما الباقي تصل لظفر الرجل الوقت الحالي,not_offensive,iraq


In [13]:
# ! pip3 install geopy

In [14]:
from geopy.geocoders import Nominatim
import folium

geolocator = Nominatim(user_agent="NLP")

# Successful lookups keyed by the query string, so a repeated query is answered
# locally instead of sending another request to the geocoding service.
_geolocate_cache = {}

def geolocate(country):
    """Return the ``(latitude, longitude)`` the geocoder reports for ``country``.

    Args:
        country: place name to look up (here: a country name).

    Returns:
        A ``(latitude, longitude)`` tuple, or ``np.nan`` when the geocoder
        finds no match or the lookup fails. Failures are not cached, so a
        later call retries the request.
    """
    try:
        if country in _geolocate_cache:
            return _geolocate_cache[country]
        loc = geolocator.geocode(country)
        # geocode() returned nothing usable for this query
        if loc is None:
            return np.nan
        coords = (loc.latitude, loc.longitude)
        _geolocate_cache[country] = coords
        return coords
    except Exception:
        # Best-effort lookup: report a missing value on any geocoding error,
        # but let KeyboardInterrupt / SystemExit propagate (a bare `except:`
        # would swallow them too).
        return np.nan

In [15]:
# get the location of each country
# Geocode each distinct country once and reuse the result for all of its rows,
# rather than issuing one network request per text.
dialect_locations = {
    name: geolocate(name)
    for name in test_data['Dialect'].unique()
}
test_data['Location'] = [dialect_locations[name] for name in test_data['Dialect']]

test_data.head()


Unnamed: 0,Text,Label,Dialect,Location
0,بالله ريحي جمالك وخطي الخوض الشرع جو يهله والهي بمجتمعكن الماساوي اللي مسمياته نسوي,not_offensive,tunisia,"(33.8439408, 9.400138)"
1,تريدين اخذ حقوق المراه خوذيهم بطريقه الصح مش بطريقه الجاهليه المتخلفه الي مافيهاش منطق صح نساء مظلومات وهك وسح مجتمعنا اليبي متشدد بلنسبه صح ميصيرش المرا الراجل الرجل ابمراه وهذي حقيقه مش هذاك الرجال جيتي ودرتي جو ناخذ حقي بكلام مش منطقي تاخذي حقك وحق النساء خوذيه بطريقه منطقيه وبما الله وانتي تقولي النساء الرجال والخخخ المهم فكري بتاخذي حقك مش تقعدي مواقع تواصل تقولي كلام ملاش معني والناس يضحكو وعلي كلامك وعلي تفكيرك المعكوس والغريب,offensive,tunisia,"(33.8439408, 9.400138)"
2,ناقصه عقل ودين صاحبه المنشور,offensive,tunisia,"(33.8439408, 9.400138)"
3,اي عمل ممكن للمراه اتقانه باكثر كفاءه دقه جوده الشي عمره صعب عليها ابدا,not_offensive,saudi arabia,"(25.6242618, 42.3528328)"
4,الشي الوحيد تمهر المرٱه الرجل العمل الكوجينه تنظيف البيوت اما الباقي تصل لظفر الرجل الوقت الحالي,not_offensive,iraq,"(33.0955793, 44.1749775)"


In [26]:
# Rows whose predicted dialect is 'morocco'.
test_data.loc[test_data['Dialect'] == 'morocco']

Unnamed: 0,Text,Label,Dialect,Location
31,سواء راجل او مراه الواحد مايتمناش الشر للناس,offensive,morocco,"(31.1728205, -7.3362482)"


In [17]:
# Number of texts per predicted dialect, most frequent first.
test_data['Dialect'].value_counts()

libya           16
tunisia          9
egypt            4
saudi arabia     3
iraq             2
algeria          2
morocco          1
Name: Dialect, dtype: int64

In [18]:
# Count the texts per (dialect, label) pair, one row per dialect.
# crosstab fills combinations that never occur with 0 and keeps integer counts;
# the reindex guarantees that both label columns exist even when one label is
# absent from the predictions, so later lookups of 'offensive' /
# 'not_offensive' cannot raise a KeyError.
dialect_off = (
    pd.crosstab(test_data['Dialect'], test_data['Label'])
    .reindex(columns=['not_offensive', 'offensive'], fill_value=0)
    .reset_index()
)

# add location column
dialect_off['Location'] = dialect_off['Dialect'].apply(geolocate)


dialect_off.head(10)

Label,Dialect,not_offensive,offensive,Location
0,algeria,2.0,0.0,"(28.0000272, 2.9999825)"
1,egypt,1.0,3.0,"(26.2540493, 29.2675469)"
2,iraq,2.0,0.0,"(33.0955793, 44.1749775)"
3,libya,11.0,5.0,"(26.8234472, 18.1236723)"
4,morocco,0.0,1.0,"(31.1728205, -7.3362482)"
5,saudi arabia,3.0,0.0,"(25.6242618, 42.3528328)"
6,tunisia,6.0,3.0,"(33.8439408, 9.400138)"


In [19]:
# Create map zoomed in on northern Africa and the Middle East
m = folium.Map(location=[27, 48], zoom_start=4, tiles='OpenStreetMap')

# One circle and one marker per dialect. The loop covers every row of
# dialect_off (the previous `range(len(dialect_off) - 1)` skipped the last one).
for dialect, n_not_offensive, n_offensive, location in zip(
    dialect_off['Dialect'],
    dialect_off['not_offensive'],
    dialect_off['offensive'],
    dialect_off['Location'],
):
    # geolocate() yields NaN when a lookup fails; such rows have no coordinates
    # to draw at, so skip them instead of passing NaN to folium.
    if not isinstance(location, (tuple, list)):
        continue
    # Counts may arrive as floats (e.g. 5.0); show and use them as integers.
    n_offensive = int(n_offensive)
    n_not_offensive = int(n_not_offensive)
    # circle radius = total number of texts predicted for this dialect
    count = n_not_offensive + n_offensive
    folium.CircleMarker(location=location, radius=count, color='red', fill=True, fill_color='red').add_to(m)
    # popup with the dialect name and the offensive / non-offensive counts
    popup_text = dialect + ' - ' + 'Offensive: ' + str(n_offensive) + ' - ' + 'Non Offensive: ' + str(n_not_offensive)
    folium.Marker(location=location, popup=popup_text).add_to(m)

# show the map
m