In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [8]:
data = pd.read_csv('../DataSet/offensive_dataset.csv', encoding='utf-16')
data.head()

Unnamed: 0,Comment,is_off
0,ÿ¥ŸàŸÅ ÿ≠ÿØ ŸäŸÜ*** ÿ£ŸÜÿ™ Ÿà ÿ•Ÿäÿ±ÿßŸÜ.,1
1,ÿ¥ŸàŸÅ ÿ™ŸäŸÅŸä ŸÇŸàÿØÿßÿ™Ÿáÿß ŸÖÿπ ÿßŸÑÿ¨ŸÜ ÿ∫Ÿä ÿ®ŸÑÿßÿ™Ÿä Ÿäÿ¨Ÿä ŸÖŸÜ ŸÖŸÉÿ©,0
2,ÿπÿ¨ÿ® Ÿäÿ™ŸÅÿßŸàÿ∏ŸàŸÜ ÿπŸÑŸâ ÿ¥ÿπŸàÿ® ÿπÿ±ÿ®Ÿäÿ© ŸàÿπŸÑŸäŸáŸÖ ŸàŸáŸâŸÑÿß ÿ™ÿπŸÑŸÖ,0
3,ŸáŸáŸáŸáŸáŸáŸá ŸáÿßÿØ ÿ¥ŸàŸÅ ÿ™ŸäŸÅŸä ÿßŸÑŸÑŸä ÿ≠ÿ∑ÿ™ ÿπŸÑŸäŸá ŸÖÿ¥ÿß ŸÅŸäŸáÿß,0
4,ÿßŸÑÿ±ÿ≤ŸÇ Ÿäÿ£ÿ™Ÿä ŸÖŸÜ ÿ≠Ÿäÿ´ ŸÑÿß ÿ™ÿØÿ±Ÿä üôåüèª‚ù§Ô∏è,0


In [9]:
# count of target classes
data['is_off'].value_counts()


1    7000
0    7000
Name: is_off, dtype: int64

In [10]:
# get the data with target = 1 (offensive)
offensive_data = data[data['is_off'] == 1]
offensive_data.head()


Unnamed: 0,Comment,is_off
0,ÿ¥ŸàŸÅ ÿ≠ÿØ ŸäŸÜ*** ÿ£ŸÜÿ™ Ÿà ÿ•Ÿäÿ±ÿßŸÜ.,1
5,ÿØÿÆŸÑŸà ÿ≥ŸàŸÇ ŸÉÿ±ŸÉŸÖ \nÿ¥ŸàŸÅŸà ÿ¥ŸÉŸàŸÜ ŸÖŸÉÿ™ÿ® ÿπŸÑŸäŸÉŸÖ ÿ™ÿπŸäÿ¥Ÿà ÿπÿ®Ÿä...,1
6,@User.IDX ŸÑÿ≥ÿ≥Ÿá ÿßŸÖÿ®ÿßÿ±ÿ≠ ÿ®ÿ≠ÿ∞ÿ±ŸÉŸà ŸÖŸÜ ŸàŸÑÿßÿØ ÿßŸÑÿ¨ÿ≤ŸÖŸá ŸÅŸä...,1
8,ÿßŸÑŸÑŸá ŸäÿπŸÅŸà ÿπŸÑŸäŸáÿß ŸÖÿ≥ŸÉŸäŸÜÿ© ÿØÿ∫Ÿäÿß ÿ∑ÿßÿ±Ÿà ÿ®Ÿáÿß ÿ≥ÿµÿ≠ÿßÿ® ÿßŸÑÿ≠...,1
9,ÿ™ŸÅŸà ÿ¥ÿ≠ÿßÿßÿßŸÑ ÿ≠ÿßŸÖÿ∂ÿ© ŸáÿßÿØ ÿ≥ÿπŸäŸäŸäÿØÿ©...ŸÖÿπÿ±ŸÅÿ™ ÿπŸÑÿßŸá ŸÉŸäÿ¨Ÿä...,1


In [11]:
import string
import re
import nltk

In [12]:
arabic_stopwords = set(nltk.corpus.stopwords.words("arabic"))

arabic_diacritics = re.compile("""
                             Ÿë    | # Tashdid
                             Ÿé    | # Fatha
                             Ÿã    | # Tanwin Fath
                             Ÿè    | # Damma
                             Ÿå    | # Tanwin Damm
                             Ÿê    | # Kasra
                             Ÿç    | # Tanwin Kasr
                             Ÿí    | # Sukun
                             ŸÄ     # Tatwil/Kashida
                         """, re.VERBOSE)

arabic_punctuations = '''`√∑√óÿõ<>_()*&^%][ŸÄÿå/:"ÿü.,'{}~¬¶+|!‚Äù‚Ä¶‚Äú‚ÄìŸÄ'''
english_punctuations = string.punctuation
punctuations = arabic_punctuations + english_punctuations


def remove_urls (text):
    text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', text, flags=re.MULTILINE)
    return text


def remove_emails(text):
    text = re.sub(r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)", "",  text, flags=re.MULTILINE)
    return text

# def remove_emoji(text):
#     return emoji.get_emoji_regexp().sub(u'', text)

def remove_emoji(data):
    emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)
    return re.sub(emoj, '', data)

def normalization(text):
    text = re.sub("[ÿ•ÿ£ÿ¢ÿß]", "ÿß", text)
    text = re.sub("Ÿâ", "Ÿä", text)
    text = re.sub("ÿ§", "ÿ°", text)
    text = re.sub("ÿ¶", "ÿ°", text)
    text = re.sub("ÿ©", "Ÿá", text)
    text = re.sub("⁄Ø", "ŸÉ", text)
    return text

def remove_diacritics(text):
    text = re.sub(arabic_diacritics, '', text)
    return text

def remove_stopwords(text):
    filtered_sentence = [w for w in text.split() if not w in arabic_stopwords]
    return ' '.join(filtered_sentence)

def cleaning_content(line):
    if (isinstance(line, float)):
        return None
    line.replace('\n', ' ')
    line = remove_emails(line)
    line = remove_urls(line)
    line = remove_emoji(line)
    nline = [w if '@' not in w else 'USERID' for w in line.split()]
    line = ' '.join(nline)
    line = line.replace('RT', '').replace('<LF>', '').replace('<br />','').replace('&quot;', '').replace('<url>', '').replace('USERID', '')


    # add spaces between punc,
    line = line.translate(str.maketrans({key: " {0} ".format(key) for key in punctuations}))

    # then remove punc,
    translator = str.maketrans('', '', punctuations)
    line = line.translate(translator)

    line = remove_stopwords(line)
    line=remove_diacritics(normalization(line))
    return line

def hasDigits(s):
    return any( 48 <= ord(char) <= 57  or 1632 <= ord(char) <= 1641 for char in s)


In [13]:
offensive_data.Comment = offensive_data.Comment.apply(cleaning_content)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  offensive_data.Comment = offensive_data.Comment.apply(cleaning_content)


In [26]:
cv = CountVectorizer(max_df=0.95, min_df=2, stop_words=arabic_stopwords)
dtm = cv.fit_transform(offensive_data['Comment'])

feature_names = cv.get_feature_names()
print(len(feature_names))



11669




In [28]:
LDA_model = LatentDirichletAllocation(n_components= 5, random_state=42 , max_iter=100, n_jobs=-1)

LDA_model.fit(dtm)

LatentDirichletAllocation(max_iter=100, n_components=5, n_jobs=-1,
                          random_state=42)

In [36]:
for i, topic in enumerate(LDA_model.components_):
    print("THE TOP {} WORDS FOR TOPIC #{}".format(20, i))
    print([cv.get_feature_names()[index] for index in topic.argsort()[-20:]])
    print("\n")

THE TOP 20 WORDS FOR TOPIC #0




['ŸÇÿ®ÿ≠', 'ÿßŸÑÿ≠ŸÇ', 'ÿßŸÑÿ®ÿ±ÿ™Ÿàÿ¥', 'ÿ™ÿπÿßŸÑŸä', 'ÿØŸäÿßŸÑ', 'ÿßŸà', 'ŸÇÿßŸÑ', 'ŸäŸàŸÖ', 'ŸäŸÇŸàŸÑ', 'ÿ≠ŸàŸÑ', 'ŸÇŸÜÿßŸá', 'ŸÇŸàŸá', 'ŸàÿßŸÑŸÑŸá', 'ÿ®ÿßŸÑŸÑŸá', 'ÿßŸÑŸÜÿßÿ≥', 'ÿßŸÑÿ¨ÿ≤Ÿäÿ±Ÿá', 'ÿßŸÑŸä', 'ÿßŸÜ', 'ÿßŸÑÿß', 'ÿßŸÑŸÑŸá']


THE TOP 20 WORDS FOR TOPIC #1
['ÿπŸÑŸä', 'ŸàÿßŸÑŸÑŸá', 'ÿØÿßÿπÿ¥', 'ÿßŸÑŸäŸÖŸÜ', 'ÿ≥Ÿàÿ±Ÿäÿß', 'ÿßŸà', 'ÿßŸÜŸá', 'ŸÇÿ™ŸÑ', 'ŸäÿπŸÜŸä', 'ÿßŸÜÿ™', 'ÿßŸÑŸä', 'ŸÇŸÜÿßŸá', 'ÿßŸÑÿ≥ÿπŸàÿØŸäŸá', 'ŸÖÿµÿ±', 'ŸÇÿ∑ÿ±', 'ÿßŸÑÿ¥ÿπÿ®', 'ÿßŸÑÿ¨ÿ≤Ÿäÿ±Ÿá', 'ÿßŸäÿ±ÿßŸÜ', 'ÿßŸÑŸÑŸá', 'ÿßŸÜ']


THE TOP 20 WORDS FOR TOPIC #2
['ÿßŸÜ', 'ÿßÿ¥', 'Ÿáÿßÿ∞', 'ÿßŸÑÿÆÿØŸÖŸá', 'ÿØŸàŸÑŸá', 'ŸÖÿ≠ŸÖÿØ', 'ÿßŸÑÿ≥ÿπŸàÿØŸäŸá', 'ÿßÿÆÿ™Ÿä', 'ÿ¨ŸáŸÜŸÖ', 'ÿßŸÑÿØŸàŸÑŸá', 'ÿßŸÑŸÑŸáŸÖ', 'ÿßŸÑÿßÿ≥ŸÑÿßŸÖŸäŸá', 'ÿßŸÑÿß', 'ÿßŸÑÿπÿ±ÿ®', 'ŸÑŸÑŸá', 'ÿßŸÉÿ®ÿ±', 'ÿßŸÑÿßÿ≥ŸÑÿßŸÖ', 'ÿßŸÑŸä', 'ÿßŸà', 'ÿßŸÑŸÑŸá']


THE TOP 20 WORDS FOR TOPIC #3
['ŸÑŸäŸÉ', 'ÿßŸÑŸÑŸä', 'ÿ¥ŸàŸÅ', 'ÿ®ÿßÿ¥', 'ÿßÿ¥', 'ŸàÿßŸÑŸÑŸá', 'ÿßŸÜÿß', 'ÿßŸà', 'ŸÖÿßÿ¥Ÿä', 'ÿØŸäÿßŸÑ', 'ÿ≥Ÿäÿ±', 'Ÿàÿßÿ¥', 'ÿØŸäÿßŸÑŸÉ', 'Ÿäÿπÿ∑ŸäŸÉ', 'ÿ±ÿßŸá', 

In [30]:
final_topic = LDA_model.transform(dtm)

offensive_data['Topic'] = final_topic.argmax(axis=1)

offensive_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  offensive_data['Topic'] = final_topic.argmax(axis=1)


Unnamed: 0,Comment,is_off,Topic
0,ÿ¥ŸàŸÅ ÿ≠ÿØ ÿßŸäÿ±ÿßŸÜ,1,3
5,ÿØÿÆŸÑŸà ÿ≥ŸàŸÇ ŸÉÿ±ŸÉŸÖ ÿ¥ŸàŸÅŸà ÿ¥ŸÉŸàŸÜ ŸÖŸÉÿ™ÿ® ÿπŸÑŸäŸÉŸÖ ÿ™ÿπŸäÿ¥Ÿà ÿπÿ®ŸäÿØ ...,1,2
6,ŸÑÿ≥ÿ≥Ÿá ÿßŸÖÿ®ÿßÿ±ÿ≠ ÿ®ÿ≠ÿ∞ÿ±ŸÉŸà ŸàŸÑÿßÿØ ÿßŸÑÿ¨ÿ≤ŸÖŸá ŸÅŸäŸÅŸä ŸàŸÑÿßÿØ ÿ¥ÿ±ŸÖŸàÿ∑...,1,1
8,ÿßŸÑŸÑŸá ŸäÿπŸÅŸà ÿπŸÑŸäŸáÿß ŸÖÿ≥ŸÉŸäŸÜŸá ÿØÿ∫Ÿäÿß ÿ∑ÿßÿ±Ÿà ÿ≥ÿµÿ≠ÿßÿ® ÿßŸÑÿ≠ÿ≥ŸÜÿßÿ™...,1,3
9,ÿ™ŸÅŸà ÿ¥ÿ≠ÿßÿßÿßŸÑ ÿ≠ÿßŸÖÿ∂Ÿá ŸáÿßÿØ ÿ≥ÿπŸäŸäŸäÿØŸá ŸÖÿπÿ±ŸÅÿ™ ÿπŸÑÿßŸá ŸÉŸäÿ¨Ÿäÿ®Ÿà...,1,3


In [33]:
# select the comments with topic 0
topic_0 = offensive_data[offensive_data['Topic'] == 0]
topic_0.head()


Unnamed: 0,Comment,is_off,Topic
24,,1,0
29,ÿßŸÑÿ™ŸÇŸäŸá ÿØŸäŸÜ ÿßŸÑÿ±ÿßŸÅÿ∂Ÿá ŸÑÿ∞ŸÑŸÉ ÿßŸÑÿπÿ® ÿ∫Ÿäÿ±Ÿáÿß ŸáŸáŸáŸáŸá,1,0
51,ŸÅÿ≥ÿßÿØ ÿµÿßÿ±ÿÆ ÿ≤ÿßÿ°ÿØ ÿ∏ŸÑŸÖ ŸÅÿßÿØÿ≠ ŸäŸÅÿ±ÿÆ ÿßÿ±Ÿáÿßÿ® ÿπŸÅŸàŸä ŸäŸÖŸÉŸÜ ÿ™...,1,0
65,ÿßŸÑŸÑŸá ŸÉÿØÿ® ÿ≠ŸÑŸàŸÅ ÿ≠ŸÑŸàŸÅŸá ÿßŸÑÿ∫Ÿäÿ≥,1,0
106,ÿ®ÿ¥ÿπÿ® Ÿäÿ™ÿπÿßÿ∑ŸÅ ÿ≤ÿßŸÜŸäŸá ŸÇŸàŸÑ ŸÉŸÑŸÜÿß ŸÑŸäŸÑŸä ŸäŸÇÿ®ÿ≠ ÿ≤Ÿàÿ¨Ÿá ŸÖÿ≠ÿßŸÖ...,1,0


In [35]:

topic_0.shape


(716, 3)

In [31]:
import pyLDAvis.sklearn

# Enable the visualization on the notebook
pyLDAvis.enable_notebook()

# Create the panel for the visualization
panel = pyLDAvis.sklearn.prepare(LDA_model, dtm, cv, mds='tsne') 

# Show the panel
panel

  default_term_info = default_term_info.sort_values(
