In [59]:
!pip install cufflinks
import sys
import cufflinks




In [60]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from plotly.offline import init_notebook_mode, plot, iplot
import plotly.express as px
import plotly.graph_objects as go
init_notebook_mode(connected=True)

import cufflinks as cf
cf.go_offline()

from sklearn.feature_extraction.text import CountVectorizer

In [61]:
# Load the tweet dataset from a CSV file given by a relative path, then preview its first rows.
df = pd.read_csv('cyberbullying_tweets.csv')
df.head()

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying


In [62]:
# Count how many rows carry each cyberbullying_type label.
df['cyberbullying_type'].value_counts()


religion               7998
age                    7992
gender                 7973
ethnicity              7961
not_cyberbullying      7945
other_cyberbullying    7823
Name: cyberbullying_type, dtype: int64

In [63]:
# Plot the per-label row counts as a bar chart (kind="bar").
df['cyberbullying_type'].value_counts().iplot(kind="bar")



In [64]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Fetch every NLTK resource this notebook reads so it also runs on a fresh environment.
# NOTE(review): newer NLTK releases may additionally need 'punkt_tab' for word_tokenize —
# confirm against the installed version.
nltk.download('punkt')      # tokenizer models used by word_tokenize
nltk.download('stopwords')  # corpus read by stopwords.words('english')
nltk.download('wordnet')    # lexical database used by WordNetLemmatizer



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [65]:
import re
# Pattern of everything that gets replaced by a space during cleaning:
#   @\S+            -> @mentions
#   https?:\S+      -> http:/https: links
#   [^A-Za-z0-9]+   -> any run of characters that are not ASCII letters or digits
# Written as a raw string so that \S reaches the regex engine without relying on
# Python passing unknown string escapes through.
clean = r'@\S+|https?:\S+|[^A-Za-z0-9]+'
# English stop words from NLTK, held in a set; read by filter_stopwords and top_frequency_words.
stop_words = set(stopwords.words('english'))

In [66]:
def filter_stopwords(text):
    """Tokenize ``text`` with ``word_tokenize`` and rejoin the tokens not found in ``stop_words``.

    The surviving tokens are joined with single spaces and returned as one string.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token not in stop_words:
            kept_tokens.append(token)
    return " ".join(kept_tokens)


In [67]:
def text_preprocessing(text):
    """Clean one tweet.

    Every match of the ``clean`` pattern is replaced by a space, the result is
    lower-cased, and stop words are removed through ``filter_stopwords``.
    """
    stripped = re.sub(clean, ' ', text)
    return filter_stopwords(stripped.lower())

In [68]:
def top_frequency_words(text, ng_range=(1,1), n=None):
    """Return the most frequent n-grams found in a collection of documents.

    Parameters
    ----------
    text : iterable of str
        Documents to count n-grams over.
    ng_range : tuple of (int, int), default (1, 1)
        Forwarded to ``CountVectorizer`` as ``ngram_range``.
    n : int or None, default None
        Number of leading entries to keep; ``None`` keeps every entry.

    Returns
    -------
    list of (str, count) tuples
        Vocabulary terms paired with their total count, sorted by descending count.
    """
    # CountVectorizer documents stop_words as 'english', a list, or None;
    # scikit-learn's parameter validation rejects a set, so pass a list.
    vector = CountVectorizer(ngram_range=ng_range, stop_words=list(stop_words))
    # Learn the vocabulary and build the document-term matrix in a single pass.
    bag_of_words = vector.fit_transform(text)
    # Column sums give the corpus-wide count of each vocabulary term.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vector.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

In [69]:
# Overwrite tweet_text with the output of text_preprocessing; the raw text is not kept in df.
df['tweet_text']=df['tweet_text'].apply(text_preprocessing)

In [70]:
# Preview the first rows of df.
df.head()

Unnamed: 0,tweet_text,cyberbullying_type
0,words katandandre food crapilicious mkr,not_cyberbullying
1,aussietv white mkr theblock imacelebrityau tod...,not_cyberbullying
2,classy whore red velvet cupcakes,not_cyberbullying
3,meh p thanks heads concerned another angry dud...,not_cyberbullying
4,isis account pretending kurdish account like i...,not_cyberbullying


In [71]:
from nltk.stem import WordNetLemmatizer

lematizer = WordNetLemmatizer()
def lemmatizer_words(text, pos='a'):
    """Lemmatize every whitespace-separated token of ``text``.

    Parameters
    ----------
    text : str
        Text whose tokens are lemmatized one by one.
    pos : str, default 'a'
        Part-of-speech tag forwarded to ``WordNetLemmatizer.lemmatize``.
        NLTK documents 'n' (noun), 'v' (verb), 'a' (adjective), 'r' (adverb)
        and 's' (satellite adjective). The default keeps the original
        adjective-only behaviour.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces.
    """
    # NOTE(review): with pos='a' tokens are looked up as adjectives only, so plural
    # nouns such as "words" come back unchanged; pass pos='n' or 'v' to reduce those.
    return " ".join(lematizer.lemmatize(word, pos=pos) for word in text.split())
# Lemmatize each tweet; the function is passed directly because it already takes a single text.
df['tweet_text'] = df['tweet_text'].apply(lemmatizer_words)

In [72]:
# Preview the first rows of df again.
df.head()

Unnamed: 0,tweet_text,cyberbullying_type
0,words katandandre food crapilicious mkr,not_cyberbullying
1,aussietv white mkr theblock imacelebrityau tod...,not_cyberbullying
2,classy whore red velvet cupcakes,not_cyberbullying
3,meh p thanks heads concerned another angry dud...,not_cyberbullying
4,isis account pretending kurdish account like i...,not_cyberbullying


In [73]:
def cyberbullying_type_data(cb_type, column_name='cyberbullying_type'):
    """Return the ``tweet_text`` values of the rows of ``df`` where ``column_name`` equals ``cb_type``."""
    matches_type = df[column_name] == cb_type
    return df.loc[matches_type, 'tweet_text'].values

In [74]:
def most_used_words_phrases(cb_data, n=10):
    """Tabulate the ``n`` most frequent unigrams of ``cb_data``.

    The result is a DataFrame with the columns ``Text`` and ``count`` built
    from the pairs returned by ``top_frequency_words``.
    """
    return pd.DataFrame(
        top_frequency_words(cb_data, ng_range=(1, 1), n=n),
        columns=['Text', 'count'],
    )

In [75]:
# Top-20 word tables, one per label; the unpacking order matches the label order.
(gender_data, religion_data, ethnicity_data,
 age_data, other_cb_data, not_cb_data) = (
    most_used_words_phrases(cyberbullying_type_data(label), 20)
    for label in ('gender', 'religion', 'ethnicity', 'age',
                  'other_cyberbullying', 'not_cyberbullying')
)


In [76]:
def create_word_bar(data, title):
    """Show a Plotly Express bar chart of ``data``.

    ``Text`` goes on the x axis (and sets the bar colour), ``count`` goes on
    the y axis under the label "Word Frequency", and ``title`` heads the figure.
    """
    axis_labels = {'count': "Word Frequency"}
    px.bar(data, x='Text', y='count', color='Text',
           labels=axis_labels, title=title).show()

In [77]:
# One bar chart per label, drawn in the listed order.
for word_table, chart_title in (
    (gender_data, 'Gender'),
    (religion_data, 'Religion'),
    (ethnicity_data, 'Ethnicity'),
    (age_data, 'Age'),
    (other_cb_data, 'Other Cyberbullying'),
    (not_cb_data, 'Not Cyberbullying'),
):
    create_word_bar(word_table, chart_title)



In [78]:
def create_cb_sets(cb_type, n=100):
    """Build the table of the ``n`` most frequent unigrams for one cyberbullying type.

    Parameters
    ----------
    cb_type : str
        Value of the ``cyberbullying_type`` column selecting the tweets.
    n : int, default 100
        How many of the most frequent words to keep.

    Returns
    -------
    pandas.DataFrame
        Columns ``Text`` and ``count``, as produced by ``most_used_words_phrases``.
    """
    # Same filter-then-count steps as the top-20 tables, so reuse those helpers.
    return most_used_words_phrases(cyberbullying_type_data(cb_type), n)

In [79]:
# Per-label word tables from create_cb_sets; the unpacking order matches the label order.
(gender_cb_data, age_cb_data, religion_cb_data,
 ethnicity_cb_data, other_data, not_data) = (
    create_cb_sets(label)
    for label in ('gender', 'age', 'religion', 'ethnicity',
                  'other_cyberbullying', 'not_cyberbullying')
)

In [80]:
# Dimensions of the gender table as (rows, columns).
gender_cb_data.shape

(100, 2)

In [81]:
# Inner merge on Text keeps only the words present in both the gender and the
# religion tables; their count columns get pandas' default _x/_y suffixes.
intersecting_df = gender_cb_data.merge(religion_cb_data, how='inner', on=['Text'])
# Left merge adds the age counts and keeps every row of the intersection.
modified_idf = pd.merge(intersecting_df, age_cb_data, how='left', on=['Text'])

In [82]:
# Give the merged count columns the name of the category they came from.
modified_idf = modified_idf.rename(
    columns={'count_x': 'gender', 'count_y': 'religion', 'count': 'age'}
)

In [83]:
# Left-join the remaining tables on Text, renaming each incoming 'count'
# column to its category right after the merge that introduced it.
modified_idf = (
    modified_idf
    .merge(ethnicity_cb_data, how='left', on=['Text'])
    .rename(columns={'count': 'ethnicity'})
    .merge(other_data, how='left', on=['Text'])
    .rename(columns={'count': 'other_cb'})
    .merge(not_data, how='left', on=['Text'])
    .rename(columns={'count': 'not cyberbullying'})
)

In [84]:
# Replace NaN with zero in the left-merged columns: a word absent from a
# category's table has no count after the merge.
zero_fill_columns = ['age', 'ethnicity', 'other_cb', 'not cyberbullying']
modified_idf[zero_fill_columns] = modified_idf[zero_fill_columns].fillna(0)

In [85]:
# Count the missing values left in each column.
modified_idf.isna().sum()

Text                 0
gender               0
religion             0
age                  0
ethnicity            0
other_cb             0
not cyberbullying    0
dtype: int64

In [86]:
# Preview the first rows of the merged table.
modified_idf.head()

Unnamed: 0,Text,gender,religion,age,ethnicity,other_cb,not cyberbullying
0,rt,1254,180,0.0,1223.0,891.0,794.0
1,people,1009,1072,934.0,1180.0,429.0,273.0
2,call,990,183,0.0,275.0,88.0,0.0
3,like,933,1242,1856.0,1009.0,474.0,401.0
4,women,916,318,0.0,0.0,167.0,153.0


In [87]:
# Generating the Parallel Coordinates Plot; lines are coloured by the 'gender'
# column and every label maps a column name to itself.
fig = px.parallel_coordinates(
    modified_idf,
    color='gender',
    labels={name: name for name in (
        "gender", "religion", "age", "ethnicity", "other_cb", "not cyberbullying"
    )},
)

fig.show()