LIBRARY IMPORT

In [None]:
!pip install emoji

In [None]:
import io
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, precision_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from emoji import emojize
import emoji
from wordcloud import WordCloud, STOPWORDS
import re,string, nltk
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.stem.snowball import SnowballStemmer
import warnings
# Silence every warning category for the remainder of the notebook.
# NOTE(review): this also hides deprecation/FutureWarning messages from the
# libraries used below — consider narrowing the filter.
warnings.filterwarnings(action="ignore")
# Download the WordNet corpus and the punkt tokenizer data
# (presumably required by WordNetLemmatizer and word_tokenize used later).
# NOTE(review): newer NLTK releases may need additional resources
# (e.g. 'omw-1.4', 'punkt_tab') — confirm against the installed version.
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

READ DATASET

In [None]:
# Load the raw tweets from the CSV file and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='/content/sample_data/cyberbullying_tweets.csv')
df.head(n=5)

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying


RENAME COLUMNS

In [None]:
# Give the two columns shorter names, then summarise dtypes and null counts.
df = df.rename(
    columns={
        "tweet_text": "text",
        "cyberbullying_type": "sentiment",
    }
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47692 entries, 0 to 47691
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       47692 non-null  object
 1   sentiment  47692 non-null  object
dtypes: object(2)
memory usage: 745.3+ KB


In [None]:
# Fraction of missing values per column (mean of the boolean NaN mask).
df.isna().mean()

text         0.0
sentiment    0.0
dtype: float64

In [None]:
# Number of tweets in each sentiment category.
df["sentiment"].value_counts()

religion               7998
age                    7992
gender                 7973
ethnicity              7961
not_cyberbullying      7945
other_cyberbullying    7823
Name: sentiment, dtype: int64

NLP PROCEDURE

DATA CLEANING

In [None]:
# function for cleaning tweets
def clean_tweet(df,field):
    """Clean the text column ``field`` of ``df`` in place and return ``df``.

    Steps, applied in order to every value of ``df[field]``:
      1. replace URLs (``http`` followed by non-whitespace) with a space,
      2. replace any remaining literal ``http`` with a space,
      3. replace ``@`` with the word ``at``,
      4. replace hashtags (``#`` + letters/digits/underscore) with a space,
      5. replace every character outside ``A-Za-z(),!?@'"_`` and newline
         with a space,
      6. lower-case the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column; it is modified in place.
    field : str
        Name of the column to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    substitutions = (
        (r"http\S+", " "),
        (r"http", " "),
        (r"@", "at"),
        ("#[A-Za-z0-9_]+", ' '),
        (r"[^A-Za-z(),!?@\'\"_\n]", " "),
    )
    cleaned = df[field]
    for pattern, replacement in substitutions:
        # regex=True is required: since pandas 2.0 Series.str.replace treats
        # the pattern as a literal string by default, which would silently
        # leave URLs, hashtags and symbols in the text.
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)
    df[field] = cleaned.str.lower()
    return df

In [None]:
# Clean the "text" column; clean_tweet modifies df in place, and the
# returned frame is shown as this cell's output.
clean_tweet(df, field="text")

Unnamed: 0,text,sentiment
0,"in other words , your food was crapilicious!",not_cyberbullying
1,why is so white?,not_cyberbullying
2,atxochitlsuckkks a classy whore? or more red v...,not_cyberbullying
3,"atjason_gio meh p thanks for the heads up, ...",not_cyberbullying
4,atrudhoeenglish this is an isis account preten...,not_cyberbullying
...,...,...
47687,"black ppl aren't expected to do anything, depe...",ethnicity
47688,turner did not withhold his disappointment tu...,ethnicity
47689,i swear to god this dumb nigger bitch i have...,ethnicity
47690,yea fuck you rt attherealexel if youre a nigg...,ethnicity


STEMMING, LEMMATIZATION AND STOP-WORD REMOVAL

In [None]:
# Stemmer and lemmatizer used to normalise word forms in the tweets.
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()
# Extend the stop-word set imported from wordcloud with Twitter shorthand and
# contraction fragments.
STOPWORDS |= {
    'rt', 'mkr', 'didn', 'bc', 'n', 'm',
    'im', 'll', 'y', 've', 'u', 'ur', 'don',
    'p', 't', 's', 'aren', 'kp', 'o', 'kat',
    'de', 're', 'amp', 'will',
}
# Placeholder; this name is rebound to the cleaned text column further down.
corpus = []
def preprocess_tweet(tweet):
    """Normalise a single tweet into a space-separated string of tokens.

    The tweet has its English contractions expanded, everything except ASCII
    letters replaced by spaces, each word stemmed with the module-level
    ``stemmer``, stop words (module-level ``STOPWORDS``) dropped, and the
    remaining words lemmatized with the module-level ``lemmatizer``.

    Parameters
    ----------
    tweet : str
        Raw (or already lightly cleaned) tweet text.

    Returns
    -------
    str
        The processed tokens joined by single spaces; may be empty.
    """
    # Order matters: the specific forms ("won't", "can't") must be expanded
    # before the generic "n't" / "'t" rules.
    contractions = (
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, expansion in contractions:
        tweet = re.sub(pattern, expansion, tweet)

    # Keep ASCII letters only. This already removes emoji and every other
    # non-ASCII character, so no separate emoji / non-ASCII pass is needed
    # (the former emoji.get_emoji_regexp() call was a no-op here and no longer
    # exists in emoji >= 2.0).
    tweet = re.sub('[^a-zA-Z]',' ',tweet)

    stemmed = " ".join([stemmer.stem(word) for word in tweet.split()])

    # Build the stop-word lookup once per tweet instead of once per word.
    # NOTE(review): filtering happens after stemming, so a stop word whose
    # stem differs from its listed form is not removed — confirm intended.
    stop_words = set(STOPWORDS)
    kept = [lemmatizer.lemmatize(word) for word in stemmed.split() if word not in stop_words]
    return ' '.join(kept)

# Run the per-tweet pre-processing and keep the result in a new column.
df["text_clean"] = df["text"].map(preprocess_tweet)

REMOVE DUPLICATE DATA

In [None]:
# Keep only the first row for each distinct processed text (modifies df in place).
df.drop_duplicates(subset="text_clean", keep="first", inplace=True)

REMOVING A CATEGORY (OTHER_CYBERBULLYING)

In [None]:
# Remove the other_cyberbullying category, as it does not contribute much.
keep_rows = df["sentiment"] != "other_cyberbullying"
df = df[keep_rows]
df["sentiment"].value_counts()

religion             7942
age                  7863
ethnicity            7830
not_cyberbullying    7777
gender               7653
Name: sentiment, dtype: int64

In [None]:
# Summarise the filtered frame: row count, columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39065 entries, 0 to 47691
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   text        39065 non-null  object
 1   sentiment   39065 non-null  object
 2   text_clean  39065 non-null  object
dtypes: object(3)
memory usage: 1.2+ MB


ADDING LABELS

In [None]:
# Integer class id assigned to each sentiment category name.
labels = {
    "not_cyberbullying": 0,
    "gender": 1,
    "ethnicity": 2,
    "religion": 3,
    "age": 4,
}
labels

{'age': 4, 'ethnicity': 2, 'gender': 1, 'not_cyberbullying': 0, 'religion': 3}

In [None]:
# Processed text, category names, and the integer id for each category name.
corpus = df['text_clean']
target_names = df['sentiment']
target_labels = [labels[name] for name in target_names]

# Collect the three aligned sequences into the modelling frame.
df_new = pd.DataFrame(
    {
        "text_clean": corpus,
        "sentiment Label": target_labels,
        "sentiment names": target_names,
    }
)
df_new

Unnamed: 0,text_clean,sentiment Label,sentiment names
0,word food crapilici,0,not_cyberbullying
1,whi white,0,not_cyberbullying
2,atxochitlsuckkk classi whore red velvet cupcak,0,not_cyberbullying
3,atjason gio meh thank head concern anoth angri...,0,not_cyberbullying
4,atrudhoeenglish isi account pretend kurdish ac...,0,not_cyberbullying
...,...,...,...
47687,black ppl expect anyth depend anyth yet free p...,2,ethnicity
47688,turner withhold disappoint turner call court a...,2,ethnicity
47689,swear god dumb nigger bitch got bleach hair re...,2,ethnicity
47690,yea fuck attherealexel nigger fuck unfollow fu...,2,ethnicity


In [None]:
# Display the processed text Series (the text_clean column of df).
corpus

0                                      word food crapilici
1                                                whi white
2           atxochitlsuckkk classi whore red velvet cupcak
3        atjason gio meh thank head concern anoth angri...
4        atrudhoeenglish isi account pretend kurdish ac...
                               ...                        
47687    black ppl expect anyth depend anyth yet free p...
47688    turner withhold disappoint turner call court a...
47689    swear god dumb nigger bitch got bleach hair re...
47690    yea fuck attherealexel nigger fuck unfollow fu...
47691    bro gotta chill atchillshrammi dog fuck dumb n...
Name: text_clean, Length: 39065, dtype: object

SPLITTING DATASET - TRAIN, TEST

In [None]:
# Hold out 25% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    np.array(df_new["text_clean"]),
    np.array(df_new["sentiment Label"]),
    test_size=0.25,
    random_state=0,
)
# Show how many samples landed in each split.
display(X_train.shape)
display(X_test.shape)

(29298,)

(9767,)

In [None]:
# Per-class sample counts in the training labels, as rows of [class id, count].
unique, counts = np.unique(y_train, return_counts=True)
np.column_stack((unique, counts))

array([[   0, 5833],
       [   1, 5760],
       [   2, 5872],
       [   3, 5977],
       [   4, 5856]])

TF-IDF

In [None]:
# TF-IDF vectorizer tokenised with NLTK's word_tokenize. The vocabulary and
# IDF weights are learned from the training split only and then reused to
# transform the test split.
tfidf = TfidfVectorizer(
    use_idf=True,
    tokenizer=word_tokenize,
    min_df=0.00002,
    max_df=0.70,
)
X_train_tf = tfidf.fit_transform(X_train.astype('U'))
X_test_tf = tfidf.transform(X_test.astype('U'))

print(f"TF_IDF Model: Train features shape:{X_train_tf.shape} and Test features shape:{X_test_tf.shape}")

TF_IDF Model: Train features shape:(29298, 31320) and Test features shape:(9767, 31320)


RANDOM FOREST MODEL

In [None]:
# Random forest with default hyper-parameters and a fixed seed, trained on
# the TF-IDF features of the training split.
rf = RandomForestClassifier(random_state=42)
rf.fit(X=X_train_tf, y=y_train)


RandomForestClassifier(random_state=42)

In [None]:
# Predict a class id for every row of the TF-IDF test matrix.
y_pred = rf.predict(X_test_tf)

In [None]:
# Display the predicted class ids for the test split.
y_pred

array([0, 0, 1, ..., 2, 2, 4])

ACCURACY

In [None]:
# Fraction of test tweets whose predicted class matches the true class.
# Arguments follow scikit-learn's documented (y_true, y_pred) order; accuracy
# is symmetric, so the value is the same as with the arguments swapped.
accuracy_score(y_test, y_pred)

0.9339612982492065

PREDICTING TEXT SENTIMENT

In [None]:
# Example tweet to classify; held in a list so that the processed version can
# be appended to it afterwards.
x = [' @samxx#$ hates muslims']

PRE-PROCESS GIVEN TEXT

In [None]:
# Apply the same regex clean-up that clean_tweet applies to the training text.
# Python's str.replace only substitutes literal substrings, so the patterns
# must go through re.sub to take effect.
sample_text = x[0]
for pattern, replacement in (
    (r"http\S+", " "),                   # URLs
    (r"http", " "),                      # leftover "http"
    (r"@", "at"),                        # mention marker
    ("#[A-Za-z0-9_]+", ' '),             # hashtags
    (r"[^A-Za-z(),!?@\'\"_\n]", " "),    # any other symbol or digit
):
    sample_text = re.sub(pattern, replacement, sample_text)
x[0] = sample_text.lower()


In [None]:
# Display the sample text after the clean-up step.
x[0]

' atsamxx#$ hates muslims'

In [None]:
# Run the sample through preprocess_tweet and store the result as the next
# list element (x[1] on the first run).
# NOTE: re-running this cell appends another element each time.
x.append(preprocess_tweet(x[0]))

In [None]:
# Display the fully pre-processed sample text.
x[1]

'atsamxx hate muslim'

In [None]:
# Vectorise the processed sample with the already-fitted TF-IDF vectorizer
# (transform only, so the training vocabulary is reused).
new = tfidf.transform([x[1]])

In [None]:
# Display the sparse TF-IDF row produced for the sample.
new

<1x31320 sparse matrix of type '<class 'numpy.float64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [None]:
# Predict the class id of the sample; the result is an array with one element.
predict = rf.predict(new)

In [None]:
# Print the predicted class id(s), then the category name whose id matches
# the first prediction.
l = predict
lc = l[0]
print('Category', *l)

for category_name, category_id in labels.items():
    if category_id == lc:
        print(category_name)

Category 3
religion


In [None]:
import pickle

# Persist the fitted vectorizer and model. The context managers make sure each
# file is flushed and closed once it has been written.
# NOTE(review): the vectorizer was built with tokenizer=word_tokenize, so the
# environment that unpickles it presumably needs NLTK importable — confirm.
with open("vectorizer.pickle", "wb") as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)
with open("model.pickle", "wb") as model_file:
    pickle.dump(rf, model_file)