-------------------------------------------------------------------------------------------------------------------------------
**Step 1:** Importing All Libraries<br>
In this step, we will be importing all the libraries necessary for this project.
--------------------------------------------------------------------------------------------------------------------------------------

In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None

import requests as rq
import numpy as np
import re

#!pip install wordninja
import wordninja

#!pip install spacy
import spacy
#spacy.cli.download("en_core_web_sm")

import nltk
from nltk.tokenize import word_tokenize
#nltk.download('punkt')

#!pip install contractions
import contractions

#!pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import sklearn
from sklearn.model_selection import train_test_split
from sklearn import svm, metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn import neighbors

-------------------------------------------------------------------------------------------------------------------------------
**Step 2:** Importing Dataset<br>
In this step, we will import the dataset that we will use in this project.
-------------------------------------------------------------------------------------------------------------------------------

In [2]:
#Locations of the two tweet datasets read by this notebook.
notBullyingCsv = r"C:\Users\USER\Documents\FinalYearProject\NotBullyingData.csv"
trainingCsv = r"C:\Users\USER\Documents\FinalYearProject\trainingData1.csv"

#`data` is the frame used for Part A of Step 5, `classifyData` the one used for Part B.
data = pd.read_csv(notBullyingCsv)
classifyData = pd.read_csv(trainingCsv)

data.head()

Unnamed: 0,tweets_text,type
0,Watching gaters confuse @ggautoblocker and @th...,notcyberbullying
1,"@Xanthe_Cat nice to know I made him mad, then 😜",notcyberbullying
2,Just hopped on the struggle bus. Gonna be a lo...,notcyberbullying
3,Aw there's nothing to cry about Lynn xx #MKR,notcyberbullying
4,They are literally going thru a laundry list o...,notcyberbullying


-------------------------------------------------------------------------------------------------------------------------------
**Step 3:** Data Preprocessing<br>
In this step, we will perform preprocessing and cleaning of data.
-------------------------------------------------------------------------------------------------------------------------------

In [3]:
#This function will return words from the link provided as a parameter
def returnWords(link, timeout=30):
    """Download a text resource and return its lines.

    Parameters
    ----------
    link : str
        URL handed to ``requests.get``.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests`` may block indefinitely.

    Returns
    -------
    list of str
        The response body decoded with ``bytes.decode()`` (UTF-8 by default)
        and split on line boundaries.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems, timeouts, or a 4xx/5xx status code.
    """
    response = rq.get(link, timeout=timeout)
    #Fail loudly on an HTTP error instead of returning the lines of an
    #error page as if they were words.
    response.raise_for_status()

    return response.content.decode().splitlines()

#This function will convert a list to a string
def string(data):
    """Return the items of ``data`` joined into one string, separated by single spaces.

    ``data`` must be an iterable of strings (anything ``str.join`` accepts).
    """
    return ' '.join(data)

In [4]:
#Lowercase the tweet text of both dataframes
for frame in (data, classifyData):
    frame['tweets_text'] = frame['tweets_text'].str.lower()

data.head()

Unnamed: 0,tweets_text,type
0,watching gaters confuse @ggautoblocker and @th...,notcyberbullying
1,"@xanthe_cat nice to know i made him mad, then 😜",notcyberbullying
2,just hopped on the struggle bus. gonna be a lo...,notcyberbullying
3,aw there's nothing to cry about lynn xx #mkr,notcyberbullying
4,they are literally going thru a laundry list o...,notcyberbullying


In [5]:
#Regular expressions strip retweet markers (rt), links, digits, the HTML entities
#&gt; &lt; &amp; and every remaining character that is neither a word character
#nor whitespace (this is what removes the @ of usernames, punctuation and emoji).
#`\brt\b` only matches a standalone "rt"; the previous `\brt` also ate the first
#two letters of any word that merely started with "rt".
noisePattern = re.compile(r'\brt\b|http\S+|\d+|&(?:gt;)+|&(?:lt;)+|&(?:amp;)+|[^\w\s]')

def cleanTweet(text):
    """Return ``text`` (coerced with ``str``) with noise removed and underscores turned into spaces."""
    cleaned = noisePattern.sub('', str(text))
    #Quotes are already gone at this point (they are non-word characters),
    #so the underscore is the only separator left to convert.
    return cleaned.replace('_', ' ')

#Assign the whole column at once: element-wise `df[col][i] = ...` is chained
#assignment, which depends on a 0..n-1 index and does not write through when
#pandas Copy-on-Write is active.
data['tweets_text'] = data['tweets_text'].map(cleanTweet)
classifyData['tweets_text'] = classifyData['tweets_text'].map(cleanTweet)

data.head()

Unnamed: 0,tweets_text,type
0,watching gaters confuse ggautoblocker and theb...,notcyberbullying
1,xanthe cat nice to know i made him mad then,notcyberbullying
2,just hopped on the struggle bus gonna be a lon...,notcyberbullying
3,aw theres nothing to cry about lynn xx mkr,notcyberbullying
4,they are literally going thru a laundry list o...,notcyberbullying


In [6]:
#Contractions are expanded in this section (ex. aren't -> are not, arent -> are not)
def expandContractions(text):
    """Run ``contractions.fix`` on every whitespace-separated word and re-join with single spaces."""
    return ' '.join(contractions.fix(word) for word in text.split())

#Whole-column assignment avoids chained assignment (`df[col][i] = ...`), which
#depends on a 0..n-1 index and does not write through under pandas Copy-on-Write.
data['tweets_text'] = data['tweets_text'].map(expandContractions)
classifyData['tweets_text'] = classifyData['tweets_text'].map(expandContractions)

data.head()

Unnamed: 0,tweets_text,type
0,watching gaters confuse ggautoblocker and theb...,notcyberbullying
1,xanthe cat nice to know i made him mad then,notcyberbullying
2,just hopped on the struggle bus going to be a ...,notcyberbullying
3,aw there is nothing to cry about lynn xx mkr,notcyberbullying
4,they are literally going thru a laundry list o...,notcyberbullying


In [7]:
#This dataframe maps slang abbreviations to their expansions (ex. hml -> hate my life)
slangWords = pd.read_csv(r"C:\Users\USER\Documents\FinalYearProject\SlangUpdated.txt")

#Build the lookup once instead of scanning the whole slang column for every word.
#setdefault keeps the first expansion when an abbreviation is listed more than once,
#so one word is never replaced by several expansions.
slangLookup = {}
for slang, expansion in zip(slangWords['slang'], slangWords['word']):
    slangLookup.setdefault(slang, expansion)

def expandSlang(text):
    """Replace known slang abbreviations in ``text`` with their expansion.

    Only words of 3 to 5 characters are looked up; every other word is kept as is.
    """
    expanded = []
    for word in text.split():
        if 2 < len(word) < 6:
            expanded.append(slangLookup.get(word, word))
        else:
            expanded.append(word)
    return ' '.join(expanded)

#Whole-column assignment avoids chained assignment (`df[col][i] = ...`), which
#depends on a 0..n-1 index and does not write through under pandas Copy-on-Write.
data['tweets_text'] = data['tweets_text'].map(expandSlang)
classifyData['tweets_text'] = classifyData['tweets_text'].map(expandSlang)

data.head()

Unnamed: 0,tweets_text,type
0,watching gaters confuse ggautoblocker and theb...,notcyberbullying
1,xanthe cat nice to know i made him mad then,notcyberbullying
2,just hopped on the struggle bus going to be a ...,notcyberbullying
3,aw there is nothing to cry about lynn xx mkr,notcyberbullying
4,they are literally going thru a laundry list o...,notcyberbullying


In [8]:
#This section expands all munched words (ex. weshouldleave -> we should leave)
def splitMunchedWords(text):
    """Split every word longer than 4 characters with ``wordninja.split``; keep shorter words unchanged."""
    pieces = []
    for word in text.split():
        if len(word) > 4:
            pieces.append(' '.join(wordninja.split(word)))
        else:
            pieces.append(word)
    return ' '.join(pieces)

#Whole-column assignment avoids chained assignment (`df[col][i] = ...`), which
#depends on a 0..n-1 index and does not write through under pandas Copy-on-Write.
data['tweets_text'] = data['tweets_text'].map(splitMunchedWords)
classifyData['tweets_text'] = classifyData['tweets_text'].map(splitMunchedWords)

data.head()

Unnamed: 0,tweets_text,type
0,watching gate rs confuse gg auto blocker and t...,notcyberbullying
1,x an the cat nice to know i made him mad then,notcyberbullying
2,just hopped on the struggle bus going to be a ...,notcyberbullying
3,aw there is nothing to cry about lynn xx mkr,notcyberbullying
4,they are literally going thru a laundry list o...,notcyberbullying


In [9]:
# #Scan entity names in the data to create tokens according to it (ex. 'donald trump' as a single token)
# entityRecognition = spacy.load("en_core_web_sm")

# entityScanned = []
# for row in data['tweets_text']:
#     entityScanned.append(entityRecognition(row))

# entity, singleEntity, tokens = [], [], []
# for row in entityScanned:
#     temp = []
#     for word in row:
#         if word.ent_iob_ == 'B': #B shows the start of the entity
#             singleEntity.append(str(word))
#         elif word.ent_iob_ == 'I': #I shows that it is inside an entity
#             singleEntity.append(str(word))
#         elif word.ent_iob_ == 'O': #O shows that it is outside an entity
#             entity.append(' '.join(singleEntity))
#             if (len(singleEntity)>0):
#                 temp.append(' '.join(singleEntity))
#             else:
#                 temp.append(str(word))
#             if (len(singleEntity) > 1):
#                 print(singleEntity)
#             singleEntity = []
#     tokens.append(temp)

#Tokenize the cleaned text of each dataframe with NLTK's word_tokenize.
tokens = [word_tokenize(text) for text in data['tweets_text']]
data['tokens'] = tokens

tokens = [word_tokenize(text) for text in classifyData['tweets_text']]
classifyData['tokens'] = tokens

data.head()

Unnamed: 0,tweets_text,type,tokens
0,watching gate rs confuse gg auto blocker and t...,notcyberbullying,"[watching, gate, rs, confuse, gg, auto, blocke..."
1,x an the cat nice to know i made him mad then,notcyberbullying,"[x, an, the, cat, nice, to, know, i, made, him..."
2,just hopped on the struggle bus going to be a ...,notcyberbullying,"[just, hopped, on, the, struggle, bus, going, ..."
3,aw there is nothing to cry about lynn xx mkr,notcyberbullying,"[aw, there, is, nothing, to, cry, about, lynn,..."
4,they are literally going thru a laundry list o...,notcyberbullying,"[they, are, literally, going, thru, a, laundry..."


-------------------------------------------------------------------------------------------------------------------------------
**Step 4:** Data Transformation<br>
In this step, we convert the text data into numeric features that can be fed to our models.
-------------------------------------------------------------------------------------------------------------------------------

In [10]:
#Counting number of words, offensive words and the severity of the offensive words.
offenseWords = pd.read_csv(r"C:\Users\USER\Documents\FinalYearProject\OffensiveWithSeverity.txt")
negationWords = pd.read_csv(r"C:\Users\USER\Documents\FinalYearProject\Negation.txt")

#Build the lookups once instead of rescanning both word lists for every token.
#setdefault keeps the first severity when a word is listed more than once, so a
#single token is never counted twice.
severityLookup = {}
for offensive, level in zip(offenseWords['word'], offenseWords['severity']):
    severityLookup.setdefault(offensive, level)
negationLookup = set(negationWords['word'])

#How many tokens before an offensive word are checked for a negation.
negationWindow = 2

totalWords, offensiveWords, severityWords = [], [], []

for row in data['tokens']:
    matchedWords, matchedSeverity = [], []
    for position, token in enumerate(row):
        if token not in severityLookup:
            continue
        #Slice rather than row[position-1] / row[position-2]: for the first two
        #tokens those negative indices wrap around to the END of the tweet, and
        #they raise IndexError on a one-token tweet.
        preceding = row[max(0, position - negationWindow):position]
        if any(word in negationLookup for word in preceding):
            continue
        matchedWords.append(token)
        matchedSeverity.append(severityLookup[token])
    totalWords.append(len(row))
    offensiveWords.append(matchedWords)
    severityWords.append(matchedSeverity)

data['total words'] = totalWords
data['offensive words'] = offensiveWords
data['severity words'] = severityWords
data.head()

Unnamed: 0,tweets_text,type,tokens,total words,offensive words,severity words
0,watching gate rs confuse gg auto blocker and t...,notcyberbullying,"[watching, gate, rs, confuse, gg, auto, blocke...",21,[],[]
1,x an the cat nice to know i made him mad then,notcyberbullying,"[x, an, the, cat, nice, to, know, i, made, him...",12,[mad],[2]
2,just hopped on the struggle bus going to be a ...,notcyberbullying,"[just, hopped, on, the, struggle, bus, going, ...",14,[],[]
3,aw there is nothing to cry about lynn xx mkr,notcyberbullying,"[aw, there, is, nothing, to, cry, about, lynn,...",10,[],[]
4,they are literally going thru a laundry list o...,notcyberbullying,"[they, are, literally, going, thru, a, laundry...",15,[],[]


In [11]:
#Features for the model to predict whether it is offensive or not.
#Density of offensive words in a sentence.
density = []

for total, offensive in zip(data['total words'], data['offensive words']):
    #A tweet with no tokens has no offensive words either; report a density
    #of 0 instead of dividing by zero.
    density.append(len(offensive) / total if total else 0.0)

data['density'] = density
data.head()

Unnamed: 0,tweets_text,type,tokens,total words,offensive words,severity words,density
0,watching gate rs confuse gg auto blocker and t...,notcyberbullying,"[watching, gate, rs, confuse, gg, auto, blocke...",21,[],[],0.0
1,x an the cat nice to know i made him mad then,notcyberbullying,"[x, an, the, cat, nice, to, know, i, made, him...",12,[mad],[2],0.083333
2,just hopped on the struggle bus going to be a ...,notcyberbullying,"[just, hopped, on, the, struggle, bus, going, ...",14,[],[],0.0
3,aw there is nothing to cry about lynn xx mkr,notcyberbullying,"[aw, there, is, nothing, to, cry, about, lynn,...",10,[],[],0.0
4,they are literally going thru a laundry list o...,notcyberbullying,"[they, are, literally, going, thru, a, laundry...",15,[],[],0.0


In [12]:
#Sentiment analysis to determine polarity of data.
#Compound range from -1 to +1 depending on whether it is negative or positive.

#Create the analyzer once; constructing it inside the loop repeats the same
#setup work for every tweet.
sentimentAnalyzer = SentimentIntensityAnalyzer()

compound = []
for row in data['tweets_text']:
    polarity = sentimentAnalyzer.polarity_scores(row)
    compound.append(polarity["compound"])

data['sentiment analysis'] = compound
data.head()

Unnamed: 0,tweets_text,type,tokens,total words,offensive words,severity words,density,sentiment analysis
0,watching gate rs confuse gg auto blocker and t...,notcyberbullying,"[watching, gate, rs, confuse, gg, auto, blocke...",21,[],[],0.0,-0.3818
1,x an the cat nice to know i made him mad then,notcyberbullying,"[x, an, the, cat, nice, to, know, i, made, him...",12,[mad],[2],0.083333,-0.1027
2,just hopped on the struggle bus going to be a ...,notcyberbullying,"[just, hopped, on, the, struggle, bus, going, ...",14,[],[],0.0,-0.3182
3,aw there is nothing to cry about lynn xx mkr,notcyberbullying,"[aw, there, is, nothing, to, cry, about, lynn,...",10,[],[],0.0,0.3724
4,they are literally going thru a laundry list o...,notcyberbullying,"[they, are, literally, going, thru, a, laundry...",15,[],[],0.0,0.0


In [13]:
#Weighted mean of severity words

severity, weights = [], [1, 2, 3, 4, 5]
for levels in data['severity words']:
    #Number of this tweet's offensive words found at each severity level;
    #values that match none of the weights are ignored.
    tally = [sum(1 for level in levels if level == weight) for weight in weights]
    matched = sum(tally)

    if matched:
        weighted = sum(hits * weight for hits, weight in zip(tally, weights))
        severity.append(weighted / matched)
    else:
        #No offensive words: severity is 0.
        severity.append(0)

data['severity'] = severity
data.head()

Unnamed: 0,tweets_text,type,tokens,total words,offensive words,severity words,density,sentiment analysis,severity
0,watching gate rs confuse gg auto blocker and t...,notcyberbullying,"[watching, gate, rs, confuse, gg, auto, blocke...",21,[],[],0.0,-0.3818,0.0
1,x an the cat nice to know i made him mad then,notcyberbullying,"[x, an, the, cat, nice, to, know, i, made, him...",12,[mad],[2],0.083333,-0.1027,2.0
2,just hopped on the struggle bus going to be a ...,notcyberbullying,"[just, hopped, on, the, struggle, bus, going, ...",14,[],[],0.0,-0.3182,0.0
3,aw there is nothing to cry about lynn xx mkr,notcyberbullying,"[aw, there, is, nothing, to, cry, about, lynn,...",10,[],[],0.0,0.3724,0.0
4,they are literally going thru a laundry list o...,notcyberbullying,"[they, are, literally, going, thru, a, laundry...",15,[],[],0.0,0.0,0.0


-------------------------------------------------------------------------------------------------------------------------------
**Step 5:** Machine Learning<br>
In this step, we are training and then testing our model.
-------------------------------------------------------------------------------------------------------------------------------

-------------------------------------------------------------------------------------------------------------------------------
**Part A:** 
In this part, we will train our model to predict whether a tweet is offensive or not.
-------------------------------------------------------------------------------------------------------------------------------

In [14]:
#Model 1 inputs are the three engineered features; the target is the `type` label.
featureColumnsM1 = ['density', 'severity', 'sentiment analysis']
predictionDataM1 = data[featureColumnsM1].copy()
targetM1 = data['type']

In [15]:
#!pip install lazypredict
import lazypredict
from lazypredict.Supervised import LazyClassifier

#Hold out 15% of the rows for testing, stratified on the label; the fixed
#random_state keeps the split repeatable.
trainData, testData, trainTarget, testTarget = train_test_split(
    predictionDataM1,
    targetM1,
    test_size=0.15,
    random_state=30,
    stratify=targetM1,
)

#Fit LazyClassifier on the split and print the score table it returns.
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(trainData, testData, trainTarget, testTarget)
print(models)

100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 30.04it/s]

                               Accuracy  Balanced Accuracy ROC AUC  F1 Score  \
Model                                                                          
SVC                                0.88               0.88    None      0.88   
NuSVC                              0.88               0.88    None      0.88   
AdaBoostClassifier                 0.84               0.84    None      0.84   
CalibratedClassifierCV             0.84               0.84    None      0.84   
LogisticRegression                 0.84               0.84    None      0.84   
LinearDiscriminantAnalysis         0.83               0.83    None      0.83   
RidgeClassifierCV                  0.83               0.83    None      0.83   
RidgeClassifier                    0.83               0.83    None      0.83   
NearestCentroid                    0.83               0.83    None      0.83   
LinearSVC                          0.83               0.83    None      0.83   
PassiveAggressiveClassifier        0.81 




-------------------------------------------------------------------------------------------------------------------------------
**Part B:** 
In this part, we will train our model to predict whether the offensive tweet is related to religion, age, gender or race/ethnicity.
-------------------------------------------------------------------------------------------------------------------------------

In [16]:
ethnicityAndRaceGlossary = pd.read_csv(r"C:\Users\USER\Documents\FinalYearProject\EthnicityAndRaceGlossary.txt")

#Set membership replaces the scan of the whole glossary column for every token.
ethnicityAndRaceTerms = set(ethnicityAndRaceGlossary['word'])

#Per tweet: number of tokens found in the glossary (a repeated token counts each time).
isEthnicityAndRace = [
    sum(1 for token in row if token in ethnicityAndRaceTerms)
    for row in classifyData['tokens']
]

classifyData['ethnicity and race'] = isEthnicityAndRace
classifyData.head()

Unnamed: 0,tweets_text,type,tokens,ethnicity and race
0,del en as dictator paul makes gay rape jokes a...,gender,"[del, en, as, dictator, paul, makes, gay, rape...",0
1,arabs and muslims do not supporting you do not...,religion,"[arabs, and, muslims, do, not, supporting, you...",0
2,is not this a trope in like every teen movie e...,gender,"[is, not, this, a, trope, in, like, every, tee...",0
3,dumb goth fuck you nigger blocked,ethnicityandrace,"[dumb, goth, fuck, you, nigger, blocked]",1
4,nigga i am laying down try na sleep get to bed...,ethnicityandrace,"[nigga, i, am, laying, down, try, na, sleep, g...",3


In [17]:
ageDataGlossary = pd.read_csv(r"C:\Users\USER\Documents\FinalYearProject\AgeGlossary.txt")

#Set membership replaces the scan of the whole glossary column for every token.
ageTerms = set(ageDataGlossary['word'])

#Per tweet: number of tokens found in the glossary (a repeated token counts each time).
isAge = [
    sum(1 for token in row if token in ageTerms)
    for row in classifyData['tokens']
]

classifyData['age'] = isAge
classifyData.head()

Unnamed: 0,tweets_text,type,tokens,ethnicity and race,age
0,del en as dictator paul makes gay rape jokes a...,gender,"[del, en, as, dictator, paul, makes, gay, rape...",0,0
1,arabs and muslims do not supporting you do not...,religion,"[arabs, and, muslims, do, not, supporting, you...",0,1
2,is not this a trope in like every teen movie e...,gender,"[is, not, this, a, trope, in, like, every, tee...",0,3
3,dumb goth fuck you nigger blocked,ethnicityandrace,"[dumb, goth, fuck, you, nigger, blocked]",1,1
4,nigga i am laying down try na sleep get to bed...,ethnicityandrace,"[nigga, i, am, laying, down, try, na, sleep, g...",3,3


In [18]:
genderDataGlossary = pd.read_csv(r"C:\Users\USER\Documents\FinalYearProject\GenderGlossary.txt")

#Set membership replaces the scan of the whole glossary column for every token.
genderTerms = set(genderDataGlossary['word'])

#Per tweet: number of tokens found in the glossary (a repeated token counts each time).
isGender = [
    sum(1 for token in row if token in genderTerms)
    for row in classifyData['tokens']
]

classifyData['gender'] = isGender
classifyData.head()

Unnamed: 0,tweets_text,type,tokens,ethnicity and race,age,gender
0,del en as dictator paul makes gay rape jokes a...,gender,"[del, en, as, dictator, paul, makes, gay, rape...",0,0,4
1,arabs and muslims do not supporting you do not...,religion,"[arabs, and, muslims, do, not, supporting, you...",0,1,0
2,is not this a trope in like every teen movie e...,gender,"[is, not, this, a, trope, in, like, every, tee...",0,3,3
3,dumb goth fuck you nigger blocked,ethnicityandrace,"[dumb, goth, fuck, you, nigger, blocked]",1,1,1
4,nigga i am laying down try na sleep get to bed...,ethnicityandrace,"[nigga, i, am, laying, down, try, na, sleep, g...",3,3,2


In [19]:
religiousDataGlossary = pd.read_csv(r"C:\Users\USER\Documents\FinalYearProject\ReligionGlossary.txt")

#Set membership replaces the scan of the whole glossary column for every token.
religiousTerms = set(religiousDataGlossary['word'])

#Per tweet: number of tokens found in the glossary (a repeated token counts each time).
isReligious = [
    sum(1 for token in row if token in religiousTerms)
    for row in classifyData['tokens']
]

classifyData['religion'] = isReligious
classifyData.head()

Unnamed: 0,tweets_text,type,tokens,ethnicity and race,age,gender,religion
0,del en as dictator paul makes gay rape jokes a...,gender,"[del, en, as, dictator, paul, makes, gay, rape...",0,0,4,0
1,arabs and muslims do not supporting you do not...,religion,"[arabs, and, muslims, do, not, supporting, you...",0,1,0,1
2,is not this a trope in like every teen movie e...,gender,"[is, not, this, a, trope, in, like, every, tee...",0,3,3,0
3,dumb goth fuck you nigger blocked,ethnicityandrace,"[dumb, goth, fuck, you, nigger, blocked]",1,1,1,0
4,nigga i am laying down try na sleep get to bed...,ethnicityandrace,"[nigga, i, am, laying, down, try, na, sleep, g...",3,3,2,1


In [20]:
#Model 2 inputs are the four glossary-hit counts; the target is the `type` label.
featureColumnsM2 = ['age', 'gender', 'religion', 'ethnicity and race']
predictionDataM2 = classifyData[featureColumnsM2].copy()
targetM2 = classifyData['type']

#!pip install lazypredict
import lazypredict
from lazypredict.Supervised import LazyClassifier

#Hold out 15% of the rows for testing, stratified on the label; the fixed
#random_state keeps the split repeatable.
trainData, testData, trainTarget, testTarget = train_test_split(
    predictionDataM2,
    targetM2,
    test_size=0.15,
    random_state=111,
    stratify=targetM2,
)

#Fit LazyClassifier on the split and print the score table it returns.
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(trainData, testData, trainTarget, testTarget)
print(models)


100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [00:01<00:00, 21.60it/s]

                               Accuracy  Balanced Accuracy ROC AUC  F1 Score  \
Model                                                                          
ExtraTreeClassifier                0.95               0.95    None      0.95   
QuadraticDiscriminantAnalysis      0.94               0.94    None      0.94   
LogisticRegression                 0.94               0.94    None      0.94   
SVC                                0.93               0.93    None      0.93   
LinearSVC                          0.93               0.93    None      0.93   
LGBMClassifier                     0.93               0.93    None      0.93   
CalibratedClassifierCV             0.93               0.93    None      0.93   
LabelPropagation                   0.93               0.93    None      0.93   
LabelSpreading                     0.93               0.93    None      0.93   
NuSVC                              0.93               0.93    None      0.93   
DecisionTreeClassifier             0.93 


