<a href="https://colab.research.google.com/github/Shreesh09/Detection-of-textual-CyberBullying-using-ML/blob/main/CyberBullying-Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing the libraries

In [32]:
# Import necessary libraries
import pandas as pd
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn import model_selection, svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Data Preprocessing

In [4]:
# Load data
dataset1 = pd.read_csv("twitter_racism_parsed_dataset.csv")
dataset2 = pd.read_csv("twitter_sexism_parsed_dataset.csv")
datasets = [dataset1, dataset2]

In [5]:
# Remove special characters, numbers, and URLs
def remove(data):
  data.loc[:, 'Text'] = data.loc[:, 'Text'].replace('[^a-zA-Z]', ' ', regex=True)
  return data

In [6]:
# Lowercase the text
def LC(data):
  data.loc[:, 'Text'] = data.loc[:, 'Text'].apply(lambda x: x.lower())
  return data

In [7]:
# Remove stopwords
def removeSW(data):
  stop_words = set(stopwords.words('english'))
  data.loc[:, 'Text'] = data.loc[:, 'Text'].apply(lambda x: ' '.join([word for word in word_tokenize(x) if word not in stop_words]))
  return data

In [8]:
# Perform stemming
def Stem(data):
  stemmer = SnowballStemmer('english')
  data.loc[:, 'Text'] = data.loc[:, 'Text'].apply(lambda x: ' '.join([stemmer.stem(word) for word in word_tokenize(x)]))
  return data

In [9]:
def Empty(df):
  #removing Empty values
  df.dropna(subset=['Text'], inplace=True)
  df.dropna(subset=['oh_label'], inplace=True)
  return df

In [10]:
def Duplicate(df):
  #Removing duplicates
  df = df.drop_duplicates(subset='Text', keep='first')
  return df

In [11]:
def clean(dataset):
  dataset = Empty(dataset)
  dataset = remove(dataset)
  dataset = LC(dataset)
  dataset = removeSW(dataset)
  dataset = Stem(dataset)
  dataset = Duplicate(dataset)

In [12]:
for dataset in datasets:
  dataset = clean(dataset)

In [13]:
print(datasets[1].iloc[2, 2])

rt eccl everyon underestim us still well underestim judg mkr


Splitting of Dataset

In [14]:
def Split(data):
  X = data.loc[:, "Text"].values
  Y = data.loc[:, "oh_label"].values
  X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y,test_size=0.20)
  splits = {'X_train': X_train, 'X_test': X_test, 'Y_train': Y_train, 'Y_test': Y_test}
  return splits


Encoding Categorical values

In [15]:
def fitVectorizer(X):
  vectorizer = TfidfVectorizer(lowercase=False, use_idf=True)
  vectorizer.fit(X)
  return vectorizer

def Encode(X, vectorizer):
  vect = vectorizer.transform(X)
  return vect

Training the Model


In [44]:
models = []
vectTests = []
Y_tests = []
vectorizers = []
# Iterate through the datasets
for dataset in datasets:
    # Split the dataset into training and test sets
    df = Split(dataset)
    #print(df.iloc[0,0])
    X_train = df['X_train']
    X_test = df['X_test']
    Y_train = df['Y_train']
    Y_test = df['Y_test']


    vectorizer = fitVectorizer(X_train)
    vectTrain = Encode(X_train, vectorizer)
    vectTest = Encode(X_test, vectorizer)
    
    #model = svm.SVC(kernel="linear")
    #model = LogisticRegression()
    model = RandomForestClassifier()

    #fitting training data into the algorithm 
    model.fit(vectTrain, Y_train)
    
    # Append the model to the list of models
    models.append(model)
    vectTests.append(vectTest)
    Y_tests.append(Y_test)
    vectorizers.append(vectorizer)

Predictions

In [45]:
# Iterate through the models and their corresponding test sets
for model, vectTest, Y_test in zip(models, vectTests, Y_tests):
    # Make predictions on the test set
    Y_pred = model.predict(vectTest)
    
    # Calculate the accuracy
    accuracy = accuracy_score(Y_test, Y_pred)
    
    # Print the accuracy
    print("Accuracy: ", accuracy)

Accuracy:  0.9257884972170687
Accuracy:  0.8968413978494624


Testing

In [46]:
while(1):
  str = input("Enter a sentence: ")
  dict = {'Text': [str], 'oh_label': [0]}
  df = pd.DataFrame(dict)
  df = remove(df)
  df = LC(df)
  df = removeSW(df)
  dataset = Stem(df)
  predictions = []

  for model, vectorizer in zip(models, vectorizers):
    vectDf = Encode(df['Text'], vectorizer)
    prediction = model.predict(vectDf)
    predictions.append(prediction)

  if(predictions[0] == 1 and predictions[1] == 1):
    print("Sexist and Racist\n")
  elif predictions[0] == 1:
    print("Racist\n")
  elif predictions[1] == 1:
    print("Sexist\n")
  else:
    print("Not Cyber Bullying\n")
  
  c = input("Do you wish to continue? y/n\n")
  if(c == 'n'):
    break;

Enter a sentence: @Alfonso_AraujoG @ardiem1m @MaxBlumenthal It has nothing to do with their grandpas. It is inherited with their religion.
Not Cyber Bullying

Do you wish to continue? y/n
y
Enter a sentence: @Ammaawah @jm111t You are following the religion of ignorance with an illiterate prophet and you want to talk about spelling? LOL.
Racist

Do you wish to continue? y/n
y
Enter a sentence: @ummsuhaym @logicalmind11 Quran 8.12 would be a good example of terrorism. http://t.co/vonYOAtpfk
Racist

Do you wish to continue? y/n
y
Enter a sentence: Wheres the sudden death cook off? how do they die? Can we get tickets? #Mkr
Not Cyber Bullying

Do you wish to continue? y/n
n
