<a href="https://colab.research.google.com/github/Shreesh09/Detection-of-textual-CyberBullying-using-ML/blob/main/CyberBullying-Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing the libraries

In [66]:
# Import necessary libraries
import pandas as pd
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn import model_selection, svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import numpy as np

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Data Preprocessing

In [67]:
# Load data
dataset1 = pd.read_csv("twitter_racism_parsed_dataset.csv")
dataset2 = pd.read_csv("twitter_sexism_parsed_dataset.csv")
datasets = [dataset1, dataset2]

In [68]:
# Remove special characters, numbers, and URLs
def remove(data):
  data.loc[:, 'Text'] = data.loc[:, 'Text'].replace('[^a-zA-Z]', ' ', regex=True)
  return data

In [69]:
# Lowercase the text
def LC(data):
  data.loc[:, 'Text'] = data.loc[:, 'Text'].apply(lambda x: x.lower())
  return data

In [70]:
# Remove stopwords
def removeSW(data):
  stop_words = set(stopwords.words('english'))
  data.loc[:, 'Text'] = data.loc[:, 'Text'].apply(lambda x: ' '.join([word for word in word_tokenize(x) if word not in stop_words]))
  return data

In [71]:
# Perform stemming
def Stem(data):
  stemmer = SnowballStemmer('english')
  data.loc[:, 'Text'] = data.loc[:, 'Text'].apply(lambda x: ' '.join([stemmer.stem(word) for word in word_tokenize(x)]))
  return data

In [72]:
def Empty(df):
  #removing Empty values
  df.dropna(subset=['Text'], inplace=True)
  df.dropna(subset=['oh_label'], inplace=True)
  return df

In [73]:
def Duplicate(df):
  #Removing duplicates
  df = df.drop_duplicates(subset='Text', keep='first')
  return df

In [74]:
for dataset in datasets:
  dataset = Empty(dataset)
  dataset = remove(dataset)
  dataset = LC(dataset)
  dataset = removeSW(dataset)
  dataset = Stem(dataset)
  dataset = Duplicate(dataset)

In [75]:
print(datasets[1].iloc[2, 2])

rt eccl everyon underestim us still well underestim judg mkr


Splitting of Dataset

In [76]:
def Split(data):
  X = data.loc[:, "Text"].values
  Y = data.loc[:, "oh_label"].values
  X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y,test_size=0.20)
  splits = {'X_train': X_train, 'X_test': X_test, 'Y_train': Y_train, 'Y_test': Y_test}
  return splits


Encoding Categorical values

In [77]:
def fitVectorizer(X):
  vectorizer = TfidfVectorizer(lowercase=False, use_idf=True)
  vectorizer.fit(X)
  return vectorizer

def Encode(X, vectorizer):
  vect = vectorizer.transform(X)
  return vect

Training the Model

In [78]:
models = []
vectTests = []
Y_tests = []
# Iterate through the datasets
for dataset in datasets:
    # Split the dataset into training and test sets
    df = Split(dataset)
    #print(df.iloc[0,0])
    X_train = df['X_train']
    X_test = df['X_test']
    Y_train = df['Y_train']
    Y_test = df['Y_test']


    vectorizer = fitVectorizer(X_train)
    vectTrain = Encode(X_train, vectorizer)
    vectTest = Encode(X_test, vectorizer)
    
    model = svm.SVC(kernel="linear")

    #fitting training data into the algorithm 
    model.fit(vectTrain, Y_train)
    
    # Append the model to the list of models
    models.append(model)
    vectTests.append(vectTest)
    Y_tests.append(Y_test)

Predictions

In [79]:
# Iterate through the models and their corresponding test sets
for model, vectTest, Y_test in zip(models, vectTests, Y_tests):
    # Make predictions on the test set
    Y_pred = model.predict(vectTest)
    
    # Calculate the accuracy
    accuracy = accuracy_score(Y_test, Y_pred)
    
    # Print the accuracy
    print("Accuracy: ", accuracy)

Accuracy:  0.9384044526901669
Accuracy:  0.8928091397849462
