In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

In [2]:
data=pd.read_csv('Files/cyberbullying_tweets.csv')
data

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying
...,...,...
47687,"Black ppl aren't expected to do anything, depe...",ethnicity
47688,Turner did not withhold his disappointment. Tu...,ethnicity
47689,I swear to God. This dumb nigger bitch. I have...,ethnicity
47690,Yea fuck you RT @therealexel: IF YOURE A NIGGE...,ethnicity


In [23]:
data.cyberbullying_type.unique()

array(['not_cyberbullying', 'gender', 'religion', 'other_cyberbullying',
       'age', 'ethnicity'], dtype=object)

In [3]:
#removing stop words from text
def clean_stopwords(text):
    stop_words = set(stopwords.words('english'))
    return " ".join([word for word in str(text).split() if word not in stop_words])

# cleaning and removing punctuations
def clean_puctuations(text):
    english_puctuations = string.punctuation
    translator =str.maketrans('','', english_puctuations)
    return text.translate(translator)

# cleaning and removing repeating characters
def clean_repeating_characters(text):
    return re.sub(r'(.)1+', r'1', text)

# cleaning and removing URLs
def clean_url(text):
    return re.sub(r"((www.[^s]+)|(http\S+))","",text)

# cleaning and removing numeric data

def clean_Numeric(text):
    return re.sub('[0-9]+','',text)

#Stemming
def text_stemming(text):
    st =nltk.PorterStemmer()
    text=[st.stem(word) for word in text]
    return text

# lemmatization
def text_lemmatization(text):
    lm=nltk.WordNetLemmatizer()
    text=[lm.lemmatize(word) for word in text]
    return text

In [4]:
tweets, type= list(data['tweet_text']), list(data['cyberbullying_type'])
labelencoder = LabelEncoder()
data['cyberbullying_type_encoded']=labelencoder.fit_transform(data['cyberbullying_type'])

In [5]:
# converting tweet text to lower case
data['tweet_text'] =data['tweet_text'].str.lower()
data['tweet_text']=data['tweet_text'].apply(lambda text: clean_stopwords(text))
data['tweet_text'] = data['tweet_text'].apply(lambda x : clean_puctuations(x))
data['tweet_text'] = data['tweet_text'].apply(lambda x :clean_repeating_characters(x))
data['tweet_text'] =data['tweet_text'].apply(lambda x :clean_url(x))
data['tweet_text']=data['tweet_text'].apply(lambda x: clean_Numeric(x))
tokenizer = RegexpTokenizer('\w+')
data['tweet_text']=data['tweet_text'].apply(tokenizer.tokenize)
data['tweet_text']=data['tweet_text'].apply(lambda x: text_stemming(x))
data['tweet_text']=data['tweet_text'].apply(lambda x: text_lemmatization(x))
data['tweet_text'] = data['tweet_text'].apply(lambda x : " ".join(x))

In [6]:
X= data['tweet_text']
y=data['cyberbullying_type_encoded']
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.3, random_state= 41)

In [7]:
vectoriser = TfidfVectorizer(ngram_range=(1,2), max_features= 500000)
vectoriser.fit(X_train)
print("No. of feature words: ",len(vectoriser.get_feature_names_out()))

No. of feature words:  308359


In [8]:
X_train=vectoriser.transform(X_train)
X_test=vectoriser.transform(X_test)

context={"data":"Tweet Data Proccessed With NLTK And No. of feature words Found In Tweets: "+str(len(vectoriser.get_feature_names_out()))}

In [13]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
rfacc = accuracy_score(y_test, y_pred)
print("RandomForest Accurary: ",rfacc)

RandomForest Accurary:  0.8221973720995247


In [14]:
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
dtacc = accuracy_score(y_test, y_pred)
print("DecisionTree Accurary: ",dtacc)

DecisionTree Accurary:  0.7917249091417389


In [21]:
svc = svm.SVC(kernel='linear', C=1, random_state=42)
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
svcacc = accuracy_score(y_test, y_pred)
print("SVM Accurary: ",svcacc)

SVM Accurary:  0.8291864691081913


In [22]:
pickle.dump(vectoriser, open('Files/CD_vectoriser.pkl', 'wb'))
pickle.dump(svc, open('Files/CD_svm.pkl', 'wb'))
pickle.dump(labelencoder, open('Files/CD_labelencoder.pkl', 'wb'))