In [14]:
#Importing dependencies that are required for building the model and running the code 
import pandas as pd
from sklearn import tree
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier,LogisticRegression 
from sklearn.ensemble import BaggingClassifier,RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import matplotlib.pyplot as plt
import io
import random;
import tensorflow as tf

In [2]:
#Load the two labelled CSV files (paths are relative to the notebook's working directory)
sarcastic_csv_path = r'./dataset/sarcastic_data.csv'
non_sarcastic_csv_path = r'./dataset/non-sarcastic_data.csv'

#Each file becomes its own DataFrame; they are merged in a later cell
sarcasm_bullying_data = pd.read_csv(sarcastic_csv_path)
non_sarcasm_bullying_data = pd.read_csv(non_sarcastic_csv_path)

In [3]:
#Convert each loaded DataFrame into a list of rows (one inner list per CSV row)
complete_sarcastic_list = sarcasm_bullying_data.values.tolist()
complete_non_sarcastic_list = non_sarcasm_bullying_data.values.tolist()

#Build the combined data set as a NEW list, so the two per-class lists above
#keep meaning what their names say instead of one of them silently holding everything
complete_set = complete_sarcastic_list + complete_non_sarcastic_list

#Shuffle once with a dedicated, seeded generator: a single shuffle is already a
#uniform permutation (repeating it adds nothing), and the fixed seed makes the
#train/test split identical on every Restart & Run All without touching the
#global `random` state
random.Random(42).shuffle(complete_set)

#Separate sentences and labels into two parallel lists
#(column 0 of each row is used as the sentence, column 1 as its label)
complete_comments = [row[0] for row in complete_set]
labels = [row[1] for row in complete_set]



In [4]:
#Number of (already shuffled) rows used for training; every remaining row is used for testing
#NOTE(review): the original comment claimed a 60%/40% split, but the accuracies recorded
#further down this notebook are all multiples of 1/93, which suggests ~93 test rows
#(i.e. roughly a 90/10 split) -- confirm which ratio is actually intended
training_size = 793

#Fail early with a clear message instead of silently producing an empty train or test set
assert 0 < training_size < len(complete_comments), (
    "training_size must be smaller than the number of available samples"
)
assert len(complete_comments) == len(labels), "sentences and labels must stay aligned"

#Separating the complete set of sentences of data into training set
training_sentence = complete_comments[:training_size]

#Separating the complete set of sentences of data into testing set
testing_sentence = complete_comments[training_size:]

#Separating the complete set of labels of data into training set
training_labels = labels[:training_size]

#Separating the complete set of labels of data into testing set
testing_labels = labels[training_size:]

In [5]:
#Use tfidf to extract all the unique words in the sentences (each unique word is treated as a feature);
#English stop words are dropped by the vectorizer
vectorizer = TfidfVectorizer(stop_words='english')

#Learn the vocabulary from the TRAINING sentences only and vectorise them
vectors_train = vectorizer.fit_transform(training_sentence)

#get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
#use its replacement when available and fall back on older installations
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

#Create a 2D matrix where rows denote training sentences, and the columns denote unique features
#(toarray() hands pandas a NumPy array directly, avoiding a huge intermediate list of Python floats)
training_samples = pd.DataFrame(vectors_train.toarray(), columns=feature_names)

#Vectorise the testing sentences with the vocabulary learnt above (transform only, no re-fitting)
vectors_test = vectorizer.transform(testing_sentence)

#Create a 2D matrix where rows denote testing sentences, and the columns denote the same features
testing_samples = pd.DataFrame(vectors_test.toarray(), columns=feature_names)

### Naive Bayes

In [6]:
#Fit a Multinomial Naive Bayes classifier on the TF-IDF training matrix
nb_model = MultinomialNB()
nb_model.fit(training_samples, training_labels)

#Predict the held-out sentences and report the share of correct predictions as a percentage
predicted_nb = nb_model.predict(testing_samples)
nb_correct = predicted_nb == testing_labels
print("The accuracy of the predicted data is", np.mean(nb_correct)*100)

The accuracy of the predicted data is 88.17204301075269


### Linear SVM

In [7]:
#Use a linear Support Vector Machine (hinge loss, L2 penalty) trained with stochastic gradient descent
#NOTE(review): max_iter=5 is a very small budget and may stop before convergence -- confirm it is
#intentional; it is left unchanged here so the configuration matches the recorded result
svm_model = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=5, random_state=42)

#fit() trains the estimator in place, so no throwaway variable is needed for its return value
svm_model.fit(training_samples, training_labels)

#Predict the held-out sentences and report the share of correct predictions as a percentage
predicted_svm = svm_model.predict(testing_samples)
print("The accuracy of the predicted data is",np.mean(predicted_svm == testing_labels)*100)

The accuracy of the predicted data is 91.39784946236558




### Logistic Regression

In [8]:
#Train a Logistic Regression classifier; C=1e5 is passed as the inverse regularisation strength
logistic_reg_model = LogisticRegression(C=1e5)
logistic_reg = logistic_reg_model.fit(training_samples, training_labels)

#Predict the held-out sentences and report the share of correct predictions as a percentage
predicted_log = logistic_reg_model.predict(testing_samples)
log_correct = predicted_log == testing_labels
print("The accuracy of the predicted data is", np.mean(log_correct)*100)

The accuracy of the predicted data is 88.17204301075269


### Decision Trees

In [9]:
#Define the model; random_state is fixed (same seed as the SVM cell) so the fitted tree,
#and therefore the reported accuracy, is the same on every run
dec_trees_model = tree.DecisionTreeClassifier(random_state=42)

#Use Decision Trees to train the model
dec_trees_model.fit(training_samples, training_labels)

#Report the accuracy on the held-out set as a percentage (score() returns mean accuracy)
print("The accuracy of the predicted data is",dec_trees_model.score(testing_samples,testing_labels)*100)

The accuracy of the predicted data is 86.02150537634408


### Random Forest

In [10]:
#Define the model: a forest of 100 trees; random_state is fixed (same seed as the SVM cell)
#so the bootstrap samples and feature draws, and therefore the reported accuracy, are reproducible
random_fr_model=RandomForestClassifier(n_estimators=100, random_state=42)

#Use Random Forest to train the model
random_fr_model.fit(training_samples, training_labels)

#Report the accuracy on the held-out set as a percentage (score() returns mean accuracy)
print("The accuracy of the predicted data is",random_fr_model.score(testing_samples,testing_labels)*100)

The accuracy of the predicted data is 89.24731182795699


### Ensemble Method

In [11]:
#Use an ensemble (bagging of decision trees) to train the model.
#The BaggingClassifier itself is seeded so that its bootstrap resampling is reproducible;
#seeding only the base tree does not fix which rows each tree is trained on
ensemble_model = BaggingClassifier(tree.DecisionTreeClassifier(random_state=1), random_state=42)
ensemble_model.fit(training_samples, training_labels)

#Report the accuracy on the held-out set as a percentage (score() returns mean accuracy)
print("The accuracy of the predicted data is",ensemble_model.score(testing_samples,testing_labels)*100)

The accuracy of the predicted data is 87.09677419354838


### Custom Input

In [18]:
#Custom Input
#Read one sentence from the user and vectorise it with the vocabulary already learnt by `vectorizer`
new_input = [input("Enter a Sentence:")]

#A dedicated name is used so the test-set matrix `vectors_test` from the feature-extraction
#cell is not overwritten by this single-sentence matrix
vectors_new_input = vectorizer.transform(new_input)

#get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
#use its replacement when available and fall back on older installations
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

#Create a one-row matrix whose columns are the same features the models were trained on
new_input_sample = pd.DataFrame(vectors_new_input.toarray(), columns=feature_names)

#For Baseline Models: (display name, fitted model) pairs, in the order the results are printed.
#The display names are kept exactly as they were printed before.
#NOTE(review): svm_model is trained earlier in the notebook but is not queried here -- confirm whether that is intentional
baseline_models = [
    ("Logistric Regression", logistic_reg_model),
    ("Naive Bayes", nb_model),
    ("Ensemble method", ensemble_model),
    ("Random Forest", random_fr_model),
    ("Decision Tree", dec_trees_model),
]

#Ask every model for its prediction on the single input row and print the predicted label
for model_name, model in baseline_models:
    new_output = model.predict(new_input_sample)
    print("The prediction of " + model_name + " is:",new_output[0])



Enter a Sentence:you are a bitch
The prediction of Logistric Regression is: 0
The prediction of Naive Bayes is: 0
The prediction of Ensemble method is: 0
The prediction of Random Forest is: 0
The prediction of Decision Tree is: 0
