In [6]:
import pandas as pd
import numpy as np

from sklearn.naive_bayes import MultinomialNB  
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

<h3>Read the dataset</h3>

In [7]:
# Location of the cleaned comments CSV.
# NOTE(review): this is an absolute, machine-specific path; it must be adjusted
# before the notebook can run anywhere else.
comments_csv = "/Users/mazichang/Desktop/Cyberviolence/Project/CommentsCleaned.csv"
comments = pd.read_csv(comments_csv)
# Bare last expression so the notebook renders the loaded frame.
comments

Unnamed: 0,content,label
0,future female average yr old oh um women knock...,1
1,possible tatar relation seems minor connection...,0
2,somehow doubt statistic made head,0
3,guys need stop gay martins argent godaddy ads ...,0
4,whatever need get hold everyone plan something...,0
...,...,...
19995,plenty people taken stance favor common sense ...,0
19996,bitches mad mane nigga getting snatched whore ...,0
19997,done would even name known,0
19998,chicago killing field decades inept corrupt li...,0


<h3>Split dataset into train, validation, test</h3>

In [8]:
# Two-stage split giving an 8:1:1 train/validation/test ratio:
#   1) hold out 10% of all rows as the test set;
#   2) hold out ~1/9 of the remainder (10% of the total) as validation.
# Both stages use the same fixed seed so the split is reproducible.
train_val, test = train_test_split(comments, test_size=0.1, random_state=31)
train, val = train_test_split(train_val, test_size=0.11111111, random_state=31)

# Missing text is replaced with the placeholder string '0' so that the
# vectorizer always receives a string; labels are taken as-is.
training_features_range, training_labels_range = train["content"].fillna('0'), train["label"]
val_features_range, val_labels_range = val["content"].fillna('0'), val["label"]
testing_features_range, testing_labels_range = test["content"].fillna('0'), test["label"]

<h3>Train and fine-tune the Naive Bayes model</h3>

In [9]:
# Sweep several n-gram ranges and score each one on the validation split.
# Observed outcome of this sweep: unigrams, (1, 1), give the best validation F1.
n_grams = [(1,1),(1,2),(2,2),(1,3),(2,3),(3,3)]

for n_gram in n_grams:
    # The vocabulary is learned from the training split only; the validation
    # split is merely transformed with that fitted vocabulary.
    count_vectorizer = CountVectorizer(lowercase=True, ngram_range=n_gram)
    training_features = count_vectorizer.fit_transform(training_features_range)
    val_features = count_vectorizer.transform(val_features_range)

    # fit() returns the estimator itself, so construction and fitting are chained.
    classifier = MultinomialNB().fit(training_features, training_labels_range)
    predicted_labels = classifier.predict(val_features)

    # F1 on the validation labels is the selection metric.
    f1 = f1_score(val_labels_range, predicted_labels)
    print(n_gram, f1)

(1, 1) 0.5342281879194631
(1, 2) 0.33268101761252444
(2, 2) 0.32936507936507936
(1, 3) 0.2652631578947368
(2, 3) 0.32612966601178783
(3, 3) 0.10144927536231885


<h3>Predict using the Naive Bayes model</h3>

In [10]:
# Final model: unigram features, the best setting from the validation sweep.
count_vectorizer = CountVectorizer(lowercase=True, ngram_range=(1,1))
training_features = count_vectorizer.fit_transform(training_features_range)

# Create a fresh classifier in this cell instead of reusing whichever instance
# the tuning loop happened to bind last. This keeps the cell self-contained:
# it no longer fails with NameError if the tuning cell is skipped, and it
# cannot silently inherit settings from the sweep.
classifier = MultinomialNB()
classifier.fit(training_features, training_labels_range)

# The test split is transformed with the vocabulary fitted on the training
# split above, then scored once.
testing_features = count_vectorizer.transform(testing_features_range)
predicted_labels = classifier.predict(testing_features)

# Report held-out test metrics.
accuracy = accuracy_score(testing_labels_range, predicted_labels)
recall = recall_score(testing_labels_range, predicted_labels)
precision = precision_score(testing_labels_range, predicted_labels)
f1 = f1_score(testing_labels_range, predicted_labels)
print("Accuracy", accuracy)
print("Recall", recall)
print("Precision", precision)
print("F1", f1)

Accuracy 0.822
Recall 0.48
Precision 0.6017699115044248
F1 0.5340314136125655
