In [7]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

<h3>Read the dataset</h3>

In [8]:
# Location of the cleaned comments CSV. NOTE(review): this is an absolute path
# on the author's machine; anyone else running the notebook must change it.
COMMENTS_CSV_PATH = "/Users/mazichang/Desktop/Cyberviolence/Project/CommentsCleaned.csv"

# Load the dataset; later cells read its "content" and "label" columns.
comments = pd.read_csv(filepath_or_buffer=COMMENTS_CSV_PATH)
comments

Unnamed: 0,content,label
0,future female average yr old oh um women knock...,1
1,possible tatar relation seems minor connection...,0
2,somehow doubt statistic made head,0
3,guys need stop gay martins argent godaddy ads ...,0
4,whatever need get hold everyone plan something...,0
...,...,...
19995,plenty people taken stance favor common sense ...,0
19996,bitches mad mane nigga getting snatched whore ...,0
19997,done would even name known,0
19998,chicago killing field decades inept corrupt li...,0


<h3>Split dataset into train, validation, test</h3>

In [9]:
# Two-stage split giving roughly train:val:test = 8:1:1.
# Stage 1 holds out 10% as the test set; stage 2 takes 0.11111111 (~1/9) of the
# remaining 90% as the validation set. Both stages use the same fixed seed.
train, test = train_test_split(comments, random_state=31, test_size=0.1)
train, val = train_test_split(train, random_state=31, test_size=0.11111111)


def _text_and_labels(subset):
    """Return (text, labels) for one split.

    The text is the "content" column with missing values replaced by the
    string '0'; the labels are the "label" column unchanged.
    """
    return subset["content"].fillna('0'), subset["label"]


training_features_range, training_labels_range = _text_and_labels(train)
val_features_range, val_labels_range = _text_and_labels(val)
testing_features_range, testing_labels_range = _text_and_labels(test)

<h3>Train and fine-tune the logistic regression model</h3>

In [10]:
# Candidate (min_n, max_n) n-gram ranges to compare on the validation split.
n_grams = [(1,1),(1,2),(2,2),(1,3),(2,3),(3,3)]

for n_gram in n_grams:

    # Fit TF-IDF on the training text only, then apply the same fitted
    # vectorizer to the validation text.
    tfidf_vectorizer = TfidfVectorizer(ngram_range=n_gram, lowercase=True)
    training_features = tfidf_vectorizer.fit_transform(training_features_range)
    val_features = tfidf_vectorizer.transform(val_features_range)

    # A fresh class-weighted logistic regression for every n-gram setting.
    classifier = LogisticRegression(class_weight='balanced')
    classifier.fit(training_features, training_labels_range)

    # Column 0 of predict_proba is the first entry of classifier.classes_
    # (presumably label 0 -- confirm). Predict 1 when that probability is
    # at most 0.5, otherwise 0.
    predicted_probs = classifier.predict_proba(val_features)
    predicted_labels = [1 if row[0] <= 0.5 else 0 for row in predicted_probs]

    # Compare settings by validation F1.
    f1 = f1_score(val_labels_range, predicted_labels)
    print(n_gram, f1)
    # In the recorded run, unigrams (1, 1) gave the highest validation F1.

(1, 1) 0.600418410041841
(1, 2) 0.5816649104320338
(2, 2) 0.37789661319073087
(1, 3) 0.5798319327731093
(2, 3) 0.3198458574181117
(3, 3) 0.12380952380952381


In [11]:
# Refit the pipeline with unigram features, then sweep the decision threshold
# on the validation split.
tfidf_vectorizer = TfidfVectorizer(lowercase=True, ngram_range=(1,1))
training_features = tfidf_vectorizer.fit_transform(training_features_range)
val_features = tfidf_vectorizer.transform(val_features_range)

# Build the classifier in this cell rather than reusing whatever object an
# earlier cell happened to leave in the kernel; the configuration is the same
# class-weighted logistic regression, so the fitted model is unchanged.
classifier = LogisticRegression(class_weight='balanced')
classifier.fit(training_features, training_labels_range)

predicted_probs = classifier.predict_proba(val_features)

# Column 0 of predict_proba is the first entry of classifier.classes_
# (presumably label 0 -- confirm). It does not depend on the threshold, so
# extract it once outside the loop.
prob_class0 = predicted_probs[:, 0]

# Thresholds 0.0, 0.1, ..., 0.9. Integer steps divided by 10 avoid the float
# drift of repeatedly adding 0.1, and the loop variable is never rebound.
for threshold in [step / 10 for step in range(10)]:
    # Predict 1 when the class-0 probability is at most the threshold.
    predicted_labels = (prob_class0 <= threshold).astype(int)

    # Evaluate based on f1 score
    f1 = f1_score(val_labels_range, predicted_labels)
    print(threshold, f1)

0.0 0.0
0.1 0.11707317073170731
0.2 0.3219315895372234
0.3 0.495176848874598
0.4 0.5814863102998695
0.5 0.600418410041841
0.6 0.5665818490245972
0.7 0.4903047091412742
0.8 0.41765704584040747
0.9 0.3546099290780142


<h3>Predict using logistic regression model</h3>

In [12]:
# Final evaluation on the held-out test split, using the vectorizer and
# classifier fitted in the preceding cell.
testing_features = tfidf_vectorizer.transform(testing_features_range)
predicted_probs = classifier.predict_proba(testing_features)

# Same decision rule as during tuning: predict 1 when the probability in
# column 0 (presumably label 0 -- confirm via classifier.classes_) is at
# most 0.5, otherwise 0.
predicted_labels = [1 if row[0] <= 0.5 else 0 for row in predicted_probs]

accuracy = accuracy_score(testing_labels_range, predicted_labels)
recall = recall_score(testing_labels_range, predicted_labels)
precision = precision_score(testing_labels_range, predicted_labels)
f1 = f1_score(testing_labels_range, predicted_labels)

# Report the metrics in a fixed order, one per line.
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Recall", recall),
    ("Precision", precision),
    ("F1", f1),
):
    print(metric_name, metric_value)

Accuracy 0.8075
Recall 0.7223529411764706
Precision 0.5348432055749129
F1 0.6146146146146148
