# Toxic Comment Classification

## Import Libraries


In [1]:
# Ignoring unnecessory warnings
import warnings
warnings.filterwarnings("ignore")  
# Specialized container datatypes
import collections
# For data vizualization 
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# For large and multi-dimensional arrays
import numpy as np
# For data manipulation and analysis
import pandas as pd
# Natural language processing library
import nltk
nltk.download('genesis')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
import gensim
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import STOPWORDS
# For random selection 
import random
# For basic cleaning and data preprocessing 
import re
import string
# Machine Learning Library
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics import accuracy_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

[nltk_data] Downloading package genesis to
[nltk_data]     /Users/pantelis/nltk_data...
[nltk_data]   Package genesis is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pantelis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/pantelis/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/pantelis/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Load Datasets

At first, let's load our train set to pandas dataframe for ease in data manipulation

In [2]:
TRAIN_DATASET = "./data/train.csv"
train_df = pd.read_csv(TRAIN_DATASET, encoding='utf-8')
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3947 entries, 0 to 3946
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Insult   3947 non-null   int64 
 1   Date     3229 non-null   object
 2   Comment  3947 non-null   object
dtypes: int64(1), object(2)
memory usage: 92.6+ KB


And let's have a look at the very first 5 rows of our train set

In [3]:
train_df.head()

Unnamed: 0,Insult,Date,Comment
0,1,20120618192155Z,"""You fuck your dad."""
1,0,20120528192215Z,"""i really don't understand your point.\xa0 It ..."
2,0,,"""A\\xc2\\xa0majority of Canadians can and has ..."
3,0,,"""listen if you dont wanna get married to a man..."
4,0,20120619094753Z,"""C\xe1c b\u1ea1n xu\u1ed1ng \u0111\u01b0\u1edd..."


In order to evaluate our trained models, we are going to use a test set

In [4]:
TEST_DATASET = "./data/impermium_verification_labels.csv"
test_df = pd.read_csv(TEST_DATASET)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2235 entries, 0 to 2234
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       2235 non-null   int64 
 1   Insult   2235 non-null   int64 
 2   Date     2235 non-null   object
 3   Comment  2235 non-null   object
 4   Usage    2235 non-null   object
dtypes: int64(2), object(3)
memory usage: 87.4+ KB


In [5]:
test_df.head()

Unnamed: 0,id,Insult,Date,Comment,Usage
0,1,0,20120603163526Z,"""like this if you are a tribe fan""",PrivateTest
1,2,1,20120531215447Z,"""you're idiot.......................""",PrivateTest
2,3,1,20120823164228Z,"""I am a woman Babs, and the only ""war on women...",PrivateTest
3,4,1,20120826010752Z,"""WOW & YOU BENEFITTED SO MANY WINS THIS YEAR F...",PrivateTest
4,5,1,20120602223825Z,"""haha green me red you now loser whos winning ...",PrivateTest


## Data Cleaning

It is common to train our models when our data has already being cleaned and preproccessed. Data cleaning is a crucial step and of utter importance when it comes to training machine learning models that aim at high metrics score. The basic data cleaning steps that we are going to apply are:
1. Remove unnecessary columns
2. Drop rows with NaN values
3. Convert all characters to lowercase
4. Remove characters such as punctuation, characters such as "\ n", "\ u0111"
5. Remove urls if any

In [6]:
def data_cleaning(text):
    # convert text to lowercase
    text = text.lower()
    # remove all special characters, punctuation and spaces from string
    text = re.sub(r'\W+',' ', text)
    # remove urls if any
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text)
    # remove non ascii characters
    text = re.sub(r'(x[(a|c|e|f)|(0-9)]+|(u[(0-9)]+))', '', text)
    # return cleaned text
    return text

At first, let's perform data cleaning in the train set

In [7]:
# Drop Date column
train_df.drop(['Date'], axis=1, inplace=True)
# Drop any NaN value
train_df.dropna(inplace=True)
# Rest index 
train_df.reset_index(drop=True)

Unnamed: 0,Insult,Comment
0,1,"""You fuck your dad."""
1,0,"""i really don't understand your point.\xa0 It ..."
2,0,"""A\\xc2\\xa0majority of Canadians can and has ..."
3,0,"""listen if you dont wanna get married to a man..."
4,0,"""C\xe1c b\u1ea1n xu\u1ed1ng \u0111\u01b0\u1edd..."
...,...,...
3942,1,"""you are both morons and that is never happening"""
3943,0,"""Many toolbars include spell check, like Yahoo..."
3944,0,"""@LambeauOrWrigley\xa0\xa0@K.Moss\xa0\nSioux F..."
3945,0,"""How about Felix? He is sure turning into one ..."


In [8]:
# Clean Comment section
train_df['Comment'] = train_df['Comment'].apply(lambda x: data_cleaning(x))

In [9]:
train_df.head()

Unnamed: 0,Insult,Comment
0,1,you fuck your dad
1,0,i really don t understand your point it seem...
2,0,a majority of canadians can and has been wro...
3,0,listen if you dont wanna get married to a man...
4,0,c b ea1n xu ed1ng b0 eddng bi ec3u t nh 201...


It seems that train set is way more cleaned now. Let's perform the same procedure for the test set

In [10]:
# Drop Date column
test_df.drop(['id', 'Date', 'Usage'], axis=1, inplace=True)
# Drop any NaN value
test_df.dropna(inplace=True)
# Rest index 
test_df.reset_index(drop=True)

Unnamed: 0,Insult,Comment
0,0,"""like this if you are a tribe fan"""
1,1,"""you're idiot......................."""
2,1,"""I am a woman Babs, and the only ""war on women..."
3,1,"""WOW & YOU BENEFITTED SO MANY WINS THIS YEAR F..."
4,1,"""haha green me red you now loser whos winning ..."
...,...,...
2230,0,"""FUCKIN LAME COME ON WTF STOP FUCKING OVER MY ..."
2231,1,"""YOU SHUT YOUR IGNORANT PIE HOLE YOU LITTLE IN..."
2232,0,"""sweetie pie is looking very much like her cou..."
2233,1,"""ball4real where are you with your miami g-ayn..."


In [11]:
# Clean Comment section
test_df['Comment'] = test_df['Comment'].apply(lambda x: data_cleaning(x))

In [12]:
test_df.head()

Unnamed: 0,Insult,Comment
0,0,like this if you are a tribe fan
1,1,you re idiot
2,1,i am a woman babs and the only war on women i...
3,1,wow you benefitted so many wins this year fro...
4,1,haha green me red you now loser whos winning ...


Data cleaning for both train and test set has completed successfully. It's time for the feature extraction part, while training the different models (NB, SVM, RF) and seeing what benchmarks we can actually achieve.

## Naive Bayes

At first, we are going to try Naive Bayes classifier without any improvement. Specifically, using CountVectorizer (aka Bag of Words) we are going to extract features and train our model. 

It is important to mention that the effectiveness of our models wlll be evaluated only in the test set, inspecting accuracy and f1-score.

In [13]:
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_df['Comment'])
y_train = train_df['Insult']
X_test = vectorizer.transform(test_df['Comment'])
y_test = test_df['Insult']

In [14]:
clf = MultinomialNB(alpha=0.5)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy score: {:.4f}".format(accuracy_score(y_test,y_pred)))
print("F1-score: {:.4f}".format(f1_score(y_test,y_pred)))

Accuracy score: 0.6676
F1-score: 0.6664


### 1. Lemmatization

As a first improvement, lemmatization is going to be applied in both training and test set. 

In [15]:
lemmatizer = WordNetLemmatizer()

def lemmatize(text):
    lem_sentence = []
    token_words = word_tokenize(text)
    lem_sentence = [lemmatizer.lemmatize(word) for word in token_words]
    text = " ".join(lem_sentence)
    return text

In [16]:
train_comments_1 = train_df['Comment'].apply(lambda x: lemmatize(x))
test_comments_1 = test_df['Comment'].apply(lambda x: lemmatize(x))

In [17]:
# Extract BoW features once again, but now using lemmatized comments
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_comments_1)
X_test = vectorizer.transform(test_comments_1)

In [18]:
# Train and evaluate model
clf = MultinomialNB(alpha=0.5)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy score: {:.4f}".format(accuracy_score(y_test,y_pred)))
print("F1-score: {:.4f}".format(f1_score(y_test,y_pred)))

Accuracy score: 0.6711
F1-score: 0.6708


### 2. Stop Words

As a second improvement, we are going to remove stop words. Take notice that each improvement will be applied in the previous one and not on the naive text.

In [19]:
def clean_stopwords(text):
    removed = remove_stopwords(text)
    text = "".join(removed)
    return text

In [20]:
train_comments_2 = train_comments_1.apply(lambda x: lemmatize(x))
test_comments_2 = test_comments_1.apply(lambda x: lemmatize(x))

In [21]:
# Extract BoW features once again, but now using lemmatized and without stop words comments
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_comments_2)
X_test = vectorizer.transform(test_comments_2)

In [22]:
# Train and evaluate model
clf = MultinomialNB(alpha=0.5)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy score: {:.4f}".format(accuracy_score(y_test,y_pred)))
print("F1-score: {:.4f}".format(f1_score(y_test,y_pred)))

Accuracy score: 0.6698
F1-score: 0.6702


### 3. Bigrams

As a third improvement, we are going to use bigrams in our BoW feature extractor.

In [23]:
vectorizer = CountVectorizer(ngram_range=(1, 2))
X_train = vectorizer.fit_transform(train_comments_2)
X_test = vectorizer.transform(test_comments_2)

In [24]:
# Train and evaluate model
clf = MultinomialNB(alpha=0.5)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy score: {:.4f}".format(accuracy_score(y_test,y_pred)))
print("F1-score: {:.4f}".format(f1_score(y_test,y_pred)))

Accuracy score: 0.6738
F1-score: 0.6279


### 4. Laplace Smoothing 

The smoothing priors α >= 0 accounts for features not present in the learning samples and prevents zero probabilities in further computations. Setting α = 1 is called Laplace smoothing, while α < 1 is called Lidstone smoothing.

In [25]:
clf = MultinomialNB(alpha=1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy score: {:.4f}".format(accuracy_score(y_test,y_pred)))
print("F1-score: {:.4f}".format(f1_score(y_test,y_pred)))

Accuracy score: 0.6550
F1-score: 0.5190


## TF-IDF + Part-of-Speech

In this section we are going to extract a more sophisticated feature vector. For this tf-idf vectorizer will be used along Part-of-Speech Based Features, provided by NLTK functionalities.

For this expirement, we are going to train and evalute a SVM as well as a Random Decision Forest.

In [26]:
# Fit tf-idf vectorizer and extract features from the train set
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
train_tfidf_vec = tfidf_vectorizer.fit_transform(train_df['Comment'])
train_tfidf_data = pd.DataFrame(train_tfidf_vec.toarray())

In [27]:
# Extract tf-idf based features from the test set
test_tfidf_vec = tfidf_vectorizer.transform(test_df['Comment'])
test_tfidf_data = pd.DataFrame(test_tfidf_vec.toarray())

Using the part-of-speech tagger provided by the NTLK, tag each word in each comment with the part of its speech (noun, verb, adverb or adjective). Then, the percentage of specific tags for each text will be calculated. That is, the frequency of each POS tag in each sample, calculated in terms of the total number of words in the sample. This way we  will have 4 new features in each text, fractionAdverbs, fractionVerbs, fractionAdjectives, fractionNouns

In [28]:
def count_fractions(text, pos_tag):
    token_words = word_tokenize(text)
    tagged_token_words = nltk.pos_tag(token_words)
    count = sum([1 for _, tag in tagged_token_words if tag == pos_tag])
    return round(count/(len(text)), 3)*100

In [29]:
# Calculate adverbs fraction
train_df['fractionAdverbs'] = train_df['Comment'].apply(lambda x: count_fractions(x, 'RB'))
test_df['fractionAdverbs'] = test_df['Comment'].apply(lambda x: count_fractions(x, 'RB'))

In [31]:
# Calculate verbs fraction
train_df['fractionVerbs'] = train_df['Comment'].apply(lambda x: count_fractions(x, 'VB'))
test_df['fractionVerbs'] = test_df['Comment'].apply(lambda x: count_fractions(x, 'VB'))

In [32]:
# Calculate adjectives fraction
train_df['fractionAdjectives'] = train_df['Comment'].apply(lambda x: count_fractions(x, 'JJ'))
test_df['fractionAdjectives'] = test_df['Comment'].apply(lambda x: count_fractions(x, 'JJ'))

In [33]:
# Calculate nouns fraction
train_df['fractionNouns'] = train_df['Comment'].apply(lambda x: count_fractions(x, 'NN'))
test_df['fractionNouns'] = test_df['Comment'].apply(lambda x: count_fractions(x, 'NN'))

In [34]:
# Concat features to formulate the final data for train set
train_final_data = pd.concat([train_df['fractionAdverbs'], train_df['fractionVerbs'], 
                              train_df['fractionAdjectives'], train_df['fractionNouns'], 
                              train_tfidf_data], axis=1)
train_final_data.head()

Unnamed: 0,fractionAdverbs,fractionVerbs,fractionAdjectives,fractionNouns,0,1,2,3,4,5,...,92625,92626,92627,92628,92629,92630,92631,92632,92633,92634
0,0.0,0.0,0.0,5.3,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.1,1.1,1.1,3.4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.6,1.1,1.4,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.7,3.4,1.9,1.5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,4.9,20.9,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
# Concat features to formulate the final data for test set
test_final_data = pd.concat([test_df['fractionAdverbs'], test_df['fractionVerbs'], 
                             test_df['fractionAdjectives'], test_df['fractionNouns'], 
                             test_tfidf_data], axis=1)
test_final_data.head()

Unnamed: 0,fractionAdverbs,fractionVerbs,fractionAdjectives,fractionNouns,0,1,2,3,4,5,...,92625,92626,92627,92628,92629,92630,92631,92632,92633,92634
0,0.0,0.0,2.9,2.9,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,7.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,2.1,3.2,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.5,0.0,2.9,5.9,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.6,0.0,3.6,3.6,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Support Vector Machine

In [None]:
clf = SVC(gamma='auto')
clf.fit(train_final_data, y_train)
y_pred = clf.predict(test_final_data)
print("Accuracy score: {:.4f}".format(accuracy_score(y_test,y_pred)))
print("F1-score: {:.4f}".format(f1_score(y_test,y_pred)))

## Random Forest

In [None]:
clf = RandomForestClassifier(n_estimators=100)
clf.fit(train_final_data, y_train)
y_pred = clf.predict(test_final_data)
print("Accuracy score: {:.4f}".format(accuracy_score(y_test,y_pred)))
print("F1-score: {:.4f}".format(f1_score(y_test,y_pred)))