# Toxic Comment Classification

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score
from scipy.sparse import hstack
import nltk
import spacy 

import re
import string
import gc
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
!python -m spacy download en_core_web_lg

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [4]:
!python -m spacy link en_core_web_lg en --force

[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_lg -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [5]:
import en_core_web_lg

In [36]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [37]:
df.isna().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

In [38]:
df = df.drop(columns='id')
df.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [39]:
categories = df.columns[1:]
categories

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

# Preprocessing

In [40]:
def remove_abbreviation(text):
    text = re.sub("\n"," ",text)
    text = re.sub("\[.*\]"," ",text)
    text = re.sub("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}"," ",text)
    text = re.sub(r"\?"," ",text)
    text = re.sub("don't","do not",text)
    text = re.sub("doesn't", "does not",text)
    text = re.sub("didn't", "did not",text)
    text = re.sub("hasn't", "has not",text)
    text = re.sub("haven't", "have not",text)
    text = re.sub("hadn't", "had not",text)
    text = re.sub("won't", "will not",text)
    text = re.sub("wouldn't", "would not",text)
    text = re.sub("can't", "can not",text)
    text = re.sub("cannot", "can not",text)
    text = re.sub("i'm", "i am",text)
    text = re.sub("i'll", "i will",text)
    text = re.sub("its", "it is",text)
    text = re.sub("it's", "it is",text)
    text = re.sub("that's", "that is",text)
    text = re.sub("weren't", "were not",text)
    text = re.sub("i'd","i would",text)
    text = re.sub("i've","i have",text)
    text = re.sub("she'd","she would",text)
    text = re.sub("they'll","they will",text)
    text = re.sub("they're","they are",text)
    text = re.sub("we'd","we would",text)
    text = re.sub("we'll","we will",text)
    text = re.sub("we've","we have",text)
    text = re.sub("it'll","it will",text)
    text = re.sub("there's","there is",text)
    text = re.sub("where's","where is",text)
    text = re.sub("they're","they are",text)
    text = re.sub("let's","let us",text)
    text = re.sub("couldn't","could not",text)
    text = re.sub("shouldn't","should not",text)
    text = re.sub("wasn't","was not",text)
    text = re.sub("could've","could have",text)
    text = re.sub("might've","might have",text)
    text = re.sub("must've","must have",text)
    text = re.sub("should've","should have",text)
    text = re.sub("would've","would have",text)
    text = re.sub("who's","who is",text)
    text = re.sub("\bim\b", "i am",text)
    text = re.sub(r'[^\w\s]','',text)
    text = re.sub("\d+", "", text)
    return text

In [42]:
def to_lower(text):
  return text.lower()

In [43]:
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

In [44]:
from  spacy.lang.en.stop_words import STOP_WORDS
stopwords = list(STOP_WORDS)

def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in stopwords])

In [45]:
nlp = spacy.load('en')
  
def lemmatize(text):
  doc = nlp(text)
  tokens = [] 
  for token in doc: 
      tokens.append(token)
  return " ".join([token.lemma_ for token in doc])

In [46]:
def preprocessing_pipeline(text):
  text = remove_abbreviation(text)
  text = to_lower(text)
  text = remove_punctuation(text)
  text = remove_stopwords(text)
  text = lemmatize(text)
  return text

df['comment_text'] = df.loc[:,'comment_text'].apply(lambda text : preprocessing_pipeline(text))

In [47]:
df.to_csv('processed.csv', index=False)

In [18]:
del remove_abbreviation
del to_lower
del remove_punctuation
del remove_stopwords
del lemmatize
del preprocessing_pipeline

del PUNCT_TO_REMOVE
del stopwords
del nlp

gc.collect()

6

# Text to Numeric Conversion

In [19]:
X = df.iloc[:,0].values.astype(str)
X

array(['explanation edit username hardcore metallica fan revert vandalism closure gas vote new york dolls fac remove template talk page -PRON- be retire',
       'daww match background colour -PRON- be seemingly stick thank talk january utc',
       'hey man -PRON- be try edit war guy constantly remove relevant information talk edit instead talk page care format actual info',
       ...,
       'spitzer umm there s actual article prostitution ring crunch captain',
       'look like actually speedy version delete look',
       'think understand come idea bad right away kind community go bad idea away instead help rewrite'],
      dtype='<U5000')

In [20]:
Y = df.iloc[:,1:].values
Y.shape

(159571, 6)

In [21]:
del df
gc.collect()

0

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X,Y)

In [23]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 2),
    max_features=10000,
    dtype=np.float32
)

In [24]:
X_train_word = word_vectorizer.fit_transform(X_train)

In [25]:
X_test_word = word_vectorizer.transform(X_test)

In [26]:
X_train_word

<119678x10000 sparse matrix of type '<class 'numpy.float32'>'
	with 2682801 stored elements in Compressed Sparse Row format>

In [27]:
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(2, 6),
    max_features=20000,
    dtype=np.float32)

In [28]:
X_train_char = char_vectorizer.fit_transform(X_train)

In [29]:
X_test_char = char_vectorizer.transform(X_test)

In [30]:
X_train_char

<119678x20000 sparse matrix of type '<class 'numpy.float32'>'
	with 59949958 stored elements in Compressed Sparse Row format>

In [31]:
X_train = hstack([X_train_word, X_train_char])
X_test = hstack([X_test_word, X_test_char])

In [32]:
del X_train_char
del X_test_char
del X_test_word
del X_train_word

del X
del Y

del char_vectorizer
del word_vectorizer

gc.collect()

0

# Logistic Regression

In [33]:
scores = 0
for index,category in enumerate(categories):
    logistic_regression = LogisticRegression(C=0.1, solver='sag')
    score = np.mean(cross_val_score(logistic_regression, X_train, y_train[:,index], cv=3, scoring='roc_auc'))
    scores += score
    print(f"{category} : {score}")

print("\nAverage Score : {:.3f}".format(scores/6))

toxic : 0.9627297885018485
severe_toxic : 0.986691263692853
obscene : 0.9852910637980714
threat : 0.9717477404550712
insult : 0.9746894746740464
identity_hate : 0.9747456394951124

Average Score : 0.976
