# Toxic Comment Classification

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from scipy.sparse import hstack
import nltk
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer 
from  spacy.lang.en.stop_words import STOP_WORDS

import re
import string
import gc
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [34]:
useLogisticRegression = True
useNaiveBayes = False

In [3]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
df.dtypes

id               object
comment_text     object
toxic             int64
severe_toxic      int64
obscene           int64
threat            int64
insult            int64
identity_hate     int64
dtype: object

In [5]:
df = df.drop(columns='id')
df.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [6]:
categories = df.columns[1:]
categories

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [45]:
test_data = pd.read_csv('test.csv')
test_data.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [46]:
ids = test_data.iloc[:,0]
test_data = test_data.drop(columns='id')
ids

0         00001cee341fdb12
1         0000247867823ef7
2         00013b17ad220c46
3         00017563c3f7919a
4         00017695ad8997eb
                ...       
153159    fffcd0960ee309b5
153160    fffd7a9a6eb32c16
153161    fffda9e8d6fafa9e
153162    fffe8f1340a79fc2
153163    ffffce3fb183ee80
Name: id, Length: 153164, dtype: object

In [11]:
test_data.head()

Unnamed: 0,comment_text
0,Yo bitch Ja Rule is more succesful then you'll...
1,== From RfC == \n\n The title is fine as it is...
2,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,":If you have a look back at the source, the in..."
4,I don't anonymously edit articles at all.


# Preprocessing

In [12]:
def to_lower(text):
  return text.lower()

In [13]:
def remove_abbreviation(text):
    text = re.sub("^ *","", text)
    text = re.sub("\n"," ",text)
    text = re.sub(' {2,}', ' ', text)
    text = re.sub("\[.*\]"," ",text)
    text = re.sub("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}"," ",text)
    text = re.sub(r"\?"," ",text)
    text = re.sub("don'?t","do not",text)
    text = re.sub("doesn'?t", "does not",text)
    text = re.sub("didn'?t", "did not",text)
    text = re.sub("hasn'?t", "has not",text)
    text = re.sub("haven'?t", "have not",text)
    text = re.sub("hadn'?t", "had not",text)
    text = re.sub("won'?t", "will not",text)
    text = re.sub("wouldn'?t", "would not",text)
    text = re.sub("can'?t", "can not",text)
    text = re.sub("cannot", "can not",text)
    text = re.sub("i'?m", "i am",text)
    text = re.sub("i'?ll", "i will",text)
    text = re.sub("it'?s", "it is",text)
    text = re.sub("that'?s", "that is",text)
    text = re.sub("weren'?t", "were not",text)
    text = re.sub("i'?d","i would",text)
    text = re.sub("i'?ve","i have",text)
    text = re.sub("she'?d","she would",text)
    text = re.sub("they'?ll","they will",text)
    text = re.sub("they'?re","they are",text)
    text = re.sub("we'?d","we would",text)
    text = re.sub("we'?ll","we will",text)
    text = re.sub("we'?ve","we have",text)
    text = re.sub("it'?ll","it will",text)
    text = re.sub("there'?s","there is",text)
    text = re.sub("where'?s","where is",text)
    text = re.sub("they'?re","they are",text)
    text = re.sub("let'?s","let us",text)
    text = re.sub("couldn'?t","could not",text)
    text = re.sub("shouldn'?t","should not",text)
    text = re.sub("wasn'?t","was not",text)
    text = re.sub("could'?ve","could have",text)
    text = re.sub("might'?ve","might have",text)
    text = re.sub("must'?ve","must have",text)
    text = re.sub("should'?ve","should have",text)
    text = re.sub("would'?ve","would have",text)
    text = re.sub("who'?s","who is",text)
    text = re.sub("you'?re", "you are", text)
    text = re.sub("y'?all", "you all", text)
    text = re.sub("'d've"," would have", text)
    text = re.sub("'d"," would", text)
    text = re.sub("'re"," are", text)
    text = re.sub("'ve"," have", text)
    text = re.sub("\bim\b", "i am",text)
    text = re.sub(r'[^\w\s]','',text)
    text = re.sub("[^a-zA-Z ]+", "", text)
    return text

In [14]:
def remove_url(text):
  text = re.sub(r"\b(?:(?:https|ftp|http|www)://)?\w[\w-]*(?:\.[\w-]+)+\S*", '', text, flags=re.MULTILINE)
  return text

In [15]:
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

In [16]:
stopwords = list(STOP_WORDS)

def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in stopwords])

In [17]:
def get_wordnet_pos(word):
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    text_arr = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in word_tokenize(text)]
    return " ".join(text_arr)

In [18]:
def preprocessing_pipeline(text):
  text = to_lower(text)
  text = remove_url(text)
  text = remove_punctuation(text)
  text = remove_abbreviation(text)
  text = remove_stopwords(text)
  text = lemmatize(text)
  return text

df['comment_text'] = df.loc[:,'comment_text'].apply(lambda text : preprocessing_pipeline(text))

In [19]:
test_data['comment_text'] = test_data.loc[:,'comment_text'].apply(lambda text : preprocessing_pipeline(text))

In [20]:
df.to_csv('processed_train.csv', index=False)
test_data.to_csv('processed_test.csv', index=False)

In [21]:
del remove_abbreviation
del to_lower
del remove_punctuation
del remove_stopwords
del remove_url
del get_wordnet_pos
del lemmatize
del preprocessing_pipeline

del PUNCT_TO_REMOVE
del stopwords

gc.collect()

7

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from scipy.sparse import hstack
import nltk
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer 
from  spacy.lang.en.stop_words import STOP_WORDS

import re
import string
import gc
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
df = pd.read_csv('processed_train.csv', dtype={'comment_text':'string'})
df.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,explanation edit username hardcore metallica f...,0,0,0,0,0,0
1,daww match background colour seemingly stuck t...,0,0,0,0,0,0
2,hey man try edit war guy constantly remove rel...,0,0,0,0,0,0
3,real suggestion amprovement wonder section sta...,0,0,0,0,0,0
4,sir hero chance remember page,0,0,0,0,0,0


In [3]:
test_data = pd.read_csv('processed_test.csv', dtype={'comment_text':'string'}).fillna('')
test_data.head()

Unnamed: 0,comment_text
0,yo bitch ja rule succesful youll whats hat sad...
1,rfc title fine amo
2,source zawe ashton lapland
3,look source information update correct form gu...
4,anonymously edit article


In [4]:
categories = df.columns[1:]
categories

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [5]:
df.dtypes

comment_text     string
toxic             int64
severe_toxic      int64
obscene           int64
threat            int64
insult            int64
identity_hate     int64
dtype: object

In [6]:
df.isna().sum()

comment_text     161
toxic              0
severe_toxic       0
obscene            0
threat             0
insult             0
identity_hate      0
dtype: int64

In [7]:
df = df.dropna()

In [8]:
test_data.isna().sum()

comment_text    0
dtype: int64

# Text to Numeric Conversion

In [9]:
all_data = pd.concat([df, test_data])
all_data.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,explanation edit username hardcore metallica f...,0.0,0.0,0.0,0.0,0.0,0.0
1,daww match background colour seemingly stuck t...,0.0,0.0,0.0,0.0,0.0,0.0
2,hey man try edit war guy constantly remove rel...,0.0,0.0,0.0,0.0,0.0,0.0
3,real suggestion amprovement wonder section sta...,0.0,0.0,0.0,0.0,0.0,0.0
4,sir hero chance remember page,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
all_data.shape

(312574, 7)

In [11]:
X = all_data.iloc[:,0].values
X.shape

(312574,)

In [12]:
X_train = df.iloc[:,0].values
X_train.shape

(159410,)

In [13]:
X_test = test_data.iloc[:,0].values
X_test.shape

(153164,)

In [14]:
X.dtype

StringDtype

In [15]:
y_train = df.iloc[:,1:].values
y_train.shape

(159410, 6)

In [16]:
del df
del test_data
del all_data

gc.collect()

134

In [17]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    max_features=10000,
    dtype=np.float32,
    lowercase=False
)

In [18]:
word_vectorizer.fit(X)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float32'>, encoding='utf-8',
                input='content', lowercase=False, max_df=1.0,
                max_features=10000, min_df=1, ngram_range=(1, 2), norm='l2',
                preprocessor=None, smooth_idf=True, stop_words=None,
                strip_accents='unicode', sublinear_tf=True,
                token_pattern='\\w{1,}', tokenizer=None, use_idf=True,
                vocabulary=None)

In [19]:
X_train_word = word_vectorizer.transform(X_train)

In [20]:
X_test_word = word_vectorizer.transform(X_test)

In [21]:
X_train_word.shape

(159410, 10000)

In [22]:
X_test_word.shape

(153164, 10000)

In [23]:
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(2, 6),
    max_features=20000,
    lowercase=False,
    dtype=np.float32)

In [24]:
char_vectorizer.fit(X)

TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
                dtype=<class 'numpy.float32'>, encoding='utf-8',
                input='content', lowercase=False, max_df=1.0,
                max_features=20000, min_df=1, ngram_range=(2, 6), norm='l2',
                preprocessor=None, smooth_idf=True, stop_words=None,
                strip_accents='unicode', sublinear_tf=True,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
                vocabulary=None)

In [25]:
X_train_char = char_vectorizer.transform(X_train)

In [26]:
X_test_char = char_vectorizer.transform(X_test)

In [27]:
X_train_char.shape

(159410, 20000)

In [28]:
X_test_char.shape

(153164, 20000)

In [29]:
X_train = hstack([X_train_word, X_train_char,])
X_test = hstack([X_test_word, X_test_char])

In [None]:
del X_train_char
del X_test_char
del X_test_word
del X_train_word

del X

del char_vectorizer
del word_vectorizer

gc.collect()

# Logistic Regression

In [59]:
result = pd.DataFrame()
result.head()

In [61]:
scores = 0

param_grid = {
    'C' : np.arange(0.1,1.1,0.1)
}

C_values = [1, 0.4, 0.8, 0.9, 0.7000000000000001, 0.8]

for index,category in enumerate(categories):
    logistic_regression = LogisticRegression(C=C_values[index], solver='sag')
    # clf = GridSearchCV(logistic_regression, param_grid = param_grid, cv = 3, n_jobs=-1, scoring='roc_auc')
    # clf.fit(X_train, y_train[:,index])
    # print(clf.best_params_)
    score = np.mean(cross_val_score(logistic_regression, X_train, y_train[:,index], cv=3, scoring='roc_auc'))
    scores += score
    if useLogisticRegression:
        logistic_regression.fit(X_train, y_train[:,index])
        y_pred = logistic_regression.predict_proba(X_test)
        result[f'{category}'] = y_pred[:, 1]
    print(f"{category} : {score}")

print("\nAverage Score : {:.5f}".format(scores/6))

toxic : 0.9728802017802024
severe_toxic : 0.9870593765177395
obscene : 0.989004340556181
threat : 0.9839706973456201
insult : 0.9787075209333508
identity_hate : 0.9809475241288071

Average Score : 0.98209


In [62]:
result.head()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0.999674,0.137115,0.999369,0.040731,0.965501,0.195344
1,0.006735,0.004155,0.004595,0.000979,0.009395,0.002142
2,0.005942,0.00194,0.002985,0.000415,0.003825,0.000691
3,0.002701,0.001695,0.002037,0.000345,0.002597,0.000311
4,0.025511,0.001955,0.005105,0.000409,0.014556,0.001221


# Multinomial Naive Bayes

In [33]:
scores = 0

param_grid = {
    'alpha' : np.arange(0.1,1.5,0.15)
}

alpha_values = [0.9999999999999999, 0.25, 0.85, 0.1, 0.7, 0.25]

for index,category in enumerate(categories):
    multinomial_nb = MultinomialNB()
    # clf = GridSearchCV(multinomial_nb, param_grid = param_grid, cv = 3, n_jobs=-1, scoring='roc_auc')
    # clf.fit(X_train, y_train[:,index])
    # print(clf.best_params_)
    score = np.mean(cross_val_score(multinomial_nb, X_train, y_train[:,index], cv=3, scoring='roc_auc'))
    scores += score
    print(f"{category} : {score}")

print("\nAverage Score : {:.5f}".format(scores/6))

toxic : 0.9475619438752069
severe_toxic : 0.9783054844992479
obscene : 0.9643377431664906
threat : 0.9230518776136027
insult : 0.959510772731179
identity_hate : 0.9547280513220432

Average Score : 0.95458


# Kaggle Submission

In [39]:
!pip install kaggle



In [None]:
from google.colab import files
files.upload()

In [41]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/

In [42]:
! chmod 600 ~/.kaggle/kaggle.json

In [63]:
result.insert(0, 'id', ids)
result.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999674,0.137115,0.999369,0.040731,0.965501,0.195344
1,0000247867823ef7,0.006735,0.004155,0.004595,0.000979,0.009395,0.002142
2,00013b17ad220c46,0.005942,0.00194,0.002985,0.000415,0.003825,0.000691
3,00017563c3f7919a,0.002701,0.001695,0.002037,0.000345,0.002597,0.000311
4,00017695ad8997eb,0.025511,0.001955,0.005105,0.000409,0.014556,0.001221


In [64]:
result.shape

(153164, 7)

In [65]:
result.to_csv('submission.csv', index=False)

In [67]:
! kaggle competitions submit -c jigsaw-toxic-comment-classification-challenge -f submission.csv -m "Second Submission"

100% 13.3M/13.3M [00:02<00:00, 5.42MB/s]
Successfully submitted to Toxic Comment Classification Challenge