
In order to keep it simple we are going to use a `sklearn` model.

## Getting data

Dataset `train.csv` comes from kaggle: https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge. We locally store in `data` directory.

In [2]:
import pandas as pd

comments_df = pd.read_csv("data/jigsaw-toxic-comment-classification-challenge/train.csv")
comments_df.head(2)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0


## Predict if comment is toxic

### Train - validation split

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = \
    train_test_split(comments_df[['comment_text']], comments_df['toxic'], random_state=10)
X_train.head(2)

Unnamed: 0,comment_text
34852,"This is a straw man argument, Mr Merkey. Nobo..."
17133,"ARC Gritt, the fucking cunt of all cunts, ruin..."


## Text preprocessing

In [10]:
import re

import nltk
from nltk.stem import SnowballStemmer

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
GOOD_SYMBOLS = "€\?"
GOOD_SYMBOLS_RE = re.compile('([' + GOOD_SYMBOLS + '])')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z '+ GOOD_SYMBOLS + ']')
ADD_SPACES_SYMBOLS_RE = re.compile("([\?])")
STEMMER = SnowballStemmer('english')

class TextPreprocessor:
        
    def transfrom_text(self, text):
        text = re.sub(GOOD_SYMBOLS_RE, r"\1", text) #process good symbols
        text = text.lower()
        text = re.sub(REPLACE_BY_SPACE_RE, " ", text) # process bad symbols
        text = re.sub(BAD_SYMBOLS_RE, "", text) # process bad symbols
        text = re.sub(ADD_SPACES_SYMBOLS_RE, r" \1 ", text)
        test = " ".join([STEMMER.stem(word) for word in text.split()])
        return text
    
    def transform(self, series):
        return series.apply(lambda text: self.transfrom_text(text))

In [17]:
preprocessor = TextPreprocessor()
X_train_preprocessed = preprocessor.transform(X_train['comment_text'])
X_test_preprocessed = preprocessor.transform(X_test['comment_text'])

In [45]:
print(X_train["comment_text"][:2])
print(X_train_preprocessed[:2])

34852    This is a straw man argument, Mr Merkey.  Nobo...
17133    ARC Gritt, the fucking cunt of all cunts, ruin...
Name: comment_text, dtype: object
34852    this is a straw man argument  mr merkey  nobod...
17133    arc gritt  the fucking cunt of all cunts  ruin...
Name: comment_text, dtype: object


## Bag of words

In [18]:
from sklearn.feature_extraction.text import   CountVectorizer

class BoW:

    def __init__(self):
        self.vectorizer =  CountVectorizer()
        
    def fit(self, column):
        self.vectorizer.fit(column)
        
    def transform(self, column):
        return self.vectorizer.transform(column)

In [20]:
vectorizer = BoW()
vectorizer.fit(X_train_preprocessed)
X_train_vectorized = vectorizer.transform(X_train_preprocessed)

In [22]:
X_test_vectorized = vectorizer.transform(X_test_preprocessed)

In [23]:
from sklearn.linear_model import LogisticRegression
    
class LogRegModel:
    
    def __init__(self):
        self.model = LogisticRegression(class_weight='balanced')
        
    def fit(self, X, y):
        self.model.fit(X, y)
        
    def predict(self, X):
        return self.model.predict(X)

In [None]:
model = LogRegModel()
model.fit(X_train_vectorized, y_train)

In [26]:
y_test_hat = model.predict(X_test_vectorized)

In [46]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,\
    average_precision_score, roc_auc_score, recall_score

def scores(y, predicted):
    return {
        'accuracy': accuracy_score(y, predicted),
        'precision': precision_score(y, predicted),
        'recall': recall_score(y, predicted),
        'f1-score': f1_score(y, predicted),
        #"roc_auc": roc_auc_score(y, predicted_score),
        'average-Precision': average_precision_score(y, predicted)}

In [47]:
scores(y_test, y_test_hat)

{'accuracy': 0.9362795477903392,
 'precision': 0.6142581888246628,
 'recall': 0.8551502145922747,
 'f1-score': 0.7149585108768781,
 'average-Precision': 0.5388192313485104}

In [37]:
class CompleteModel:
    
    def __init__(self, preprocessor, vectorizer, model, colname="comment_text"):
        self.colname = colname
        self.preprocessor = preprocessor
        self.vectorizer = vectorizer
        self.model = model
           
    def fit(self, X, y):
        print("preprocessor...")
        X_fe = pd.DataFrame({self.colname: self.preprocessor.transform(X[self.colname])})
        print("vectorizer...")
        self.vectorizer.fit(X_fe[self.colname])
        print("model...")
        X_fe = self.vectorizer.transform(X[self.colname])
        self.model.fit(X_fe, y)
        return self
        
    def predict(self, X):
        X_fe = pd.DataFrame({self.colname: self.preprocessor.transform(X[self.colname])})        
        X_fe = self.vectorizer.transform(X_fe[self.colname])
        return self.model.predict(X_fe)

In [38]:
complete_model = CompleteModel(preprocessor, vectorizer, model)

In [40]:
complete_model.fit(X_train, y_train)

preprocessor...
vectorizer...
model...




<__main__.CompleteModel at 0x7fe6c85a9668>

In [None]:
y_test_hat = complete_model.predict(X_test)
scores(y_test, y_test_hat)

## TfIdf

In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(min_df=4, max_df=0.9, ngram_range=(1, 2), token_pattern='(\S+)')
complete_tfidf_model = CompleteModel(preprocessor, tfidf_vectorizer, model)

In [None]:
complete_model.fit(X_train, y_train)

preprocessor...
