### Background

This notebook contains a simple logistic regression approach to a multilabel classification problem. The dataset and challenge are provided on Kaggle by Jigsaw. The gist of the competition is as follows:

"In this competition, you’re challenged to build a multi-headed model that’s capable of detecting different types of of toxicity like threats, obscenity, insults, and identity-based hate better than Perspective’s current models. You’ll be using a dataset of comments from Wikipedia’s talk page edits. Improvements to the current model will hopefully help online discussion become more productive and respectful."

Link: https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge

### Load data

In [None]:
import pandas as pd
import numpy as np
import nltk
from matplotlib import pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.linear_model import RidgeClassifier, LogisticRegression, RidgeClassifierCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.dummy import DummyClassifier
from nltk import word_tokenize, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from tqdm import tqdm, tqdm_pandas
import string
import re
import gensim
import collections

%matplotlib inline

In [None]:
# Read the three competition CSVs (paths are relative to the notebook's working directory).
train_data, test_data, sample_submission_data = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/test.csv', 'data/sample_submission.csv')
)

### Short exploration

In [None]:
# Only the last expression of a cell is rendered, so print the column index
# explicitly and leave the preview of the first rows as the cell's output.
print(train_data.columns)
train_data.head()

In [None]:
# Print a few example comments, each followed by its six label values.
for sample_row in (5000, 4000, 3000):
    print(train_data.comment_text[sample_row])
    print(
        train_data.iloc[sample_row][
            ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
        ],
        '\n',
    )

### Clean data: tokenize, remove stop words

In [None]:
stopset = set(stopwords.words('english'))
snow = SnowballStemmer('english')
WNlemma = WordNetLemmatizer()

def clean_text(x, normalization='stemming', remove_stop=False):
    """Function to preprocess text data. Lowercases the text, removes
    punctuation-only tokens and tokens containing digits, optionally removes
    stopwords, and then lemmatizes or stems the remaining words.

    Args:
        x (str): The piece of text to process. A missing value (None or NaN)
            is treated as empty text.
        normalization (str): how to normalize words, 'stemming' (default) or 'lemmatization'.
        remove_stop (bool): whether to remove stopwords. Default is False.

    Returns:
        str: Preprocessed tokens, re-joined with spaces.

    Raises:
        ValueError: If `normalization` is not 'stemming' or 'lemmatization'.
    """
    # Fail loudly: returning an error message here would silently end up
    # being used as the "cleaned" text of every comment.
    if normalization not in ('stemming', 'lemmatization'):
        raise ValueError(
            "Invalid parameter for normalization: {!r} "
            "(expected 'stemming' or 'lemmatization')".format(normalization)
        )

    # Missing comments (None, or the float NaN pandas uses for empty cells)
    # have nothing to tokenize.
    if x is None or (isinstance(x, float) and x != x):
        return ''

    # split text; lowercase up front so stopword matching is case-insensitive
    words = [word.lower() for word in word_tokenize(x)]

    # remove punctuation and numbers: drop tokens made up entirely of
    # punctuation characters (e.g. '!', '..', '``') and tokens with a digit
    punctuation_chars = set(string.punctuation)
    words = [
        word for word in words
        if not all(char in punctuation_chars for char in word)
        and not re.search(r'\d', word)
    ]

    # remove stop words *before* normalizing: the stop list holds plain word
    # forms, which stemmed / lemmatized tokens often no longer match
    if remove_stop:
        words = [word for word in words if word not in stopset]

    if normalization == 'stemming':
        words = [snow.stem(t) for t in words] # stemming
    else:
        words = [WNlemma.lemmatize(t) for t in words] # lemmatize words (advanced stemming)

    # underscores count as word characters for the vectorizer; strip them
    joined_words = ' '.join(words).replace('_', '')

    return joined_words

Apply text cleaning to column

In [None]:
# Lemmatize every training comment in place.
# (To track progress: call tqdm.pandas() first and use progress_apply instead.)
train_data['comment_text'] = train_data['comment_text'].map(
    lambda raw_comment: clean_text(raw_comment, normalization='lemmatization')
)

In [None]:
# Quick manual check of the cleaning function on a messy sample sentence.
# Known limitation: any token containing a digit is dropped entirely.
# Possible improvement: map look-alike digits back to letters first
# (e.g. 0 -> o, 1 -> i, 7 -> t, 3 -> e).
clean_text(
    x='This is a __test FUCK 99 !! .. fUcking ObSCENE languages shitz0r.',
    normalization='lemmatization',
)

### Explore N-gram frequencies to better estimate appropriate min_df parameter for model

In [None]:
# Count every unigram and bigram over the cleaned training comments.
vect = CountVectorizer(ngram_range=(1,2))
train_vect = vect.fit_transform(train_data['comment_text'])

# Corpus-wide frequency of each n-gram (column sums), as a flat list of ints.
# Flattening via ravel() works whether the sparse sum comes back 2-D or 1-D.
dist = np.asarray(train_vect.sum(axis=0)).ravel().tolist()

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older installs.
if hasattr(vect, 'get_feature_names_out'):
    vocab = vect.get_feature_names_out().tolist()
else:
    vocab = vect.get_feature_names()

In [None]:
# Map each n-gram to its corpus frequency ...
ngram_freq = dict(zip(vocab, dist))

# ... then tally how many distinct n-grams share each frequency value.
counts = collections.Counter(ngram_freq.values())

In [None]:
# Each pair is (frequency, number of n-grams occurring with that frequency),
# e.g. (1, 2045516) means 2045516 n-grams occur exactly once.
counts.most_common(10)

### Split data

In [None]:
# Features: the cleaned comment text. Targets: the six binary toxicity labels.
X = train_data.loc[:, 'comment_text']
y = train_data.loc[
    :,
    ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
]

In [None]:
# Hold out 25% of the rows (scikit-learn's default share) for validation;
# the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

### Pipeline a few models

In [None]:
# Every candidate shares the same shape: n-gram counts -> TF-IDF weighting ->
# one independent binary classifier per label (one-vs-rest).

# Multinomial Naive Bayes
NB_clf = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 2), min_df=4, max_df=0.5, max_features=50000)),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', OneVsRestClassifier(estimator=MultinomialNB(alpha=0.01), n_jobs=-1)),
])

# Support vector classifier (probability=True so it can emit probabilities)
SVC_clf = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 2), min_df=4, max_df=0.5, max_features=50000)),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', OneVsRestClassifier(estimator=SVC(C=10, probability=True), n_jobs=-1)),
])

# Logistic regression with class re-weighting; note the smaller vocabulary cap
logistic_clf = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 2), min_df=4, max_df=0.5, max_features=15000)),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', OneVsRestClassifier(
        estimator=LogisticRegression(C=0.1, class_weight='balanced'),
        n_jobs=-1,
    )),
])

### Hyperparameter tuning

In [None]:
# Search grids, one per pipeline. A key is '<step name>__<parameter>'; for the
# wrapped per-label classifier the path is 'clf__estimator__<parameter>'.
SVC_params = {
    'vect__ngram_range': [(1, 2)],
    'tfidf__use_idf': [True],
    'clf__estimator__C': [0.1, 1, 10],
}

# Only the document-frequency cut-offs are active here. Other dimensions that
# were explored separately:
#   'vect__ngram_range': [(1,2)]
#   'vect__max_features': [25000, 50000, 100000, None]
#   'vect__max_features': [5000, 7500, 10000, 12500]
#   'tfidf__use_idf': [True]
#   'clf__estimator__C': [0.1, 0.3, 0.6, 1, 3]
#   'clf__estimator__class_weight': ['balanced', None]
#   'clf__estimator__penalty': ['l1', 'l2']
logistic_params = {
    'vect__min_df': [3, 4, 5, 6],
    'vect__max_df': [0.3, 0.4, 0.5, 0.6],
}

NB_params = {
    'vect__ngram_range': [(1, 2)],
    'tfidf__use_idf': [True],
    'clf__estimator__alpha': [0.1, 1, 10],
}

### New pipeline with optimal parameters

In [None]:
# Rebind logistic_clf to the tuned settings. Previously this pipeline was
# built but never assigned, so the validation cells that reference
# `logistic_clf` kept evaluating the earlier, untuned configuration.
logistic_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2), min_df=6, max_df=0.3, max_features=25000)),
                         ('tfidf', TfidfTransformer(use_idf=True)),
                         ('clf', OneVsRestClassifier(LogisticRegression(C=0.1, class_weight='balanced'), n_jobs=-1))])

# Keep the pipeline as the cell's last expression so it is still displayed.
logistic_clf

### Validation

In [None]:
# models to test
models = {'Logistic regression': logistic_clf,
          #'SVC': SVC_clf,
         #'Naïve Bayes': NB_clf
         }

In [None]:
# Baseline for comparison: the ROC AUC obtained when every label of every
# comment is predicted as 0.
pred = np.zeros(shape=y_test.shape)
roc_auc_score(y_true=y_test, y_score=pred)

In [None]:
# Fit each candidate on the training split, then report ROC AUC on the
# training split, the held-out split, and a 5-fold cross-validation.
for model_name, model in models.items():
    print('Training {}...'.format(model_name))
    clf = model.fit(X_train, y_train)

    # ROC AUC measures how well the scores *rank* the examples, so it must be
    # fed probabilities. Hard 0/1 output from predict() collapses the ranking
    # and gives numbers that are not comparable with the 'roc_auc'
    # cross-validation score (or with the leaderboard metric).
    y_score = clf.predict_proba(X_train)
    print('{} train ROC_AUC score: {}'.format(model_name, roc_auc_score(y_train, y_score)))

    y_score = clf.predict_proba(X_test)
    print('{} test ROC_AUC score: {}'.format(model_name, roc_auc_score(y_test, y_score)))

    # cross_val_score works on clones, so the fitted model above is untouched.
    cv_auc = cross_val_score(model, X, y, scoring='roc_auc', cv=5, n_jobs=-1).mean()
    print('{} cross validation ROC_AUC score on 5 folds: {}'.format(model_name, cv_auc))
    print('')

#### Validation log

### Final parameter tuning (manual)

In [None]:
# Configurations to compare in this run; edit the list to try other values.
tuning_configs = [
    {'C': 0.1, 'max_features': 25000, 'max_df': 0.3, 'min_df': 6},
    {'C': 0.1, 'max_features': 25000, 'max_df': 0.3, 'min_df': 5},
]

for cfg in tuning_configs:
    logistic_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2), min_df=cfg['min_df'], max_df=cfg['max_df'],
                                                      max_features=cfg['max_features'])),
                             ('tfidf', TfidfTransformer(use_idf=True)),
                             ('clf', OneVsRestClassifier(LogisticRegression(C=cfg['C'], class_weight='balanced'), n_jobs=-1))])

    cv_auc = cross_val_score(logistic_clf, X, y, scoring='roc_auc', cv=5, n_jobs=-1).mean()

    # The label is generated from cfg so it can never disagree with the
    # parameters that were actually used (the hand-written label for the
    # second run said max_df=0.4 while the pipeline used max_df=0.3).
    print('Cross_val_score with C={C}, max_features={max_features}, max_df={max_df}, min_df={min_df}: '.format(**cfg), cv_auc)

# Log of earlier runs (mean 5-fold ROC AUC):

#Cross_val_score with C=0.1:  0.9759436463859725
#Cross_val_score with C=0.01:  0.9635830622046198

#Cross_val_score with C=0.1, max_features=25000, max_df=0.5, min_df=4:  0.9760136616571229
#Cross_val_score with C=0.1, max_features=50000, max_df=0.5, min_df=4:  0.9759436463859725

#Cross_val_score with C=0.1, max_features=25000, max_df=0.3, min_df=4:  0.9762885351187846
#Cross_val_score with C=0.1, max_features=25000, max_df=0.4, min_df=4:  0.9760047347166096

#Cross_val_score with C=0.1, max_features=25000, max_df=0.3, min_df=6:  0.9763128416184881
#Cross_val_score with C=0.1, max_features=25000, max_df=0.3, min_df=5:  0.976302536076927

# Same test, but with correction in clean_text function (fixed lowercase issue):
# Cross_val_score with C=0.1, max_features=25000, max_df=0.3, min_df=6:  0.977167248668523
# Cross_val_score with C=0.1, max_features=25000, max_df=0.4, min_df=5:  0.9771515047562769
# NOTE(review): the label of the last entry was hard-coded; the pipeline in
# this cell used max_df=0.3 for the min_df=5 run - confirm which value produced it.

To do: write automated test function (start with default, set params to best so far, test one param per iteration)

### Declare model with final parameters to use

In [None]:
# Final configuration: uni-/bigram counts restricted by document frequency and
# capped at 25000 features, TF-IDF weighting, then one class-weighted logistic
# regression per label.
final_model = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 2), min_df=6, max_df=0.3, max_features=25000)),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', OneVsRestClassifier(
        estimator=LogisticRegression(C=0.1, class_weight='balanced'),
        n_jobs=-1,
    )),
])

### Make predictions for submission and save

Note: submission should be probabilities

In [None]:
# First test comment as loaded, for comparison with the cleaned version.
print(test_data['comment_text'][0])

In [None]:
# Give the test comments exactly the same preprocessing as the training data.
test_data['comment_text'] = test_data['comment_text'].map(
    lambda raw_comment: clean_text(raw_comment, normalization='lemmatization')
)

In [None]:
# The same first test comment, now after cleaning.
print(test_data['comment_text'][0])

In [None]:
# Reference layout that the submission file has to follow.
sample_submission_data.head(n=2)

In [None]:
# Train on the full labelled set, then score the test comments.
# predict_proba yields one probability column per label.
final_model.fit(X, y)
y_pred_final = final_model.predict_proba(test_data['comment_text'])

In [None]:
# Name the probability columns after the labels the model was trained on
# (y), rather than borrowing them from the validation split (y_test).
predictions = pd.DataFrame(y_pred_final, columns=y.columns)

In [None]:
# Preview the predicted probabilities for the first two test comments.
predictions.head(n=2)

In [None]:
# Put the comment ids next to their predicted label probabilities.
submission = pd.concat(objs=[test_data['id'], predictions], axis='columns')

In [None]:
# Show the first two cleaned test comments under their ids, then the matching
# submission rows. The ids are read from the data instead of being typed in,
# so the printed header always belongs to the comment shown.
print('ID {}:\n'.format(test_data['id'][0]), test_data.comment_text[0], '\n')
print('ID {}:\n'.format(test_data['id'][1]), test_data.comment_text[1])
submission.head(2)

In [None]:
# Sanity check of the submission's (rows, columns): the frame is the test ids
# plus one probability column per label, so 7 columns are expected
# (id + 6 labels). Presumably the row count should equal the number of test
# comments - verify against sample_submission_data.
submission.shape

In [None]:
# Write the submission as CSV; index=False keeps the row index out of the file.
TARGET_PATH = './data/submission_simple.csv'
submission.to_csv(path_or_buf=TARGET_PATH, index=False)

My highest public leaderboard score on Kaggle: 0.9723 (good for 3081st place)

No. 1 score on the leaderboard: 0.9889

### Bonus: most important words per label 

In [None]:
# One fitted binary classifier per label, taken from the one-vs-rest step.
all_estimators = final_model.named_steps['clf'].estimators_

# The vectorizer's vocabulary maps n-gram -> column index; invert it so a
# coefficient position can be translated back into its n-gram.
vocab = final_model.named_steps['vect'].vocabulary_
index_to_words = dict(zip(vocab.values(), vocab.keys()))

In [None]:
# How many n-grams to list at each end of the ranking. The slices and the
# printed headings both use this value, so they cannot drift apart.
TOP_N = 10

# For every label, rank all n-grams by the coefficient its classifier learned.
# Large positive weights push towards the label; the most negative weights
# (listed as "least") push away from it.
for index, label in enumerate(y.columns):
    print('Current label: {}'.format(label))
    coefs = all_estimators[index].coef_[0]

    # (n-gram, coefficient) pairs, highest coefficient first
    ranked_words = sorted(
        ((index_to_words[key], coefs[key]) for key in index_to_words),
        key=lambda pair: pair[1],
        reverse=True,
    )
    highest_weighted = ranked_words[:TOP_N]
    lowest_weighted = ranked_words[-TOP_N:]

    print('Top {} most {} words:'.format(TOP_N, label))
    for pair in highest_weighted:
        print(pair)
    print('')
    print('Top {} least {} words:'.format(TOP_N, label))
    for pair in lowest_weighted:
        print(pair)
    print('\n')