# Introduction

This kernel shows how to use NBSVM (Naive Bayes - Support Vector Machine) to create a strong baseline. In this kernel, we use sklearn's logistic regression, rather than SVM, although in practice the two are nearly identical (sklearn uses the liblinear library behind the scenes).

In [1]:
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re, string
from sklearn.externals import joblib
from sklearn.pipeline import make_union
import sys, os
sys.path.append(os.path.dirname(os.getcwd()))
from score import calc_auc_score, calc_log_loss
import nbsvm

# Take Input

In [2]:
# read the train and test data
def readInputFiles(train_file_path, test_file_path):
    train = pd.read_csv(train_file_path)
    test = pd.read_csv(test_file_path)
    return train, test
    
train, test = readInputFiles('../dataset/train_new.csv', '../dataset/test_new.csv')

# Preprocessing Data

In [3]:
COMMENT = 'comment_text'

def preProcessData(trainData, testData):
    # create a list of all the labels to predict
    label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    
    # create a 'none' label so we can see how many comments have no labels
    train['none'] = 1-train[label_cols].max(axis=1)
    
    train.describe()
    
    # get rid of the empty comments, otherwise sklearn complains
    train[COMMENT].fillna("unknown", inplace=True)
    test[COMMENT].fillna("unknown", inplace=True)
    return label_cols

label_cols = preProcessData(train, test)
train.head()

Unnamed: 0.1,Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,none
0,140030,ed56f082116dcbd0,Grandma Terri Should Burn in Trash \nGrandma T...,1,0,0,0,0,0,0
1,159124,f8e3cd98b63bf401,", 9 May 2009 (UTC)\nIt would be easiest if you...",0,0,0,0,0,0,1
2,60006,a09e1bcf10631f9a,"""\n\nThe Objectivity of this Discussion is dou...",0,0,0,0,0,0,1
3,65432,af0ee0066c607eb8,Shelly Shock\nShelly Shock is. . .( ),0,0,0,0,0,0,1
4,154979,b734772b1a807e09,I do not care. Refer to Ong Teng Cheong talk p...,0,0,0,0,0,0,1


# Feature Extraction

In [4]:
def tokenize(s): 
    re_tok = re.compile('([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
    return re_tok.sub(r' \1 ', s).split()

In [5]:
# create bag of words representation, as a term document matrix using ngrams
def wordRepresentation(trainData, testData, word=True, char=False):
    # TF-IDF gives even better priors than the binarized features. 
    # it improves leaderboard score from 0.59 to 0.55.
    
     # Vectorizer using word ngram
    word_vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
               smooth_idf=1, sublinear_tf=1, analyzer='word')

    # Vectorizer using char ngram 
    char_vec = TfidfVectorizer(ngram_range=(2,4),
                   min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
                   smooth_idf=1, sublinear_tf=1, analyzer='char')

    # use both char and word ngrams
    if word and char:
        vec = make_union(word_vec, char_vec, n_jobs=2)    
    elif char:
        vec = char_vec
    else:
        vec = word_vec
    
    # Extracting features from the training data using a sparse vectorizer"
    train_term_doc = vec.fit_transform(trainData[COMMENT])

    # Extracting features from the test data using the same vectorizer
    test_term_doc = vec.transform(testData[COMMENT])

    return train_term_doc, test_term_doc, vec

# Save results to a file

In [6]:
# create the submission file and store the results
def saveResults(predictions, filename):
    submission = pd.DataFrame.from_dict({'id': test['id']})
    for idx, col in enumerate(label_cols):
        submission[col] = predictions[:,idx]
    submission.to_csv(filename, index=False)

# Calculate Log loss / AUC Score

In [7]:
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

In [8]:
def get_scores(test, preds, fallback_preds_filename):
    try: 
        true = test
    except NameError:
        true = pd.read_csv('../dataset/test_new.csv')
    try: 
        y_pred = preds
    except NameError:
        pred = pd.read_csv(fallback_preds_filename)
        y_pred = pred[list_classes].values

    y_true = true[list_classes].values

    loss = calc_log_loss(y_true, y_pred)
    auc = calc_auc_score(y_true, y_pred)
    return loss, auc

# Predict Model

In [9]:
def predict(model_file, train_x, test_x):
    # Train the model only if it doesn't exist already
    if os.path.isfile(model_file):
        print ("Loading model from saved file " + model_file)
        mdl = joblib.load(model_file)   
    else:
        print ("Model doesn't exist. Training mode and saving in file " + model_file)
        mdl = nbsvm.train_model(train, train_x, label_cols, model_file)
    preds = nbsvm.get_preds_from_model(mdl, test_x, label_cols)
    print ("Done predicting")
    return preds

# Extract Features for Word n-grams, Char n-grams and Word+Char n-grams

In [11]:
train_term_doc = [None] * 3
test_term_doc = [None] * 3
vec = [None] * 3

# Get Baseline NB-SVM Model (Word n-grams)
train_term_doc[0], test_term_doc[0], vec[0] = wordRepresentation(train, test)

# Get NB-SVM Model with Character n-grams
train_term_doc[1], test_term_doc[1], vec[1] = wordRepresentation(train, test, word=False, char=True)

# Get NB-SVM Model with Character and Word n-grams
train_term_doc[2], test_term_doc[2], vec[2] = wordRepresentation(train, test, word=True, char=True)

## Basic Naive Bayes Model

In [12]:
train_x = train_term_doc[0]
test_x = test_term_doc[0]
model_file = "baseline_nb_svm_v2.pkl"
results_file = "submission_baseline_v2.csv"
preds = predict(model_file, train_x, test_x)
saveResults(preds, results_file)
loss_basic, auc_basic = get_scores(test, preds, fallback_preds_filename=results_file)

Model doesn't exist. Training mode and saving in file baseline_nb_svm_v2.pkl
fitting toxic
fitting severe_toxic
fitting obscene
fitting threat
fitting insult
fitting identity_hate
Done predicting


## Character n-grams Naive Bayes Model

In [None]:
train_x = train_term_doc[1]
test_x = test_term_doc[1]
model_file = "baseline_nb_svm_char_ngrams_v2.pkl"
results_file = "submission_baseline_char_ngrams_v2.csv"
preds = predict(model_file, train_x, test_x)
saveResults(preds, results_file)
loss_char_ngrams, auc_char_ngrams = get_scores(test, preds, fallback_preds_filename=results_file)

Model doesn't exist. Training mode and saving in file baseline_nb_svm_char_ngrams_v2.pkl
fitting toxic
fitting severe_toxic


## Word + Char n-grams Naive Bayes Model

In [None]:
train_x = train_term_doc[2]
test_x = test_term_doc[2]
model_file = "baseline_nb_svm_wordchar_ngrams_v2.pkl"
results_file = "submission_baseline_wordchar_ngrams_v2.csv"
preds = predict(model_file, train_x, test_x)
saveResults(preds, results_file)
loss_wordchar_ngrams, auc_wordchar_ngrams = get_scores(test, preds, fallback_preds_filename=results_file)

# Get AUC Scores for All 3 NB Models

In [20]:
print ("For Baseline")
print ("Log loss =", loss_basic)
print ("AUC Score = ", auc_basic)

For Baseline
('Log loss =', 0.07104368810823293)
('AUC Score = ', 0.9510252050191966)


In [21]:
print ("For Baseline")
print ("Log loss =", loss_char_ngrams)
print ("AUC Score = ", auc_char_ngrams)

For Baseline
('Log loss =', 0.07261229052475593)
('AUC Score = ', 0.9494655611791685)


In [22]:
print ("For Baseline")
print ("Log loss =", loss_wordchar_ngrams)
print ("AUC Score = ", auc_wordchar_ngrams)

For Baseline
('Log loss =', 0.06807725729517826)
('AUC Score = ', 0.9563577271367983)
