## Introduction

This kernel shows how to use NBSVM (Naive Bayes - Support Vector Machine) to create a strong baseline. In this kernel, we use sklearn's logistic regression, rather than SVM, although in practice the two are nearly identical (sklearn uses the liblinear library behind the scenes).

In [1]:
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re, string
from sklearn.externals import joblib
import sys, os
sys.path.append(os.path.dirname(os.getcwd()))
from score import calc_auc_score, calc_log_loss
import nbsvm

## Take Input

In [2]:
# read the train and test data
def readInputFiles(train_file_path, test_file_path):
    train = pd.read_csv(train_file_path)
    test = pd.read_csv(test_file_path)
    return train, test
    
train, test = readInputFiles('../dataset/train_new.csv', '../dataset/test_new.csv')

## Preprocessing Data

In [4]:
COMMENT = 'comment_text'

def preProcessData(trainData, testData):
    # create a list of all the labels to predict
    label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    
    # create a 'none' label so we can see how many comments have no labels
    train['none'] = 1-train[label_cols].max(axis=1)
    
    train.describe()
    
    # get rid of the empty comments, otherwise sklearn complains
    train[COMMENT].fillna("unknown", inplace=True)
    test[COMMENT].fillna("unknown", inplace=True)
    return label_cols

label_cols = preProcessData(train, test)
train.head()

Unnamed: 0.1,Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,none
0,140030,ed56f082116dcbd0,Grandma Terri Should Burn in Trash \nGrandma T...,1,0,0,0,0,0,0
1,159124,f8e3cd98b63bf401,", 9 May 2009 (UTC)\nIt would be easiest if you...",0,0,0,0,0,0,1
2,60006,a09e1bcf10631f9a,"""\n\nThe Objectivity of this Discussion is dou...",0,0,0,0,0,0,1
3,65432,af0ee0066c607eb8,Shelly Shock\nShelly Shock is. . .( ),0,0,0,0,0,0,1
4,154979,b734772b1a807e09,I do not care. Refer to Ong Teng Cheong talk p...,0,0,0,0,0,0,1


## Feature Extraction

In [5]:
def tokenize(s): 
    re_tok = re.compile('([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
    return re_tok.sub(r' \1 ', s).split()

# create bag of words representation, as a term document matrix using ngrams
def wordRepresentation(trainData, testData):
    # TF-IDF gives even better priors than the binarized features. 
    # it improves leaderboard score from 0.59 to 0.55.
    
    vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
                   min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
                   smooth_idf=1, sublinear_tf=1 )
    
      # Extracting features from the training data using a sparse vectorizer"
    train_term_doc = vec.fit_transform(trainData[COMMENT])

    # Extracting features from the test data using the same vectorizer
    test_term_doc = vec.transform(testData[COMMENT])

    # a sparse matrix with only a small number of non-zero elements with the below shape
    print("n_comments_train: %d, n_features: %d" % train_term_doc.shape)
    print("n_comments_test: %d, n_features: %d" % test_term_doc.shape)
    
    return train_term_doc, test_term_doc, vec
    
train_term_doc, test_term_doc, vec = wordRepresentation(train, test)

n_comments_train: 127656, n_features: 64631
n_comments_test: 31915, n_features: 64631


## Basic Naive Bayes Model

In [6]:
train_x = train_term_doc
test_x = test_term_doc

### Create the Model

In [6]:
model = nbsvm.train_model(train, train_x, label_cols, "baseline_nb_svm_2.pkl")

('fitting', 'toxic')
('fitting', 'severe_toxic')
('fitting', 'obscene')
('fitting', 'threat')
('fitting', 'insult')
('fitting', 'identity_hate')
{0: (matrix([[ 0.79319337,  1.10221879, -0.04566331, ...,  0.9486203 ,
          2.67195444,  1.66506689]]), LogisticRegression(C=4, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), 1: (matrix([[0.99188468, 3.45464817, 0.57473657, ..., 3.30104969, 4.23525492,
         4.01749628]]), LogisticRegression(C=4, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), 2: (matrix([[0.78087074, 2.01754999, 0.10412072, ..., 1.59251506, 2.52672029,
         2.30896164]]), LogisticRegres

### Predict from the created model, or load it from file

In [9]:
mdl = joblib.load("baseline_nb_svm.pkl")
preds = nbsvm.get_preds_from_model(mdl, test_x, label_cols)

## Save Results to a file

In [None]:
# create the submission file and store the results
def saveResults(predictions):
    submission = pd.DataFrame.from_dict({'id': test['id']})
    for idx, col in enumerate(label_cols):
        submission[col] = predictions[:,idx]
    submission.to_csv('submission_baseline_2.csv', index=False)
    
saveResults(preds)

## Calculate Log loss / AUC Score

In [None]:
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

try: 
    true = test
except NameError:
    true = pd.read_csv('../dataset/test_new.csv')
try: 
    y_pred = preds
except NameError:
    pred = pd.read_csv('submission_baseline.csv')
    y_pred = pred[list_classes].values

y_true = true[list_classes].values

loss = calc_log_loss(y_true, y_pred)
print ("Log Loss = ", loss)
auc = calc_auc_score(y_true, y_pred)
print ("AUC = ", auc)

## Demo

In [21]:
# for demo purposes only
def demo(vectorizer, label_cols, mdl):
    testing_comment = input("Enter a comment: ")

    # testing_comment = ["Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. Ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time."]

    # vectorizer.transform takes list input so pass a list
    user_comment = []
    user_comment.append(testing_comment)
    
    # Extracting features from the test data using the vectorizer
    test_data_x = vectorizer.transform(user_comment)
    
    # to store the predictions
    prediction = np.zeros((1, len(label_cols)))

    # make prediction using the model created
    for i, j in enumerate(label_cols):
        r = mdl[i][0]
        m = mdl[i][1]
        prediction[0,i] = m.predict_proba(test_data_x.multiply(r))[0,1]
    
    # copy the result and display
    pred_y = pd.DataFrame(columns=label_cols)
    for idx, col in enumerate(label_cols):
        pred_y.at[0, col] = prediction[0,idx]  
     
    print ("")
    print (pred_y)
#     # The String I used above is for the first comment, that is, result of loc 0 in submission file
#     print ("This is the actual result: ", submission.loc[0])

demo(vec, label_cols, mdl)      

Enter a comment: "jews are nice"

      toxic severe_toxic    obscene      threat     insult identity_hate
0  0.285182   0.00320251  0.0269435  0.00446865  0.0429508      0.858546
