In [28]:
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re, string
from sklearn.pipeline import make_union
from random import randint
import sys, os
sys.path.append(os.path.dirname(os.getcwd()))
from score import calc_auc_score, calc_log_loss
import nbsvm

## Input

In [29]:
# read the train and test data
def readInputFiles(train_file_path, test_file_path):
    train = pd.read_csv(train_file_path)
    test = pd.read_csv(test_file_path)
    return train, test
    
train, test = readInputFiles('../dataset/train_new.csv', '../dataset/test_new.csv')

## Preprocessing Data

In [30]:
def preProcessData(trainData, testData):
    # create a list of all the labels to predict
    label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    
    # create a 'none' label so we can see how many comments have no labels
    train['none'] = 1-train[label_cols].max(axis=1)
    
    # get rid of the empty comments, otherwise sklearn complains
    COMMENT = 'comment_text'
    train[COMMENT].fillna("unknown", inplace=True)
    test[COMMENT].fillna("unknown", inplace=True)
    return label_cols

label_cols = preProcessData(train, test)
train.describe()

Unnamed: 0.1,Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,none
count,127656.0,127656.0,127656.0,127656.0,127656.0,127656.0,127656.0,127656.0
mean,79621.481724,0.095867,0.00998,0.052751,0.003165,0.049062,0.008703,0.898313
std,46090.149802,0.29441,0.0994,0.223537,0.056167,0.215997,0.092884,0.302238
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,39654.75,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,79641.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,119511.5,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,159569.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Feature Extraction

In [39]:
def tokenize(s): 
    re_tok = re.compile('([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
    return re_tok.sub(r' \1 ', s).split()

# create bag of words representation, as a term document matrix using ngrams
def wordRepresentation(trainData, testData):
    # TF-IDF gives even better priors than the binarized features. 
    # it improves leaderboard score from 0.59 to 0.55.
    
    # Vectorizer using word ngram
    word_vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
               smooth_idf=1, sublinear_tf=1, analyzer='word')

    # Vectorizer using char ngram 
    char_vec = TfidfVectorizer(ngram_range=(1,2),
                   min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
                   smooth_idf=1, sublinear_tf=1, analyzer='char')

    # use both char and word ngrams
    vec = make_union(word_vec, char_vec, n_jobs=2)

    COMMENT = 'comment_text'
        
    train_text = trainData[COMMENT]
    test_text = testData[COMMENT]

    all_text = pd.concat([train_text, test_text])
    vec.fit(all_text)
    
    # Extracting features from the training data using a sparse vectorizer"
    train_term_doc = vec.fit_transform(train_text)
    
    # Extracting features from the test data using the same vectorizer
    test_term_doc = vec.transform(test_text)


    # a sparse matrix with only a small number of non-zero elements with the below shape
    print("n_comments_train: %d, n_features: %d" % train_term_doc.shape)
    print("n_comments_test: %d, n_features: %d" % test_term_doc.shape)
    
    return train_term_doc, test_term_doc, vec
    
train_term_doc, test_term_doc, vec = wordRepresentation(train, test)

n_comments_train: 127656, n_features: 71274
n_comments_test: 31915, n_features: 71274


## Basic Naive Bayes Model

In [31]:
train_x = train_term_doc
test_x = test_term_doc
reload (nbsvm)

<module 'nbsvm' from 'nbsvm.pyc'>

### Create the Model

In [33]:
model = nbsvm.train_model(train, train_x, label_cols, "baseline_nb_svm_word_char_ngrams.pkl")

('fitting', 'toxic')
('fitting', 'severe_toxic')
('fitting', 'obscene')
('fitting', 'threat')
('fitting', 'insult')
('fitting', 'identity_hate')


### Predict from the created model, or load it from file

In [34]:
try:
    mdl = model   
except NameError: 
    mdl = joblib.load("baseline_nb_svm_word_char_ngrams.pkl")
preds = nbsvm.get_preds_from_model(mdl, test_x, label_cols)

## Save results to a file

In [22]:
# create the submission file and store the results
def saveResults():
    submission = pd.DataFrame.from_dict({'id': test['id']})
    for idx, col in enumerate(label_cols):
        submission[col] = predictions[:,idx]
    submission.to_csv('submission_word_char_ngram_new.csv', index=False)
    
saveResults()

### Calculate Log loss / AUC Score

In [37]:
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

try: 
    true = test
except NameError:
    true = pd.read_csv('../dataset/test_new.csv')
try: 
    y_pred = preds
except NameError:
    pred = pd.read_csv('submission_word_char_ngram_new.csv')
    y_pred = pred[list_classes].values

y_true = true[list_classes].values

loss = calc_log_loss(y_true, y_pred)
print ("Log Loss = ", loss)
auc = calc_auc_score(y_true, y_pred)
print ("AUC = ", auc)

('Log Loss = ', 0.06807726111810626)
('AUC = ', 0.9563577290265908)
