In [5]:
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re, string
from sklearn.pipeline import make_union
from random import randint

## Input

In [2]:
# read the train and test data
def readInputFiles(train_file_path, test_file_path):
    """Load the training and test CSV files into pandas DataFrames.

    Parameters
    ----------
    train_file_path : str or path-like
        Location of the training CSV.
    test_file_path : str or path-like
        Location of the test CSV.

    Returns
    -------
    tuple
        ``(train_frame, test_frame)``, in that order.
    """
    # The training file is read first, then the test file.
    frames = tuple(pd.read_csv(csv_path) for csv_path in (train_file_path, test_file_path))
    return frames
    
# Load both CSVs from the relative ../input directory. `train` and `test` are
# notebook-level globals that later cells read directly.
train, test = readInputFiles('../input/train.csv', '../input/test.csv')

## Preprocessing Data

In [3]:
def preProcessData(trainData, testData):
    """Add a 'none' indicator column and fill in missing comment text.

    Both frames are modified in place: ``trainData`` gains a ``'none'`` column
    and the ``'comment_text'`` column of each frame has its missing values
    replaced by the string ``"unknown"``.

    Parameters
    ----------
    trainData : pandas.DataFrame
        Training frame; must contain the six label columns listed below and
        a ``'comment_text'`` column.
    testData : pandas.DataFrame
        Test frame; must contain a ``'comment_text'`` column.

    Returns
    -------
    list of str
        The names of the label columns to predict.
    """
    # create a list of all the labels to predict
    label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

    # create a 'none' label so we can see how many comments have no labels:
    # it is 1 exactly when the row-wise maximum over the labels is 0.
    trainData['none'] = 1 - trainData[label_cols].max(axis=1)

    # get rid of the empty comments, otherwise sklearn complains.
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # a column selection, which is chained assignment and may not update the
    # parent frame (pandas copy-on-write).
    COMMENT = 'comment_text'
    for frame in (trainData, testData):
        frame[COMMENT] = frame[COMMENT].fillna("unknown")
    return label_cols

# Preprocess the loaded frames and keep the list of label column names,
# which later cells use as a notebook-level global.
label_cols = preProcessData(train, test)
# Display summary statistics for the training frame (cell output).
train.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,none
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805,0.898321
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342,0.302226
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Feature Extraction

In [7]:
# Compiled once at definition time instead of on every call. The ASCII
# punctuation is passed through re.escape so that characters with a special
# meaning inside a character class (notably the backslash, which previously
# only served to escape the following ']') are matched literally.
_RE_TOK = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(s):
    """Split a string into whitespace-separated tokens, isolating punctuation.

    Every punctuation character matched by ``_RE_TOK`` is surrounded with
    spaces so that it becomes its own token; the result is then split on
    whitespace.

    Parameters
    ----------
    s : str
        Text to tokenize.

    Returns
    -------
    list of str
        The tokens; an empty list for an empty or whitespace-only string.
    """
    return _RE_TOK.sub(r' \1 ', s).split()

# create bag of words representation, as a term document matrix using ngrams
def wordRepresentation(trainData, testData):
    """Build sparse TF-IDF features from word and character n-grams.

    A word-level and a character-level ``TfidfVectorizer`` (both using 1- and
    2-grams) are combined with ``make_union``. The union is fitted on the
    training comments and then applied to the test comments, so both matrices
    share the same feature columns.

    Parameters
    ----------
    trainData : pandas.DataFrame
        Training frame with a ``'comment_text'`` column.
    testData : pandas.DataFrame
        Test frame with a ``'comment_text'`` column.

    Returns
    -------
    tuple
        ``(train_term_doc, test_term_doc, vec)``: the two sparse
        document-term matrices and the fitted feature union.
    """
    # Original author's note: TF-IDF gives even better priors than the
    # binarized features; it improves leaderboard score from 0.59 to 0.55.

    # Settings shared by both vectorizers. The flags are real booleans because
    # newer scikit-learn releases validate them as such and reject the ints
    # that were used previously.
    tfidf_options = dict(ngram_range=(1, 2), min_df=3, max_df=0.9,
                         strip_accents='unicode', use_idf=True,
                         smooth_idf=True, sublinear_tf=True)

    # Vectorizer using word ngram. token_pattern=None makes it explicit that
    # the custom tokenizer is used rather than the default token regex.
    word_vec = TfidfVectorizer(analyzer='word', tokenizer=tokenize,
                               token_pattern=None, **tfidf_options)

    # Vectorizer using char ngram
    char_vec = TfidfVectorizer(analyzer='char', **tfidf_options)

    # use both char and word ngrams
    vec = make_union(word_vec, char_vec, n_jobs=2)

    COMMENT = 'comment_text'

    train_text = trainData[COMMENT]
    test_text = testData[COMMENT]

    # Extracting features from the training data using a sparse vectorizer.
    # A preliminary vec.fit() on train+test used to precede this call, but
    # fit_transform refits from scratch, so that pass was pure wasted work
    # and has been removed; the resulting features are unchanged.
    train_term_doc = vec.fit_transform(train_text)

    # Extracting features from the test data using the same vectorizer
    test_term_doc = vec.transform(test_text)

    # a sparse matrix with only a small number of non-zero elements with the below shape
    print("n_comments_train: %d, n_features: %d" % train_term_doc.shape)
    print("n_comments_test: %d, n_features: %d" % test_term_doc.shape)

    return train_term_doc, test_term_doc, vec
    
# Build the TF-IDF matrices for both frames and keep the fitted vectorizer;
# the shapes are printed by the function (see cell output).
train_term_doc, test_term_doc, vec = wordRepresentation(train, test)

n_comments_train: 159571, n_features: 432572
n_comments_test: 153164, n_features: 432572


## Basic Naive Bayes Model

In [8]:
def pr(y_i, y, features=None):
    """Smoothed per-feature sum for the rows whose label equals ``y_i``.

    Computes ``(sum of feature rows where y == y_i, plus 1) /
    (number of rows where y == y_i, plus 1)``.

    Parameters
    ----------
    y_i : int
        The label value to select (the notebook passes 0 or 1).
    y : array-like
        Label vector with one entry per row of the feature matrix.
    features : matrix, optional
        Feature matrix to aggregate. When omitted, the notebook-level global
        ``x`` is used, which keeps existing two-argument calls working.

    Returns
    -------
    The column-wise result of the expression above, in whatever matrix type
    ``features[mask].sum(0)`` produces.
    """
    if features is None:
        # Backward-compatible fallback: `x` is assigned in a later cell and
        # must exist by the time this function is called.
        features = x
    mask = (y == y_i)
    p = features[mask].sum(0)
    return (p + 1) / (mask.sum() + 1)

In [9]:
# Short aliases for the feature matrices. `x` is read as a global by pr() and
# get_mdl(); `test_x` is read as a global by getPredictions().
x = train_term_doc
test_x = test_term_doc

In [10]:
def get_mdl(y):
    """Fit a logistic regression on log-count-ratio-scaled features for one label.

    Reads the notebook-level global ``x`` (the training feature matrix) and
    calls ``pr`` to obtain the smoothed per-feature statistics for each class.

    Parameters
    ----------
    y : pandas.Series
        Binary target column for a single label; ``.values`` is taken.

    Returns
    -------
    tuple
        ``(model, r)``: the fitted ``LogisticRegression`` and the log-ratio
        ``r`` by which feature columns were scaled. The same ``r`` must be
        applied to any matrix passed to the model for prediction.
    """
    y = y.values
    # Per-feature log ratio between the positive and negative class.
    r = np.log(pr(1, y) / pr(0, y))
    # The dual formulation is only implemented by the liblinear solver. It
    # used to be scikit-learn's default, but the default is now lbfgs, which
    # raises on dual=True -- so the solver is pinned explicitly.
    m = LogisticRegression(C=4, dual=True, solver='liblinear')
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r

In [11]:
# fit model and make predictions
def getPredictions(test, label_cols):
    """Train one model per label and collect positive-class probabilities.

    Relies on the notebook-level globals ``train`` (source of the target
    columns) and ``test_x`` (test feature matrix), and on ``get_mdl``.

    Parameters
    ----------
    test : sized
        Only its length is used, to size the output array.
    label_cols : list of str
        Names of the label columns to fit, in output column order.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(test), len(label_cols))`` whose column ``k``
        holds the predicted probability of class 1 for ``label_cols[k]``.
    """
    n_rows, n_labels = len(test), len(label_cols)
    preds = np.zeros((n_rows, n_labels))

    for col_idx in range(n_labels):
        label = label_cols[col_idx]
        print('fitting', label)
        model, log_ratio = get_mdl(train[label])

        # Scale the test features with the same ratio used during fitting,
        # then keep the probability of the positive class.
        scaled_test = test_x.multiply(log_ratio)
        preds[:, col_idx] = model.predict_proba(scaled_test)[:, 1]

    return preds
    
# Fit one model per label and store the probability matrix; `predictions`
# is read as a global by saveResults().
predictions = getPredictions(test, label_cols)

fitting toxic
fitting severe_toxic
fitting obscene
fitting threat
fitting insult
fitting identity_hate


## Save results to a file

In [12]:
# create the submission file and store the results
def saveResults():
    """Write the predictions to ``submission.csv`` in the working directory.

    Reads the notebook-level globals ``test`` (for the ``'id'`` column),
    ``label_cols`` (output column names) and ``predictions`` (one column of
    scores per label, in the same order as ``label_cols``). The file is
    written without the DataFrame index.
    """
    submission = pd.DataFrame({'id': test['id']})
    for position in range(len(label_cols)):
        column_name = label_cols[position]
        submission[column_name] = predictions[:, position]
    submission.to_csv('submission.csv', index=False)
    
# Write submission.csv from the globals `test`, `label_cols` and `predictions`.
saveResults()