### https://www.kaggle.com/abhishek/approaching-almost-any-nlp-problem-on-kaggle/notebook

In [85]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

In [4]:
# Load the two competition CSV files from the local NLP_author/ directory
# into pandas DataFrames.
train = pd.read_csv('NLP_author/train.csv')
test = pd.read_csv('NLP_author/test.csv')

In [5]:
# Preview the first rows of the training frame (columns: id, text, author).
train.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [13]:
# Encode the author strings (e.g. EAP, HPL, MWS) as integer class ids.
# The fitted encoder stays in lbl_enc; y holds one integer label per row.
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(train.author.values)
# Display the encoded labels together with their count.
y, len(y)

(array([0, 1, 0, ..., 0, 0, 1], dtype=int64), 19579)

In [28]:
# Hold out 10% of the rows as a validation set. The split is stratified on
# the encoded labels and uses a fixed random_state so it is repeatable.
xtrain, xvalid, ytrain, yvalid = train_test_split(train.text.values, y, 
                                                  stratify=y, 
                                                  random_state=42, 
                                                  test_size=0.1)
# Sanity check: container type and the sizes of the four resulting arrays.
type(xtrain), len(xtrain), len(ytrain), len(xvalid), len(yvalid)

(numpy.ndarray, 17621, 17621, 1958, 1958)

In [30]:
# Both text arrays are 1-D: one raw sentence string per element.
print (xtrain.shape)
print (xvalid.shape)

(17621,)
(1958,)


In [33]:
# Always start with these features. They work (almost) every time!
# Word-level TF-IDF over uni-, bi- and tri-grams, keeping only n-grams that
# occur in at least 3 documents, with English stop words removed.
tfv = TfidfVectorizer(
    min_df=3,
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    # These three options are documented as booleans. Recent scikit-learn
    # releases validate parameter types and reject the ints (1) used before;
    # True behaves identically on older releases.
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    stop_words='english',
)

# The vocabulary and IDF weights are learned from the training AND validation
# texts (labels are never used). The held-out test.csv is not involved here.
# NOTE(review): fitting on the validation texts can make validation scores
# slightly optimistic; fit on xtrain only for a strict hold-out estimate.
tfv.fit(list(xtrain) + list(xvalid))
xtrain_tfv = tfv.transform(xtrain)
xvalid_tfv = tfv.transform(xvalid)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [52]:
# Inspect the fitted vectorizer. get_feature_names is NOT called here, so the
# first element is just the type of the method object; vocabulary_ is a dict
# and its length is the number of TF-IDF features.
type(tfv.get_feature_names), type(tfv.vocabulary_), len(tfv.vocabulary_)

(method, dict, 15102)

In [42]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi-class version of the logarithmic loss metric.

    :param actual: Array-like of true targets. Either a 1-D sequence of
        integer class indices, or a 2-D binary indicator (one-hot) matrix
        with one row per sample and one column per class.
    :param predicted: Array-like matrix of class predictions, one row per
        sample and one probability per class.
    :param eps: Probabilities are clipped to ``[eps, 1 - eps]`` before the
        logarithm is taken, so that ``log(0)`` can never occur.
    :return: The mean negative log-likelihood over all rows.
    """
    # Accept plain Python lists (and other array-likes) as well as ndarrays.
    actual = np.asarray(actual)
    predicted = np.asarray(predicted)

    # Convert 'actual' to a binary indicator matrix if it holds class indices.
    if actual.ndim == 1:
        one_hot = np.zeros((actual.shape[0], predicted.shape[1]))
        # Vectorized one-hot encoding: set column actual[i] in row i.
        one_hot[np.arange(actual.shape[0]), actual] = 1
        actual = one_hot

    clipped = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    log_likelihood = np.sum(actual * np.log(clipped))
    return -1.0 / rows * log_likelihood

In [55]:
# Baseline: fit a simple Logistic Regression (C=1.0) on the TF-IDF features
# and predict per-class probabilities for the validation rows.
clf = LogisticRegression(C=1.0)
clf.fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)

# For reference: xvalid_tfv is a 1958x15102 sparse matrix of numpy.float64.

# Score the predicted probabilities against the validation labels.
print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.626 


In [80]:
# Raw term counts over word uni-, bi- and tri-grams with English stop words
# removed. No min_df is set here, unlike the TF-IDF vectorizer.
ctv = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), stop_words = 'english')

# The vocabulary is learned from the training AND validation texts (labels are
# never used). The held-out test.csv is not involved here.
ctv.fit(list(xtrain) + list(xvalid))
xtrain_ctv =  ctv.transform(xtrain) 
xvalid_ctv = ctv.transform(xvalid)

In [81]:
# Inspect the count matrix: its shape, a dense view of the first 100 rows and
# the n-gram -> column-index vocabulary.
# NOTE(review): this densifies a 100 x 400266 int64 slice and prints the whole
# vocabulary dict; consider showing len(ctv.vocabulary_) and a small sample.
xtrain_ctv.shape, xtrain_ctv[0:100].toarray(), ctv.vocabulary_
# The number of columns depends on ngram_range (more n-gram orders -> more features).

((17621, 400266), array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64), {'hair': 153806,
  'brightest': 38996,
  'living': 203649,
  'gold': 146831,
  'despite': 84859,
  'poverty': 266371,
  'clothing': 56419,
  'set': 312719,
  'crown': 71852,
  'distinction': 92662,
  'head': 158208,
  'hair brightest': 153829,
  'brightest living': 39003,
  'living gold': 203713,
  'gold despite': 146842,
  'despite poverty': 84925,
  'poverty clothing': 266382,
  'clothing set': 56435,
  'set crown': 312787,
  'crown distinction': 71857,
  'distinction head': 92677,
  'hair brightest living': 153830,
  'brightest living gold': 39004,
  'living gold despite': 203714,
  'gold despite poverty': 146843,
  'despite poverty clothing': 84926,
  'poverty clothing set': 266383,
  'clothing set crown': 56436,
  'set crown distinction': 312

## Logistic Regression on Counts

In [82]:
# Fit a simple Logistic Regression (C=1.0) on the raw n-gram counts and
# predict per-class probabilities for the validation rows.
clf = LogisticRegression(C=1.0)
clf.fit(xtrain_ctv, ytrain)
predictions = clf.predict_proba(xvalid_ctv)

# Display the type and summary repr of the training count matrix.
type(xtrain_ctv), xtrain_ctv

(scipy.sparse.csr.csr_matrix,
 <17621x400266 sparse matrix of type '<class 'numpy.int64'>'
 	with 556265 stored elements in Compressed Sparse Row format>)

In [84]:
# Score the count-based Logistic Regression predictions from the previous cell.
print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))
# Author's note: removing ngram_range=(1, 3) from the CountVectorizer (i.e.
# falling back to its default n-gram range) leads to a log loss of 0.498.

logloss: 0.528 


## Multinomial Naive Bayes

In [87]:
# Fit a Multinomial Naive Bayes model (default settings) on the TF-IDF
# features and predict per-class probabilities for the validation rows.
clf = MultinomialNB()
clf.fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)

# Score the predicted probabilities against the validation labels.
print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.578 


In [88]:
# Fit a Multinomial Naive Bayes model (default settings) on the raw n-gram
# counts and predict per-class probabilities for the validation rows.
clf = MultinomialNB()
clf.fit(xtrain_ctv, ytrain)
predictions = clf.predict_proba(xvalid_ctv)

# Score the predicted probabilities against the validation labels.
print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.485 


In [91]:
# Display the training labels next to the latest predicted probabilities.
# NOTE(review): predictions were computed on the validation rows (xvalid_ctv),
# so they do not align row-for-row with ytrain; yvalid is probably the
# intended companion here -- confirm.
ytrain, predictions

(array([2, 0, 2, ..., 0, 2, 2], dtype=int64),
 array([[9.99977284e-01, 6.20152092e-07, 2.20954082e-05],
        [8.63767993e-01, 1.76937243e-02, 1.18538282e-01],
        [9.14717223e-01, 1.30711613e-03, 8.39756610e-02],
        ...,
        [4.85386704e-02, 1.91127593e-02, 9.32348570e-01],
        [3.41833956e-02, 6.30451245e-02, 9.02771480e-01],
        [1.51720630e-10, 9.99999961e-01, 3.90218966e-08]]))