# Installing the needed libraries.

In [2]:
!pip install git+https://github.com/TeamHG-Memex/eli5@0.8.2

Collecting git+https://github.com/TeamHG-Memex/eli5@0.8.2
  Cloning https://github.com/TeamHG-Memex/eli5 (to revision 0.8.2) to c:\users\oleg\appdata\local\temp\pip-req-build-wdsfo5xq
Collecting typing (from eli5==0.8.2)
  Downloading https://files.pythonhosted.org/packages/05/d9/6eebe19d46bd05360c9a9aae822e67a80f9242aabbfc58b641b957546607/typing-3.7.4.3.tar.gz (78kB)
Building wheels for collected packages: eli5, typing
  Building wheel for eli5 (setup.py): started
  Building wheel for eli5 (setup.py): finished with status 'done'
  Created wheel for eli5: filename=eli5-0.8.2-py2.py3-none-any.whl size=94040 sha256=85cea1a523e42aa71518e1d1021d4aabc85c50be8cd9e95f6863e9d12ece740d
  Stored in directory: C:\Users\Oleg\AppData\Local\Temp\pip-ephem-wheel-cache-arblboar\wheels\d6\9e\ef\cf023b7bfc3f7a988a27ddefb645f3f33e6019be21f1c6af54
  Building wheel for typing (setup.py): started
  Building wheel for typing (setup.py): finished with status 'done'
  Created wheel for typing: filename=typing-

  Running command git clone -q https://github.com/TeamHG-Memex/eli5 'C:\Users\Oleg\AppData\Local\Temp\pip-req-build-wdsfo5xq'
  Running command git checkout -q 0b226d76bde4b71125a5ab1e701a8dd5522b2f86


In [3]:
import nltk
nltk.download(['stopwords', 'punkt', 'wordnet'])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Oleg\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Oleg\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Oleg\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Loading needed libs

In [4]:
import pandas as pd
import numpy as np
from scipy import sparse

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

from tqdm import tqdm_notebook, tqdm

import os
import itertools

import warnings
warnings.filterwarnings('ignore')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer, FunctionTransformer

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.multiclass import OneVsRestClassifier

from sklearn.decomposition import IncrementalPCA as iPCA, TruncatedSVD

from sklearn.metrics import roc_auc_score, brier_score_loss, make_scorer
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_validate


from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import re
from nltk.corpus import stopwords
 
cachedStopWords = stopwords.words("english")

from collections import Counter
import string
from textblob import TextBlob, Word
from random import shuffle

from bs4 import BeautifulSoup
from eli5.lime import TextExplainer
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from tempfile import mkdtemp
from shutil import rmtree
from sklearn.externals.joblib import Memory

# Loading the data

Download https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews  
Files->Upload to session storage

In [6]:
data = pd.read_csv('IMDB Dataset.csv')
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
data.rename(columns={'sentiment': 'target'}, inplace=True)
data['target'].replace({'positive': 1, 'negative': 0}, inplace=True)
data.head()

Unnamed: 0,review,target
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [8]:
data['target'].value_counts()

1    25000
0    25000
Name: target, dtype: int64

# Feature extraction

In [9]:
def show_mean_feature_value_for_both(col, normalize_by_word_count=False):
    print('Average {} in positive review: {}'.format(col, data[data['target'] == 1][col].mean()))
    print('Average {} in negative review: {}'.format(col, data[data['target'] == 0][col].mean()))
    if normalize_by_word_count:
        print('Average ratio of {} in positive review: {}'.format(col, data[data['target'] == 1][col].mean() / data[data['target'] == 1]['word_count'].mean()))
        print('Average ratio of {} in positive review: {}'.format(col, data[data['target'] == 0][col].mean() / data[data['target'] == 0]['word_count'].mean()))

#### Number of words

Check if number of words in review can predict the grade

In [10]:
data['word_count'] = data['review'].apply(lambda x: len(str(x).split(" ")))
show_mean_feature_value_for_both('word_count')

Average word_count in positive review: 232.83776
Average word_count in negative review: 229.45412


For negative and positive comments number of words is almost identical

#### Average word length

In [11]:
def avg_word(review):
    words = review.split()
    return (sum(len(word) for word in words)/len(words))

data['avg_word'] = data['review'].apply(lambda x: avg_word(x))
show_mean_feature_value_for_both('avg_word')

Average avg_word in positive review: 4.657891605718888
Average avg_word in negative review: 4.62346073050112


There is no difference here

#### Number of stopwords

Before changing and removing the stopwords let's try to find some patterns with default list of stopwords from NLTK library

In [12]:
data['stopwords'] = data['review'].apply(lambda x: len([x for x in x.split() if x in cachedStopWords]))
show_mean_feature_value_for_both('stopwords', normalize_by_word_count=True)

Average stopwords in positive review: 95.5394
Average stopwords in negative review: 93.99816
Average ratio of stopwords in positive review: 0.41032605707940156
Average ratio of stopwords in positive review: 0.40965993550257457


And now there is nothing notable

#### Number of swear words

In [13]:
#collecting swear words (shifted to the right) and present them in two cases: low and with capital letter
swear_words=[                                                                                                                                                                                       'Bastard', 'Beaver', 'Bellend', 'Bloodclaat', 'Clunge', 'Cock', 'Dick', 'Dickhead', 'Fanny', 'Flaps', 'Gash', 'Knob', 'Minge', 'Prick', 'Punani', 'Pussy', 'Snatch', 'Twat', 'Cunt', 'Fuck', 'Motherfucker', 'Arsehole', 'Balls', 'Bint', 'Bitch', 'Bollocks', 'Bullshit', 'Feck', 'Munter', 'pissed off', 'Shit', 'Son of a bitch', 'Tits']
swear_words += [word.lower() for word in swear_words]

data['swear_words'] = data['review'].apply(lambda x: len([x for x in x.split() if x in swear_words]))
show_mean_feature_value_for_both('swear_words', normalize_by_word_count=True)

Average swear_words in positive review: 0.01476
Average swear_words in negative review: 0.01728
Average ratio of swear_words in positive review: 6.339177975256248e-05
Average ratio of swear_words in positive review: 7.53091729187517e-05


Both categories have similar frequencies (very small) of swear words

#### Number of numerics

In [14]:
data['numerics'] = data['review'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
show_mean_feature_value_for_both('numerics', normalize_by_word_count=True)

Average numerics in positive review: 0.52252
Average numerics in negative review: 0.61912
Average ratio of numerics in positive review: 0.002244137720617137
Average ratio of numerics in positive review: 0.0026982300426769412


Parts of numerics in reviews are small and that there is no visible differense betweed categories

#### Number of Uppercase words (CAPS)

In [15]:
data['upper'] = data['review'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
show_mean_feature_value_for_both('upper', normalize_by_word_count=True)

Average upper in positive review: 4.56928
Average upper in negative review: 5.14632
Average ratio of upper in positive review: 0.019624308359606275
Average ratio of upper in positive review: 0.02242853604023323


And now there is nothing suspicious

#### Number of punctuation marks

In [16]:
punctuation_marks = ['...', ',', '?', '!', ':', ';', '"', '\'', '-', '.', '–', '—']

data['punctuation_marks'] = data['review'].apply(lambda x: sum([1 for x in x if x in punctuation_marks]))
show_mean_feature_value_for_both('punctuation_marks', normalize_by_word_count=True)

Average punctuation_marks in positive review: 36.09288
Average punctuation_marks in negative review: 37.41992
Average ratio of punctuation_marks in positive review: 0.15501300132761972
Average ratio of punctuation_marks in positive review: 0.1630823626091351


Unfortunately this perspective assumption was not justified too

#### Difference between positive and negative smiles

In [17]:
def mood_counter(text):
    braces = 0
    
    for i in text:
    
        if i == ')':
            braces += 1
        
        elif i == '(':
            braces -= 1
    
    return braces

data['mood'] = data['review'].apply(lambda x: mood_counter(x))
show_mean_feature_value_for_both('mood')

Average mood in positive review: 0.02376
Average mood in negative review: 0.04356


Values are to small but negative reviews twice more positive than positive reviews)) So we will not delete smile or sad brackets 

#### Sentiment

In [18]:
%%time

data['sentiment'] = data['review'].apply(lambda x: TextBlob(x).sentiment[0])
show_mean_feature_value_for_both('sentiment')

Average sentiment in positive review: 0.19473096852005195
Average sentiment in negative review: 0.012063536307679767
Wall time: 1min 30s


There is notable difference between senses in these review categories

# Preprocessing

## Lower case

In [19]:
data['review'] = data['review'].apply(lambda x: " ".join(x.lower() for x in x.split()))

## Removing Punctuation

Do not forget that we found some relation between class and brackets

In [20]:
data['review'] = data['review'].str.replace('[^\w\s()]','')

## Removing common words

Let's find commonly occuring words which may not be in stopword list

In [21]:
pos_freq = pd.Series(' '.join(data[data['target'] == 1]['review']).split()).value_counts()
pos_freq[:25]

the      337527
and      173219
a        162460
of       151395
to       130530
is       111313
in        97710
it        75770
i         71050
this      68775
that      66282
br        55134
as        50168
with      45182
for       43697
was       43145
but       39654
film      39169
movie     35868
his       33469
on        32938
are       29104
you       29063
he        28232
not       27658
dtype: int64

In [22]:
neg_freq = pd.Series(' '.join(data[data['target'] == 0]['review']).split()).value_counts()
neg_freq[:25]

the      323438
a        156775
and      144352
of       136593
to       135970
is        98626
in        86167
this      80170
i         79532
it        76151
that      69273
br        58461
was       51989
movie     47194
for       42556
but       41671
with      41019
as        39707
film      34738
on        33484
not       31465
have      30516
you       30203
are       29001
be        28425
dtype: int64

First 25 words in both lists are quite similar and we can delete them. But what if not only first 25 are similar? There may be more

In [23]:
#merge lists of 25 common words
#common_words = list(pos_freq[:25].index) + list(neg_freq[:25].index)
common_words = list(pos_freq[:150].index) + list(neg_freq[:150].index)
#remove duplicate elements
common_words = list(set(common_words))
len(common_words)

168

Length 150 means that top-150 words in both categories are the same, length 300 means that top words are completely don't match. Length of the obtained merge list allow us to be sure that the most common words in both categories are almost the same. 

Deleting the most common words

In [24]:
data['review'] = data['review'].apply(lambda x: " ".join(x for x in x.split() if x not in common_words))

## Removing of Stop Words

In [25]:
data['review'] = data['review'].apply(lambda x: " ".join(x for x in x.split() if x not in cachedStopWords))

## Removing HTML markup and metadata

In [26]:
data['review'] = data['review'].apply(lambda x:  BeautifulSoup(x, 'html.parser').get_text())

## Splitting the data

The default split is 50/50 and now we wil make it 70/30. Don't forget to shuffle data

In [27]:
data.head()

Unnamed: 0,review,target,word_count,avg_word,stopwords,swear_words,numerics,upper,punctuation_marks,mood,sentiment
0,reviewers mentioned 1 oz episode youll hooked ...,1,307,4.739414,122,0,1,8,58,0,0.023433
1,wonderful production filming technique unassum...,1,162,5.166667,62,0,0,2,24,0,0.109722
2,thought wonderful spend hot summer weekend sit...,1,166,4.584337,70,0,0,3,24,0,0.354008
3,basically theres family boy (jake) thinks ther...,0,138,4.427536,58,0,1,3,19,0,-0.057813
4,petter matteis money visually stunning mr matt...,1,230,4.730435,92,0,0,1,32,0,0.217952


In [28]:
target = data['target']
data.drop(['target'], inplace=True, axis=1)

train_data, test_data, train_labels, test_labels = train_test_split(data, 
                                                                    target, 
                                                                    test_size=0.3, 
                                                                    stratify=target, 
                                                                    shuffle=True,
                                                                    random_state=42)

## Tokenization

In [29]:
def tokenize(text, stem_not_lem=True):
    min_length = 3
   
    tokenized_words = word_tokenize(text)

    if stem_not_lem:
        tokens = list(map(lambda token: PorterStemmer().stem(token), tokenized_words))
    else:
        tokens = list(map(lambda token: WordNetLemmatizer().lemmatize(token), tokenized_words))
    
    p = re.compile('[a-zA-Z]+');
    filtered_tokens = list(filter(lambda token:
                  p.match(token) and len(token)>=min_length,
         tokens)); # check if numbers are needed

    return filtered_tokens

    
def vectorize_with_tf_idf(train_data, tokenizer, test_data=None, max_feats=50000):
    tfidf = TfidfVectorizer(tokenizer=tokenizer, 
                            #min_df=3,
                            #max_df=0.90, 
                            max_features=max_feats,
                            use_idf=True, 
                            sublinear_tf=True,
                            norm='l2',
                            ngram_range = (1,3));
    
    vectorised_train_data = tfidf.fit_transform(train_data)
    if test_data is None:
        return vectorised_train_data, tfidf
    else:
        vectorised_test_data = tfidf.transform(test_data)
        return vectorised_train_data, vectorised_test_data, tfidf

## Vectorizing

Let's start from 3000 features in tf-idf. Later we will check other values

In [None]:
%%time

# Learn and transform train documents
vectorised_train_data, tfidf = vectorize_with_tf_idf(train_data=train_data['review'], 
                                                                           tokenizer=tokenize,
                                                                           max_feats=3000)

print('there are ', vectorised_train_data.shape[0], ' samples in the train')
num_feats = vectorised_train_data.shape[1]
print('there are ', num_feats, ' features in the dataset')

there are  35000  samples in the train
there are  3000  features in the dataset
CPU times: user 3min 25s, sys: 1.29 s, total: 3min 26s
Wall time: 3min 26s


In [None]:
train_features = sparse.hstack((vectorised_train_data, train_data.drop(['review'], axis=1).to_numpy()))
train_features.shape

((35000, 3009), (15000, 3009))

# Modeling and scoring

## SGD classifier

Now we are finding best parameters for SGD classifier

In [None]:
parameters = {'loss': ['log', 'modified_huber'], 
              'alpha': [1e-5, 1e-4, 5e-4, 1e-3, 1e-2]}
grid_clf = GridSearchCV(SGDClassifier(), parameters, scoring='roc_auc', n_jobs=-1,
                   cv=5, verbose=1)

grid_clf.fit(train_features, train_labels)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.4min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=SGDClassifier(alpha=0.0001, average=False,
                                     class_weight=None, early_stopping=False,
                                     epsilon=0.1, eta0=0.0, fit_intercept=True,
                                     l1_ratio=0.15, learning_rate='optimal',
                                     loss='hinge', max_iter=1000,
                                     n_iter_no_change=5, n_jobs=None,
                                     penalty='l2', power_t=0.5,
                                     random_state=None, shuffle=True, tol=0.001,
                                     validation_fraction=0.1, verbose=0,
                                     warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'alpha': [1e-05, 0.0001, 0.0005, 0.001, 0.01],
                         'loss': ['log', 'modified_huber']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
  

In [None]:
pd.DataFrame(grid_clf.cv_results_).sort_values('rank_test_score')[['params', 'mean_test_score', 'mean_fit_time']]

Unnamed: 0,params,mean_test_score,mean_fit_time
2,"{'alpha': 0.0001, 'loss': 'log'}",0.819995,3.56315
5,"{'alpha': 0.0005, 'loss': 'modified_huber'}",0.79586,2.957689
6,"{'alpha': 0.001, 'loss': 'log'}",0.786721,4.62388
7,"{'alpha': 0.001, 'loss': 'modified_huber'}",0.773994,3.169701
9,"{'alpha': 0.01, 'loss': 'modified_huber'}",0.773727,2.907726
8,"{'alpha': 0.01, 'loss': 'log'}",0.761135,3.866019
0,"{'alpha': 1e-05, 'loss': 'log'}",0.754164,3.843386
1,"{'alpha': 1e-05, 'loss': 'modified_huber'}",0.739384,2.605306
4,"{'alpha': 0.0005, 'loss': 'log'}",0.731478,3.274979
3,"{'alpha': 0.0001, 'loss': 'modified_huber'}",0.713987,2.544674


The best model have auc roc score near 0,82

### Finding best shape of tf-idf matrix

Let's make an experiment in which we will find the best shape of tf-idf matrix on the example of the best SGD classifier. The experiment can't guarantee that all the other models will show the same distribution over the considered parameter values but checking all possible feature numbers for all models is very time-consuming

In [None]:
%%time

num_features = [1000, 2000, 3000, 5000, 7000, 10000, 15000, 20000]

for feats in num_features:
    print('Num feats:', feats)
    # Learn and transform train documents
    vectorised_train_data, tfidf = vectorize_with_tf_idf(train_data=train_data['review'], 
                                                        tokenizer=tokenize,
                                                        max_feats=3000)

    train_features = sparse.hstack((vectorised_train_data, train_data.drop(['review'], axis=1).to_numpy()))

    clf = SGDClassifier(loss='log', alpha=0.0001)
    score = cross_val_score(estimator=clf, X=train_features, y=train_labels, 
                            scoring='roc_auc', cv=5, n_jobs=-1, verbose=10)
    print('score:', score, 'mean:', score.mean(), '\n')

Num feats: 1000


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   10.2s remaining:    6.8s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   13.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   13.5s finished


score: [0.83168229 0.7590698  0.76264163 0.74945543 0.80676212] mean: 0.7819222530612245 

Num feats: 2000


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    6.8s remaining:    4.5s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   10.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   10.4s finished


score: [0.74099037 0.82909151 0.69466931 0.78513216 0.83379453] mean: 0.7767355755102041 

Num feats: 3000


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    6.4s remaining:    4.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    9.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    9.5s finished


score: [0.83289208 0.73151478 0.66328367 0.79500049 0.63923469] mean: 0.7323851428571428 

Num feats: 5000


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    6.2s remaining:    4.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   10.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   10.3s finished


score: [0.66791935 0.62824261 0.84889527 0.63117608 0.76834351] mean: 0.7089153632653062 

Num feats: 7000


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    7.3s remaining:    4.9s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   11.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   11.8s finished


score: [0.70397494 0.78175273 0.71943771 0.77683224 0.85722531] mean: 0.767844587755102 

Num feats: 10000


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    5.4s remaining:    3.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    9.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    9.3s finished


score: [0.84700612 0.85344155 0.85600392 0.61560465 0.69673396] mean: 0.7737580408163265 

Num feats: 15000


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    8.3s remaining:    5.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   12.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   12.3s finished


score: [0.8647418  0.76899649 0.85778359 0.83966653 0.86082384] mean: 0.8384024489795918 

Num feats: 20000


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    6.5s remaining:    4.3s


score: [0.67410661 0.80934008 0.54406376 0.86169943 0.85703682] mean: 0.7492493387755103 

CPU times: user 20min 12s, sys: 5.16 s, total: 20min 18s
Wall time: 21min 47s


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   11.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   11.4s finished


With an increase of number of features model becomes better, but after 15000 features profit is very small. So let's deal with 15000 features

In [30]:
%%time 

# Learn and transform train documents
vectorised_train_data, vectorised_test_data, tfidf = vectorize_with_tf_idf(train_data=train_data['review'], 
                                                                           tokenizer=tokenize, 
                                                                           test_data=test_data['review'],
                                                                           max_feats=15000)

train_features = sparse.hstack((vectorised_train_data, train_data.drop(['review'], axis=1).to_numpy()))
test_features = sparse.hstack((vectorised_test_data, test_data.drop(['review'], axis=1).to_numpy()))

Wall time: 3min 9s


## Support Vector Classification (SVC)

In [None]:
cross_val_score(estimator=SVC(), X=train_features, y=train_labels,
                scoring='roc_auc', cv=3, n_jobs=-1, verbose=10)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed: 31.0min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed: 31.0min finished


array([0.56130676, 0.56464014, 0.5632183 ])

Computational time of this method is very long and the results are worse than SGD model gave

## Naive Bayes classification

In [58]:
parameters = {'alpha': [1e-5, 1e-4, 5e-4, 1e-3, 1e-2, 1e-1]}
grid_clf = GridSearchCV(BernoulliNB(), parameters, scoring='roc_auc', n_jobs=-1,
                   cv=5, verbose=1)

grid_clf.fit(train_features, train_labels)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    1.7s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None,
                                   fit_prior=True),
             iid='deprecated', n_jobs=-1,
             param_grid={'alpha': [1e-05, 0.0001, 0.0005, 0.001, 0.01, 0.1]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='roc_auc', verbose=1)

In [59]:
pd.DataFrame(grid_clf.cv_results_).sort_values('rank_test_score')[['params', 'mean_test_score', 'mean_fit_time']]

Unnamed: 0,params,mean_test_score,mean_fit_time
5,{'alpha': 0.1},0.925376,0.078896
4,{'alpha': 0.01},0.925127,0.085312
3,{'alpha': 0.001},0.924686,0.076159
2,{'alpha': 0.0005},0.924516,0.078539
1,{'alpha': 0.0001},0.924127,0.078545
0,{'alpha': 1e-05},0.923534,0.090075


Multinomial Naive Bayes have auc roc score about 0.92, that is better than the  score of SGD model

### Calibration

In [49]:
calib_tr_data, calib_val_data, calib_tr_labels, calib_val_labels = train_test_split(
    train_features, 
    train_labels, 
    test_size=0.2, 
    stratify=train_labels, 
    shuffle=True,
    random_state=42)

def clf_fit_show_auc(clf, X_train, y_train, X_test, y_test):
    #classifier = OneVsRestClassifier(clf)
    clf.fit(X_train, y_train)

    prob_pred = clf.predict_proba(X_test)[:,1]
    print('auc roc score : ', roc_auc_score(y_test, prob_pred))

In [50]:
from sklearn.calibration import CalibratedClassifierCV

clf = CalibratedClassifierCV(BernoulliNB(), 
                                    cv=5, 
                                    method='isotonic')


clf_fit_show_auc(clf, calib_tr_data, calib_tr_labels, calib_val_data, calib_val_labels) 

auc roc score :  0.9261577551020407


Almost the same score

## TruncatedSVD

Now let's try to reduce dimension of tf-idf sparse matrix

In [51]:
tsvd = TruncatedSVD(n_components = 1000)

Let's make a pipeline to make data preprocessing easier

In [56]:
preprocessing = Pipeline(steps = [('tsvd', tsvd)])

We will estimate the efficiency of the decomposition on the example of Naive Bayes classifier with the best parameters we found above

In [68]:
%%time

clf = BernoulliNB(alpha=0.1)
tsvd_clf_pipe = Pipeline(steps=[('preprocessing', preprocessing), ('clf', clf)])

scores  = cross_val_score(estimator=tsvd_clf_pipe, X=train_features, y=train_labels,
                scoring='roc_auc', cv=5, n_jobs=-1, verbose=10)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:  3.6min remaining:  2.4min


CPU times: user 239 ms, sys: 80.2 ms, total: 320 ms
Wall time: 4min 34s


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  4.6min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  4.6min finished


In [70]:
scores

array([0.87044139, 0.86681535, 0.88100669, 0.87774343, 0.87346212])

Reducing the dimension of the matrix didn't make a profit but take more time than fitting without SVD

# Final pipeline

In [33]:
do_nothing_tf = FunctionTransformer(lambda x: x)

In [34]:
from sklearn.compose import make_column_transformer

In [62]:
tfidf

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=15000,
                min_df=1, ngram_range=(1, 3), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=True, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function tokenize at 0x00000249FB07CBF8>,
                use_idf=True, vocabulary=None)

In [36]:
pipeline = Pipeline([
    ('transformer',  make_column_transformer((tfidf, 'review'),
                                             remainder='passthrough' # the same as(do_nothing_tf, [item for item in list(train_data.columns) if item!='review']),
                                            )
    ),
    ('clf', BernoulliNB(alpha=0.1)),
])

In [55]:
test_data.shape, type(test_data)

((15000, 10), pandas.core.frame.DataFrame)

In [37]:
%%time 

pipeline.fit(train_data, train_labels)
preds = pipeline.predict_proba(test_data)
score = roc_auc_score(test_labels, preds[:, 1])
print(score)

0.9239850933333333
Wall time: 3min 11s


# Visualization
Let's try to visualise the text part of the best pipeline

In [61]:
TextExpl = TextExplainer()

In [63]:
tfidf

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=15000,
                min_df=1, ngram_range=(1, 3), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=True, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function tokenize at 0x00000249FB07CBF8>,
                use_idf=True, vocabulary=None)

In [78]:
feature_names_tfidf = pipeline.named_steps['preprocessing'].named_steps['tfidf'].get_feature_names()
feature_names_tfidf[:10]

['aamir',
 'aaron',
 'abandon',
 'abba',
 'abbey',
 'abbi',
 'abbot',
 'abbott',
 'abbott costello',
 'abc']

In [71]:
preprocessing = Pipeline(steps=[('tfidf', tfidf)])

pipeline = Pipeline(steps=[('preprocessing', preprocessing), 
                           ('clf', BernoulliNB(alpha=0.1))])

In [68]:
train_text_data, test_text_data = train_data['review'], test_data['review']

In [72]:
%%time 

pipeline.fit(train_text_data, train_labels)
preds = pipeline.predict_proba(test_text_data)
score = roc_auc_score(test_labels, preds[:, 1])
print(score)

0.9209163822222224
Wall time: 3min 10s


#### Let's find some presentative positive text

In [90]:
for i in tqdm(range(0, test_text_data.shape[0])):
    if (preds[i, 1] > 0.97) and i!=6: # i=6 is not interesting
        positive_index = i
        print('index of very positive sentense:', positive_index)
        break

pos_text = test_text_data.values[positive_index]
        
print('Real class:', list(test_labels)[positive_index])
print('Predicted:', preds[positive_index, :])

  0%|          | 0/15000 [00:00<?, ?it/s]

index of very positive sentense: 8





Real class: 1
Predicted: [1.22074607e-07 9.99999878e-01]


In [91]:
%%time

TextExpl.fit(pos_text, pipeline.predict_proba)
display(TextExpl.show_prediction(target_names=feature_names_tfidf))

Contribution?,Feature
14.182,Highlighted in text (sum)
-0.144,<BIAS>


Wall time: 13.7 s


Here we can see such words as favorite, winner, award, loving etc. These words are charachterize this review as positive with probability of 0.97

#### Let's find some negative text

In [97]:
for i in tqdm(range(0, test_text_data.shape[0])):
    if (preds[i, 0] > 0.97) and i!=0: #i=0 is very short
        negative_index = i
        print('index of very negative sentense:', negative_index)
        break

neg_text = test_text_data.values[negative_index]
        
print('Real class:', list(test_labels)[negative_index])
print('Predicted:', preds[negative_index, :])

  0%|          | 0/15000 [00:00<?, ?it/s]

index of very negative sentense: 2





Real class: 0
Predicted: [9.99989927e-01 1.00727274e-05]


In [98]:
%%time

TextExpl.fit(neg_text, pipeline.predict_proba)
display(TextExpl.show_prediction(target_names=feature_names_tfidf))

Contribution?,Feature
10.501,Highlighted in text (sum)
0.108,<BIAS>


Wall time: 13.6 s


And here we see the words terribly, boring, endless, obvious.