# Installing the needed libraries.

In [1]:
!pip install eli5

Collecting eli5
[?25l  Downloading https://files.pythonhosted.org/packages/97/2f/c85c7d8f8548e460829971785347e14e45fa5c6617da374711dec8cb38cc/eli5-0.10.1-py2.py3-none-any.whl (105kB)
[K     |███                             | 10kB 18.5MB/s eta 0:00:01[K     |██████▏                         | 20kB 2.2MB/s eta 0:00:01[K     |█████████▎                      | 30kB 2.9MB/s eta 0:00:01[K     |████████████▍                   | 40kB 3.2MB/s eta 0:00:01[K     |███████████████▌                | 51kB 2.5MB/s eta 0:00:01[K     |██████████████████▋             | 61kB 2.9MB/s eta 0:00:01[K     |█████████████████████▊          | 71kB 3.1MB/s eta 0:00:01[K     |████████████████████████▊       | 81kB 3.5MB/s eta 0:00:01[K     |███████████████████████████▉    | 92kB 3.6MB/s eta 0:00:01[K     |███████████████████████████████ | 102kB 3.5MB/s eta 0:00:01[K     |████████████████████████████████| 112kB 3.5MB/s 
Installing collected packages: eli5
Successfully installed eli5-0.10.1


In [112]:
# download stopwords, punkt, wordnet
import nltk
nltk.download()

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> wordnet
    Downloading package wordnet to /root/nltk_data...
      Unzipping corpora/wordnet.zip.

---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


True

# Loading needed libs

In [3]:
import pandas as pd
import numpy as np
from scipy import sparse

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

from tqdm import tqdm_notebook

import os
import itertools

import warnings
warnings.filterwarnings('ignore')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.multiclass import OneVsRestClassifier

from sklearn.decomposition import IncrementalPCA as iPCA, TruncatedSVD

from sklearn.metrics import roc_auc_score, brier_score_loss, make_scorer
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.model_selection import train_test_split, GridSearchCV

  import pandas.util.testing as tm


In [8]:
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import re
from nltk.corpus import stopwords
 
cachedStopWords = stopwords.words("english")

from collections import Counter
import string
from textblob import TextBlob, Word
from random import shuffle

from bs4 import BeautifulSoup
from eli5.lime import TextExplainer
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from tempfile import mkdtemp
from shutil import rmtree
from sklearn.externals.joblib import Memory

Using TensorFlow backend.


# Loading the data

Download https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews  
Files->Upload to session storage

In [75]:
data = pd.read_csv('IMDB Dataset.csv')
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [76]:
data.rename(columns={'sentiment': 'target'}, inplace=True)
data['target'].replace({'positive': 1, 'negative': 0}, inplace=True)
data.head()

Unnamed: 0,review,target
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [77]:
data['target'].value_counts()

1    25000
0    25000
Name: target, dtype: int64

# Feature extraction

In [78]:
def show_mean_feature_value_for_both(col, normalize_by_word_count=False):
    print('Average {} in positive review: {}'.format(col, data[data['target'] == 1][col].mean()))
    print('Average {} in negative review: {}'.format(col, data[data['target'] == 0][col].mean()))
    if normalize_by_word_count:
        print('Average ratio of {} in positive review: {}'.format(col, data[data['target'] == 1][col].mean() / data[data['target'] == 1]['word_count'].mean()))
        print('Average ratio of {} in positive review: {}'.format(col, data[data['target'] == 0][col].mean() / data[data['target'] == 0]['word_count'].mean()))

#### Number of words

Check if number of words in review can predict the grade

In [79]:
data['word_count'] = data['review'].apply(lambda x: len(str(x).split(" ")))
show_mean_feature_value_for_both('word_count')

Average word_count in positive review: 232.83776
Average word_count in negative review: 229.45412


For negative and positive comments number of words is almost identical

#### Average word length

In [80]:
def avg_word(review):
    words = review.split()
    return (sum(len(word) for word in words)/len(words))

data['avg_word'] = data['review'].apply(lambda x: avg_word(x))
show_mean_feature_value_for_both('avg_word')

Average avg_word in positive review: 4.657891605718916
Average avg_word in negative review: 4.623460730501129


There is no difference here

#### Number of stopwords

Before changing and removing the stopwords let's try to find some patterns with default list of stopwords from NLTK library

In [81]:
data['stopwords'] = data['review'].apply(lambda x: len([x for x in x.split() if x in cachedStopWords]))
show_mean_feature_value_for_both('stopwords', normalize_by_word_count=True)

Average stopwords in positive review: 95.5394
Average stopwords in negative review: 93.99816
Average ratio of stopwords in positive review: 0.41032605707940156
Average ratio of stopwords in positive review: 0.40965993550257457


And now there is nothing notable

#### Number of swear words

In [82]:
#collecting swear words (shifted to the right) and present them in two cases: low and with capital letter
swear_words=[                                                                                                                                                                                       'Bastard', 'Beaver', 'Bellend', 'Bloodclaat', 'Clunge', 'Cock', 'Dick', 'Dickhead', 'Fanny', 'Flaps', 'Gash', 'Knob', 'Minge', 'Prick', 'Punani', 'Pussy', 'Snatch', 'Twat', 'Cunt', 'Fuck', 'Motherfucker', 'Arsehole', 'Balls', 'Bint', 'Bitch', 'Bollocks', 'Bullshit', 'Feck', 'Munter', 'pissed off', 'Shit', 'Son of a bitch', 'Tits']
swear_words += [word.lower() for word in swear_words]

data['swear_words'] = data['review'].apply(lambda x: len([x for x in x.split() if x in swear_words]))
show_mean_feature_value_for_both('swear_words', normalize_by_word_count=True)

Average swear_words in positive review: 0.01476
Average swear_words in negative review: 0.01728
Average ratio of swear_words in positive review: 6.339177975256248e-05
Average ratio of swear_words in positive review: 7.53091729187517e-05


Both categories have similar frequencies (very small) of swear words

#### Number of numerics

In [83]:
data['numerics'] = data['review'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
show_mean_feature_value_for_both('numerics', normalize_by_word_count=True)

Average numerics in positive review: 0.52252
Average numerics in negative review: 0.61912
Average ratio of numerics in positive review: 0.002244137720617137
Average ratio of numerics in positive review: 0.0026982300426769412


Parts of numerics in reviews are small and that there is no visible differense betweed categories

#### Number of Uppercase words (CAPS)

In [84]:
data['upper'] = data['review'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
show_mean_feature_value_for_both('upper', normalize_by_word_count=True)

Average upper in positive review: 4.56928
Average upper in negative review: 5.14632
Average ratio of upper in positive review: 0.019624308359606275
Average ratio of upper in positive review: 0.02242853604023323


And now there is nothing suspicious

#### Number of punctuation marks

In [85]:
punctuation_marks = ['...', ',', '?', '!', ':', ';', '"', '\'', '-', '.', '–', '—']

data['punctuation_marks'] = data['review'].apply(lambda x: sum([1 for x in x if x in punctuation_marks]))
show_mean_feature_value_for_both('punctuation_marks', normalize_by_word_count=True)

Average punctuation_marks in positive review: 36.09288
Average punctuation_marks in negative review: 37.41992
Average ratio of punctuation_marks in positive review: 0.15501300132761972
Average ratio of punctuation_marks in positive review: 0.1630823626091351


Unfortunately this perspective assumption was not justified too

#### Difference between positive and negative smiles

In [86]:
def mood_counter(text):
    braces = 0
    
    for i in text:
    
        if i == ')':
            braces += 1
        
        elif i == '(':
            braces -= 1
    
    return braces

data['mood'] = data['review'].apply(lambda x: mood_counter(x))
show_mean_feature_value_for_both('mood')

Average mood in positive review: 0.02376
Average mood in negative review: 0.04356


Values are to small but negative reviews twice more positive than positive reviews)) So we will not delete smile or sad brackets 

#### Sentiment

In [87]:
%%time

data['sentiment'] = data['review'].apply(lambda x: TextBlob(x).sentiment[0])
show_mean_feature_value_for_both('sentiment')

Average sentiment in positive review: 0.19473096852005084
Average sentiment in negative review: 0.012063536307679774
CPU times: user 1min 28s, sys: 9.81 ms, total: 1min 28s
Wall time: 1min 28s


There is notable difference between senses in these review categories

# Preprocessing

## Lower case

In [88]:
data['review'] = data['review'].apply(lambda x: " ".join(x.lower() for x in x.split()))

## Removing Punctuation

Do not forget that we found some relation between class and brackets

In [89]:
data['review'] = data['review'].str.replace('[^\w\s()]','')

## Removing common words

Let's find commonly occuring words which may not be in stopword list

In [92]:
pos_freq = pd.Series(' '.join(data[data['target'] == 1]['review']).split()).value_counts()
pos_freq[:25]

the      337527
and      173219
a        162460
of       151395
to       130530
is       111313
in        97710
it        75770
i         71050
this      68775
that      66282
br        55134
as        50168
with      45182
for       43697
was       43145
but       39654
film      39169
movie     35868
his       33469
on        32938
are       29104
you       29063
he        28232
not       27658
dtype: int64

In [93]:
neg_freq = pd.Series(' '.join(data[data['target'] == 0]['review']).split()).value_counts()
neg_freq[:25]

the      323438
a        156775
and      144352
of       136593
to       135970
is        98626
in        86167
this      80170
i         79532
it        76151
that      69273
br        58461
was       51989
movie     47194
for       42556
but       41671
with      41019
as        39707
film      34738
on        33484
not       31465
have      30516
you       30203
are       29001
be        28425
dtype: int64

First 25 words in both lists are quite similar and we can delete them. But what if not only first 25 are similar? There may be more

In [94]:
#merge lists of 25 common words
#common_words = list(pos_freq[:25].index) + list(neg_freq[:25].index)
common_words = list(pos_freq[:150].index) + list(neg_freq[:150].index)
#remove duplicate elements
common_words = list(set(common_words))
len(common_words)

168

Length 150 means that top-150 words in both categories are the same, length 300 means that top words are completely don't match. Length of the obtained merge list allow us to be sure that the most common words in both categories are almost the same. 

Deleting the most common words

In [95]:
data['review'] = data['review'].apply(lambda x: " ".join(x for x in x.split() if x not in common_words))

## Removing of Stop Words

In [96]:
data['review'] = data['review'].apply(lambda x: " ".join(x for x in x.split() if x not in cachedStopWords))

## Removing HTML markup and metadata

In [97]:
data['review'] = data['review'].apply(lambda x:  BeautifulSoup(x, 'html.parser').get_text())

## Tokenization

In [120]:
def tokenize(text, stem_not_lem=True):
    min_length = 3
   
    tokenized_words = word_tokenize(text)

    if stem_not_lem:
        tokens = list(map(lambda token: PorterStemmer().stem(token), tokenized_words))
    else:
        tokens = list(map(lambda token: WordNetLemmatizer().lemmatize(token), tokenized_words))
    
    p = re.compile('[a-zA-Z]+');
    filtered_tokens = list(filter(lambda token:
                  p.match(token) and len(token)>=min_length,
         tokens)); # check if numbers are needed

    return filtered_tokens

    
def tf_idf(train_data, tokenizer, test_data=None, max_feats=50000):
    tfidf = TfidfVectorizer(tokenizer=tokenizer, 
                            min_df=3,
                            max_df=0.90, 
                            max_features=max_feats,
                            use_idf=True, 
                            sublinear_tf=True,
                            norm='l2',
                            ngram_range = (1,3));
    
    vectorised_train_data = tfidf.fit_transform(train_data)
    if test_data==None:
        return vectorised_train_data, tfidf
    else:
        vectorised_test_data = tfidf.transform(test_data)
        return vectorised_train_data, vectorised_test_data, tfidf

#### Splitting the data

The default split is 50/50 and now we wil make it 70/30. Don't forget to shuffle data

In [122]:
data.head()

Unnamed: 0,review,target,word_count,avg_word,stopwords,swear_words,numerics,upper,punctuation_marks,mood,sentiment
0,reviewers mentioned 1 oz episode youll hooked ...,1,307,4.739414,122,0,1,8,58,0,0.023433
1,wonderful production filming technique unassum...,1,162,5.166667,62,0,0,2,24,0,0.109722
2,thought wonderful spend hot summer weekend sit...,1,166,4.584337,70,0,0,3,24,0,0.354008
3,basically theres family boy (jake) thinks ther...,0,138,4.427536,58,0,1,3,19,0,-0.057813
4,petter matteis money visually stunning mr matt...,1,230,4.730435,92,0,0,1,32,0,0.217952


In [124]:
target = data['target']
data.drop(['target'], inplace=True, axis=1)

In [127]:
train_data, test_data, train_labels, test_labels = train_test_split(data, 
                                                                    target, 
                                                                    test_size=0.3, 
                                                                    stratify=target, 
                                                                    shuffle=True,
                                                                    random_state=42)

# Modeling and scoring

Function that will fit our model and show auc roc score

In [None]:
def clf_fit_show_metric(clf, X_train, y_train, X_test, y_test):
    # Classifier
    classifier = OneVsRestClassifier(clf)
    classifier.fit(X_train, y_train)

    prob_pred = classifier.predict_proba(X_test)[:,1]
    print('auc roc score : ', roc_auc_score(y_test, prob_pred))

## Vectorizing

Let's start from 3000 features in tf-idf. Later we will check other values

In [None]:
%%time
# Tokenisation
vectorizer = TfidfVectorizer(stop_words=cachedStopWords,
                             tokenizer=tokenize)
 
# Learn and transform train documents
vectorised_train_data, vectorised_test_data, tfidf = tf_idf(train_data, test_data, 3000)

n = vectorised_train_data.shape[0] #how many raws we have in the dataset
print('there are ', n, ' raws in the dataset')
num_feats = vectorised_train_data.shape[1]
print('there are ', num_feats, ' features in the dataset')

there are  35000  raws in the dataset
there are  3000  features in the dataset
Wall time: 3min 14s


In [None]:
my_scorer = make_scorer(roc_auc_score)

# SGD classifier

Now we are finding best parameters for SGD classifier

In [None]:
Losf_values = ['log', 'modified_huber']
Alpha_values = [1e-5, 1e-4, 5e-4, 1e-3]
for Losf in Losf_values:  
    for Alpha in Alpha_values:
        print('Loss function : ', Losf)
        print('Alpha : ', Alpha)
        clf_fit_show_metric(SGDClassifier(loss=Losf, alpha=Alpha), 
                           vectorised_train_data, train_labels, 
                           vectorised_test_data, test_labels)
        print('\n')

Loss function :  log
Alpha :  1e-05
auc roc score :  0.813221186195


Loss function :  log
Alpha :  0.0001
auc roc score :  0.821616130875


Loss function :  log
Alpha :  0.0005
auc roc score :  0.814115456082


Loss function :  log
Alpha :  0.001
auc roc score :  0.80902832317


Loss function :  modified_huber
Alpha :  1e-05
auc roc score :  0.739094387847


Loss function :  modified_huber
Alpha :  0.0001
auc roc score :  0.809693887246


Loss function :  modified_huber
Alpha :  0.0005
auc roc score :  0.820085607962


Loss function :  modified_huber
Alpha :  0.001
auc roc score :  0.821231410241




The best model have auc roc score near 0,82. Loss function is 'modified_huber' and regularization constant alpha is 0.001

#### Finding best shape of tf-idf matrix

Let's make an experiment in which we will find the best shape of tf-idf matrix on the example of the best SGD classifier.

In [None]:
%%time

num_features = [1000, 2000, 3000, 5000, 7000, 10000, 15000, 20000]

for feats in num_features:
    print('Num feats : ', feats)
    # Learn and transform train documents
    vectorised_train_data, vectorised_test_data, tfidf = tf_idf(train_data, test_data, feats)

    n = vectorised_train_data.shape[0] #how many raws we have in the dataset
    print('there are ', n, ' raws in the dataset')
    num_feats = vectorised_train_data.shape[1]
    print('there are ', num_feats, ' features in the dataset')

    clf_fit_show_metric(SGDClassifier(loss='modified_huber', alpha=0.001), 
                               vectorised_train_data, train_labels, 
                               vectorised_test_data, test_labels)
    print('\n')

Num feats :  1000
there are  35000  raws in the dataset
there are  1000  features in the dataset
auc roc score :  0.805717012513


Num feats :  2000
there are  35000  raws in the dataset
there are  2000  features in the dataset
auc roc score :  0.816463692356


Num feats :  3000
there are  35000  raws in the dataset
there are  3000  features in the dataset
auc roc score :  0.821276644379


Num feats :  5000
there are  35000  raws in the dataset
there are  5000  features in the dataset
auc roc score :  0.825726319557


Num feats :  7000
there are  35000  raws in the dataset
there are  7000  features in the dataset
auc roc score :  0.827892047925


Num feats :  10000
there are  35000  raws in the dataset
there are  10000  features in the dataset
auc roc score :  0.829479181601


Num feats :  15000
there are  35000  raws in the dataset
there are  15000  features in the dataset
auc roc score :  0.830917811542


Num feats :  20000
there are  35000  raws in the dataset
there are  20000  feat

With an increase of number of features model becomes better, but after 10000 features profit is very small. So let's deal with 10000 features

In [None]:
# Learn and transform train documents
vectorised_train_data, vectorised_test_data, tfidf = tf_idf(train_data, test_data, 10000)

# Support Vector Classification (SVC)

In [None]:
%%time

clf_fit_show_metric( SVC(probability=True), 
                        vectorised_train_data, train_labels, 
                        vectorised_test_data, test_labels)

auc roc score :  0.825239733071
Wall time: 58min 57s


Computing of this methon is very long and result are pretty the same as SGD model gave

# Naive Bayes classification

In [None]:
NB_functions = [BernoulliNB, MultinomialNB]
for NB_func in NB_functions:
    print('NB function : ', NB_func)
    clf_fit_show_metric(NB_func(), 
                        vectorised_train_data, train_labels, 
                        vectorised_test_data, test_labels)
    print('\n')

NB function :  <class 'sklearn.naive_bayes.BernoulliNB'>
auc roc score :  0.775429017718


NB function :  <class 'sklearn.naive_bayes.MultinomialNB'>
auc roc score :  0.808277821996




Multinomial Naive Bayes have auc roc score about 0.81, that is a little worse than metric of the SGD model

# TruncatedSVD

Now let's try to reduce dimension of tf-idf sparse matrix

In [None]:
tsvd = TruncatedSVD(n_components = 1000)

Let's make a pipeline to make data preprocessing easier

In [None]:
preprocessing = Pipeline(steps = [('tfidf', tfidf), ('tsvd',tsvd)])

We will estimate efficiency of the decomposition on the example of SGD classifier with the best parameters we found above

In [None]:
%%time

sgd = SGDClassifier(loss='modified_huber', alpha=0.001)
bst_clf_pipe = Pipeline(steps=[('preprocessing', preprocessing),
                           ('sgd', sgd)
                    ])

clf_fit_show_metric(bst_clf_pipe, 
                               train_data, train_labels, 
                               test_data, test_labels)


auc roc score :  0.821033932538
Wall time: 3min 46s


Reducing the dimension of the matrix didn't make a profit but take more time than fitting without SVD

# Visualization

In [None]:
TextExpl = TextExplainer()

#### Let's find some presentative positive text

In [None]:
for i in range(0,15000):
    pos_text = test_data[i]
    if(bst_clf_pipe.predict_proba([pos_text])[:,1]>0.97):
        positive_index = i
        break

pos_text = test_data[positive_index]

print('Real class : ' , test_labels[positive_index])
print('Predicted class : ', bst_clf_pipe.predict([pos_text]))
print('Probabilities : ', bst_clf_pipe.predict_proba([pos_text]))

Real class :  0
Predicted class :  [1]
Probabilities :  [[0.01444528 0.98555472]]


In [None]:
feature_names_tfidf = bst_clf_pipe.named_steps['preprocessing'].named_steps['tfidf']

In [None]:
%%time

TextExpl.fit(pos_text, bst_clf_pipe.predict_proba)
display(TextExpl.show_prediction(target_names=feature_names_tfidf.get_feature_names()))

Contribution?,Feature
3.649,Highlighted in text (sum)
-0.153,<BIAS>


Wall time: 11.8 s


Here we can see such words as pleasantly, surprised (and they both together because we 2-grams), enjoyed, worth (with definitely). These words are charachterize this review as positive with probability of 0.97

#### Let's find some negative text

In [None]:
for i in range(0,15000):
    neg_text = test_data[i]
    if(bst_clf_pipe.predict_proba([neg_text])[:,0]>0.97):
        negative_index = i
        break

neg_text = test_data[negative_index]

print('Real class : ' , test_labels[negative_index])
print('Predicted class : ', bst_clf_pipe.predict([neg_text]))
print('Probabilities : ', bst_clf_pipe.predict_proba([neg_text]))

Real class :  0
Predicted class :  [0]
Probabilities :  [[1. 0.]]


In [None]:
%%time

TextExpl.fit(neg_text, bst_clf_pipe.predict_proba)
display(TextExpl.show_prediction(target_names=feature_names_tfidf.get_feature_names()))

Contribution?,Feature
5.258,Highlighted in text (sum)
0.342,<BIAS>


Wall time: 14.5 s


And here we see words disaster, poor, unbelievably. Interesting that there is a lot of speculation about the script of the movie. So this review is negative with great chance