# Loading the needed libraries.

In [0]:
import pandas as pd
import numpy as np
from scipy import sparse

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

from tqdm import tqdm_notebook

import os
import itertools

import warnings
warnings.filterwarnings('ignore')

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.multiclass import OneVsRestClassifier

from sklearn.decomposition import IncrementalPCA as iPCA, TruncatedSVD

from sklearn.metrics import roc_auc_score, brier_score_loss, make_scorer
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.model_selection import train_test_split, GridSearchCV

In [0]:
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import re
from nltk.corpus import stopwords
 
cachedStopWords = stopwords.words("english")

import nltk

from collections import Counter
import string
from textblob import TextBlob, Word
from random import shuffle

from bs4 import BeautifulSoup
from eli5.lime import TextExplainer
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from tempfile import mkdtemp
from shutil import rmtree
from sklearn.externals.joblib import Memory

# Loading the data

Checking the path we are working in

In [0]:
os.getcwd()

'C:\\Users\\Oleg\\Desktop\\Универ\\EPAM\\DS\\Homework\\HW4'

In [0]:
HW4_path=os.getcwd()

Finding dataset

In [0]:
print(os.listdir())

['.ipynb_checkpoints', '04 Classification – Homework.docx', 'Dataset', 'HW4.ipynb', '~$ Classification – Homework.docx']


Creating paths to simplify movements between folders

In [0]:
data_path=HW4_path+"\\Dataset"

train_data_path=data_path+"\\train"
test_data_path=data_path+"\\test"

pos_train_data_path=train_data_path+"\\pos"
neg_train_data_path=train_data_path+"\\neg"

pos_test_data_path=test_data_path+"\\pos"
neg_test_data_path=test_data_path+"\\neg"

Load user reviews and their grades

In [0]:
os.chdir(pos_train_data_path)
pos_train_data=[]
pos_train_labels=[]

for textfile in os.listdir(pos_train_data_path):
    file = open( textfile, 'r', encoding="utf8")
    pos_train_data.append( file.read())
    
    _,label = textfile.split('_')
    label=int(label[0])
    pos_train_labels.append(label)

In [0]:
os.chdir(neg_train_data_path)
neg_train_data=[]
neg_train_labels=[]

for textfile in os.listdir(neg_train_data_path):
    file = open( textfile, 'r', encoding="utf8")
    neg_train_data.append( file.read())
    
    _,label = textfile.split('_')
    label=int(label[0])
    neg_train_labels.append(label)

In [0]:
os.chdir(pos_test_data_path)
pos_test_data=[]
pos_test_labels=[]

for textfile in os.listdir(pos_test_data_path):
    file = open( textfile, 'r', encoding="utf8")
    pos_test_data.append( file.read())
    
    _,label = textfile.split('_')
    label=int(label[0])
    pos_test_labels.append(label)

In [0]:
os.chdir(neg_test_data_path)
neg_test_data=[]
neg_test_labels=[]

for textfile in os.listdir(neg_test_data_path):
    file = open( textfile, 'r', encoding="utf8")
    neg_test_data.append( file.read())
    
    _,label = textfile.split('_')
    label=int(label[0])
    neg_test_labels.append(label)

In [0]:
pos_train_data_pd = pd.DataFrame(data=pos_train_data, columns=['review'])
neg_train_data_pd = pd.DataFrame(data=neg_train_data, columns=['review'])

pos_test_data_pd = pd.DataFrame(data=pos_test_data, columns=['review'])
neg_test_data_pd = pd.DataFrame(data=neg_test_data, columns=['review'])

# Feature extraction

#### Number of words

Check if number of words in review can predict the grade

In [0]:
pos_train_data_pd['word_count'] = pos_train_data_pd['review'].apply(lambda x: len(str(x).split(" ")))
print('Average number of words in review : ', pos_train_data_pd['word_count'].mean())

Average number of words in review :  236.69568


In [0]:
neg_train_data_pd['word_count'] = neg_train_data_pd['review'].apply(lambda x: len(str(x).split(" ")))
print('Average number of words in review : ', neg_train_data_pd['word_count'].mean())

Average number of words in review :  230.85776


For negative and positive comments number of words is almost identical

#### Average word length

In [0]:
def avg_word(review):
    words = review.split()
    return (sum(len(word) for word in words)/len(words))

pos_train_data_pd['avg_word'] = pos_train_data_pd['review'].apply(lambda x: avg_word(x))
print('Average word length in review : ', pos_train_data_pd['avg_word'].mean())

Average word length in review :  4.660274983731835


In [0]:
neg_train_data_pd['avg_word'] = neg_train_data_pd['review'].apply(lambda x: avg_word(x))
print('Average word length in review : ', neg_train_data_pd['avg_word'].mean())

Average word length in review :  4.627199126537517


There is no difference here

#### Number of stopwords

Before changing and removing the stopwords let's try to find some patterns with default list of stopwords from NLTK library

In [0]:
pos_train_data_pd['stopwords'] = pos_train_data_pd['review'].apply(lambda x: len([x for x in x.split() if x in cachedStopWords]))
print('Average number of stopwords in review : ', pos_train_data_pd['stopwords'].mean())
print('Average part of stopwords in review : ', pos_train_data_pd['stopwords'].mean() / pos_train_data_pd['word_count'].mean())

Average number of stopwords in review :  97.12576
Average part of stopwords in review :  0.410340230966615


In [0]:
neg_train_data_pd['stopwords'] = neg_train_data_pd['review'].apply(lambda x: len([x for x in x.split() if x in cachedStopWords]))
print('Average number of stopwords in review : ', neg_train_data_pd['stopwords'].mean())
print('Average part of stopwords in review : ', neg_train_data_pd['stopwords'].mean() / neg_train_data_pd['word_count'].mean())

Average number of stopwords in review :  94.63528
Average part of stopwords in review :  0.4099289536552724


And now there is nothing notable

#### Number of swear words

In [0]:
#collecting swear words (shifted to the right) and present them in two cases: low and with capital letter
swear_words=[                                                                                                                                        'Bastard', 'Beaver', 'Bellend', 'Bloodclaat', 'Clunge', 'Cock', 'Dick', 'Dickhead', 'Fanny', 'Flaps', 'Gash', 'Knob', 'Minge', 'Prick', 'Punani', 'Pussy', 'Snatch', 'Twat', 'Cunt', 'Fuck', 'Motherfucker', 'Arsehole', 'Balls', 'Bint', 'Bitch', 'Bollocks', 'Bullshit', 'Feck', 'Munter', 'pissed off', 'Shit', 'Son of a bitch', 'Tits']
swear_words += [word.lower() for word in swear_words]

pos_train_data_pd['swear_words'] = pos_train_data_pd['review'].apply(lambda x: len([x for x in x.split() if x in swear_words]))
print('Average number of swear_words in review : ', pos_train_data_pd['swear_words'].mean())
print('Average part of swear_words in review : ', pos_train_data_pd['swear_words'].mean() / pos_train_data_pd['word_count'].mean())

Average number of swear_words in review :  0.01936
Average part of swear_words in review :  8.179278979658606e-05


In [0]:
neg_train_data_pd['swear_words'] = neg_train_data_pd['review'].apply(lambda x: len([x for x in x.split() if x in swear_words]))
print('Average number of swear_words in review : ', neg_train_data_pd['swear_words'].mean())
print('Average part of swear_words in review : ', neg_train_data_pd['swear_words'].mean() / neg_train_data_pd['word_count'].mean())

Average number of swear_words in review :  0.0168
Average part of swear_words in review :  7.277208268849181e-05


Both categories have similar frequencies (very small) of swear words

#### Number of numerics

In [0]:
pos_train_data_pd['numerics'] = pos_train_data_pd['review'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
print('Average number of numerics in review : ', pos_train_data_pd['numerics'].mean())
print('Average part of numerics in review : ', pos_train_data_pd['numerics'].mean() / pos_train_data_pd['word_count'].mean())

Average number of numerics in review :  0.5392
Average part of numerics in review :  0.0022780305918553307


In [0]:
neg_train_data_pd['numerics'] = neg_train_data_pd['review'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
print('Average number of numerics in review : ', neg_train_data_pd['numerics'].mean())
print('Average part of numerics in review : ', neg_train_data_pd['numerics'].mean() / neg_train_data_pd['word_count'].mean())

Average number of numerics in review :  0.61936
Average part of numerics in review :  0.0026828641151157317


Parts of numerics in reviews are small and that there is no visible differense betweed categories

#### Number of Uppercase words (CAPS)

In [0]:
pos_train_data_pd['upper'] = pos_train_data_pd['review'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
print('Average number of upper words in review : ', pos_train_data_pd['upper'].mean())
print('Average part of upper words in review : ', pos_train_data_pd['upper'].mean() / pos_train_data_pd['word_count'].mean())

Average number of upper words in review :  4.53352
Average part of upper words in review :  0.019153370268523703


In [0]:
neg_train_data_pd['upper'] = neg_train_data_pd['review'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
print('Average number of upper words in review : ', neg_train_data_pd['upper'].mean())
print('Average part of upper words in review : ', neg_train_data_pd['upper'].mean() / neg_train_data_pd['word_count'].mean())

Average number of upper words in review :  5.16712
Average part of upper words in review :  0.022382266898890467


And now there is nothing suspicious

#### Number of punctuation marks

In [0]:
punctuation_marks = ['...', ',', '?', '!', ':', ';', '"', '\'', '-', '.', '–', '—']

pos_train_data_pd['punctuation_marks'] = pos_train_data_pd['review'].apply(lambda x: sum([1 for x in x if x in punctuation_marks]))
print('Average number of punctuation marks in review : ', pos_train_data_pd['punctuation_marks'].mean())

Average number of punctuation marks in review :  36.55952


In [0]:
neg_train_data_pd['punctuation_marks'] = neg_train_data_pd['review'].apply(lambda x: sum([1 for x in x if x in punctuation_marks]))
print('Average number of punctuation marks in review : ', neg_train_data_pd['punctuation_marks'].mean())

Average number of punctuation marks in review :  37.66912


Unfortunately this perspective assumption was not justified too

#### Difference between positive and negative smiles

In [0]:
def mood_counter(text):
    braces = 0
    
    for i in text:
    
        if i == ')':
            braces += 1
        
        elif i == '(':
            braces -= 1
    
    return braces

In [0]:
pos_train_data_pd['mood'] = pos_train_data_pd['review'].apply(lambda x: mood_counter(x))
print('Average difference between positive and negative smiles in review : ', pos_train_data_pd['mood'].mean())

Average difference between positive and negative smiles in review :  0.02048


In [0]:
neg_train_data_pd['mood'] = neg_train_data_pd['review'].apply(lambda x: mood_counter(x))
print('Average difference between positive and negative smiles in review : ', neg_train_data_pd['mood'].mean())

Average difference between positive and negative smiles in review :  0.04176


Values are to small but negative reviews twice more positive than positive reviews)) So we will not delete smile or sad brackets 

#### Sentiment

In [0]:
pos_train_data_pd['sentiment'] = pos_train_data_pd['review'].apply(lambda x: TextBlob(x).sentiment[0] )
neg_train_data_pd['sentiment'] = neg_train_data_pd['review'].apply(lambda x: TextBlob(x).sentiment[0] )

print('positive review average sentiment : ', pos_train_data_pd['sentiment'].mean())
print('negative review average sentiment : ', neg_train_data_pd['sentiment'].mean())

positive review average sentiment :  0.19304342622068107
negative review average sentiment :  0.013336767750285623


There is notable difference between senses in these review categories

# Preprocessing

#### Lower case

In [0]:
pos_train_data_pd['review'] = pos_train_data_pd['review'].apply(lambda x: " ".join(x.lower() for x in x.split()))
neg_train_data_pd['review'] = neg_train_data_pd['review'].apply(lambda x: " ".join(x.lower() for x in x.split()))

pos_test_data_pd['review'] = pos_test_data_pd['review'].apply(lambda x: " ".join(x.lower() for x in x.split()))
neg_test_data_pd['review'] = neg_test_data_pd['review'].apply(lambda x: " ".join(x.lower() for x in x.split()))

#### Removing Punctuation

Do not forget that we found some relation between class and brackets

In [0]:
pos_train_data_pd['review'] = pos_train_data_pd['review'].str.replace('[^\w\s()]','')
neg_train_data_pd['review'] = neg_train_data_pd['review'].str.replace('[^\w\s()]','')

pos_test_data_pd['review'] = pos_test_data_pd['review'].str.replace('[^\w\s()]','')
neg_test_data_pd['review'] = neg_test_data_pd['review'].str.replace('[^\w\s()]','')

#### Removing common words

Let's find commonly occuring words which may not be in stopword list

In [0]:
pos_freq = pd.Series(' '.join(pos_train_data_pd['review']).split()).value_counts()
pos_freq[:25]

the      171441
and       88009
a         82810
of        76495
to        66315
is        56974
in        49445
it        38213
i         35251
this      34650
that      33946
br        27474
as        25782
with      22981
for       22131
was       21817
but       20300
film      19468
movie     18064
his       17120
on        16586
are       14748
he        14507
you       14478
not       13939
dtype: int64

In [0]:
neg_freq = pd.Series(' '.join(neg_train_data_pd['review']).split()).value_counts()
neg_freq[:25]

the      161787
a         78498
and       72601
of        68650
to        68579
is        49763
in        43094
this      40321
i         39703
it        38401
that      35131
br        29576
was       26146
movie     23511
for       21573
but       21207
with      20562
as        20176
film      17704
on        16791
not       15856
have      15103
you       14863
are       14543
be        14350
dtype: int64

First 25 words in both lists are similar and we can delete them. But what if not only first 25 are similar? There may be more

In [0]:
#merge lists of 25 common words
#common_words = list(pos_freq[:25].index) + list(neg_freq[:25].index)
common_words = list(pos_freq[:150].index) + list(neg_freq[:150].index)
#remove duplicate elements
common_words = list(set(common_words))
len(common_words)

167

Length 150 means that top-150 words in both categories are the same, length 300 means that top words are completely don't match. Length of the obtained merge list allow us to be sure that the most common words in both categories are almost the same. 

Deleting the most common words

In [0]:
pos_train_data_pd['review'] = pos_train_data_pd['review'].apply(lambda x: " ".join(x for x in x.split() if x not in common_words))
neg_train_data_pd['review'] = neg_train_data_pd['review'].apply(lambda x: " ".join(x for x in x.split() if x not in common_words))

pos_test_data_pd['review'] = pos_test_data_pd['review'].apply(lambda x: " ".join(x for x in x.split() if x not in common_words))
neg_test_data_pd['review'] = neg_test_data_pd['review'].apply(lambda x: " ".join(x for x in x.split() if x not in common_words))

#### Removing of Stop Words

In [0]:
pos_train_data_pd['review'] = pos_train_data_pd['review'].apply(lambda x: " ".join(x for x in x.split() if x not in cachedStopWords))
neg_train_data_pd['review'] = neg_train_data_pd['review'].apply(lambda x: " ".join(x for x in x.split() if x not in cachedStopWords))

pos_test_data_pd['review'] = pos_test_data_pd['review'].apply(lambda x: " ".join(x for x in x.split() if x not in cachedStopWords))
neg_test_data_pd['review'] = neg_test_data_pd['review'].apply(lambda x: " ".join(x for x in x.split() if x not in cachedStopWords))

#### Removing HTML markup and metadata

In [0]:
pos_train_data_pd['review'] = pos_train_data_pd['review'].apply(lambda x:  BeautifulSoup(x, 'html.parser').get_text())
neg_train_data_pd['review'] = neg_train_data_pd['review'].apply(lambda x:  BeautifulSoup(x, 'html.parser').get_text())

pos_test_data_pd['review'] = pos_test_data_pd['review'].apply(lambda x:  BeautifulSoup(x, 'html.parser').get_text())
neg_test_data_pd['review'] = neg_test_data_pd['review'].apply(lambda x:  BeautifulSoup(x, 'html.parser').get_text())

#### Tokenization

In [0]:
def tokenize(text):
    min_length = 3
    words = map(lambda word: word.lower(), word_tokenize(text));
   
    tokens =(list(map(lambda token: PorterStemmer().stem(token), words)));
    
    #wn_lemmatizer = WordNetLemmatizer()
    #tokens = [wn_lemmatizer.lemmatize(token) for token in words]
    
    p = re.compile('[a-zA-Z]+');
    filtered_tokens = list(filter(lambda token:
                  p.match(token) and len(token)>=min_length,
         tokens));
    return filtered_tokens


def tf_idf(train_data, test_data=None, max_feats=50000):
    tfidf = TfidfVectorizer(tokenizer=tokenize, 
                            min_df=3,
                            max_df=0.90, 
                            max_features=max_feats,
                            use_idf=True, 
                            sublinear_tf=True,
                            norm='l2',
                            ngram_range = (1,3));
    
    vectorised_train_data = tfidf.fit_transform(train_data)
    if test_data==None:
        return vectorised_train_data, tfidf
    else:
        vectorised_test_data = tfidf.transform(test_data)
        return vectorised_train_data,vectorised_test_data, tfidf

#### Splitting the data

The default split is 50/50 and now we wil make it 70/30. Don't forget to shuffle data

In [0]:
train_data = list(pos_train_data_pd['review']) + list(neg_train_data_pd['review'])
test_data = list(pos_test_data_pd['review']) + list(neg_test_data_pd['review'])
all_data = train_data + test_data

train_labels = pos_train_labels + neg_train_labels
test_labels = pos_test_labels + neg_test_labels
all_labels = train_labels + test_labels

train_data, test_data, train_labels, test_labels = train_test_split(all_data, all_labels, test_size=0.3, 
                                                                    stratify=all_labels, shuffle=True,
                                                                   random_state=10)

Let's binarize labels

In [0]:
train_labels = [1 if x in [7,8,9] else 0 for x in train_labels]
test_labels = [1 if x in [7,8,9] else 0 for x in test_labels]

# Modeling and scoring

Function that will fit our model and show auc roc score

In [0]:
def clf_fit_show_metric(clf, X_train, y_train, X_test, y_test):
    # Classifier
    classifier = OneVsRestClassifier(clf)
    classifier.fit(X_train, y_train)

    prob_pred = classifier.predict_proba(X_test)[:,1]
    print('auc roc score : ', roc_auc_score(y_test, prob_pred))

## Vectorizing

Let's start from 3000 features in tf-idf. Later we will check other values

In [0]:
%%time
# Tokenisation
vectorizer = TfidfVectorizer(stop_words=cachedStopWords,
                             tokenizer=tokenize)
 
# Learn and transform train documents
vectorised_train_data, vectorised_test_data, tfidf = tf_idf(train_data, test_data, 3000)

n = vectorised_train_data.shape[0] #how many raws we have in the dataset
print('there are ', n, ' raws in the dataset')
num_feats = vectorised_train_data.shape[1]
print('there are ', num_feats, ' features in the dataset')

there are  35000  raws in the dataset
there are  3000  features in the dataset
Wall time: 3min 14s


In [0]:
my_scorer = make_scorer(roc_auc_score)

# SGD classifier

Now we are finding best parameters for SGD classifier

In [0]:
Losf_values = ['log', 'modified_huber']
Alpha_values = [1e-5, 1e-4, 5e-4, 1e-3]
for Losf in Losf_values:  
    for Alpha in Alpha_values:
        print('Loss function : ', Losf)
        print('Alpha : ', Alpha)
        clf_fit_show_metric(SGDClassifier(loss=Losf, alpha=Alpha), 
                           vectorised_train_data, train_labels, 
                           vectorised_test_data, test_labels)
        print('\n')

Loss function :  log
Alpha :  1e-05
auc roc score :  0.813221186195


Loss function :  log
Alpha :  0.0001
auc roc score :  0.821616130875


Loss function :  log
Alpha :  0.0005
auc roc score :  0.814115456082


Loss function :  log
Alpha :  0.001
auc roc score :  0.80902832317


Loss function :  modified_huber
Alpha :  1e-05
auc roc score :  0.739094387847


Loss function :  modified_huber
Alpha :  0.0001
auc roc score :  0.809693887246


Loss function :  modified_huber
Alpha :  0.0005
auc roc score :  0.820085607962


Loss function :  modified_huber
Alpha :  0.001
auc roc score :  0.821231410241




The best model have auc roc score near 0,82. Loss function is 'modified_huber' and regularization constant alpha is 0.001

#### Finding best shape of tf-idf matrix

Let's make an experiment in which we will find the best shape of tf-idf matrix on the example of the best SGD classifier.

In [0]:
%%time

num_features = [1000, 2000, 3000, 5000, 7000, 10000, 15000, 20000]

for feats in num_features:
    print('Num feats : ', feats)
    # Learn and transform train documents
    vectorised_train_data, vectorised_test_data, tfidf = tf_idf(train_data, test_data, feats)

    n = vectorised_train_data.shape[0] #how many raws we have in the dataset
    print('there are ', n, ' raws in the dataset')
    num_feats = vectorised_train_data.shape[1]
    print('there are ', num_feats, ' features in the dataset')

    clf_fit_show_metric(SGDClassifier(loss='modified_huber', alpha=0.001), 
                               vectorised_train_data, train_labels, 
                               vectorised_test_data, test_labels)
    print('\n')

Num feats :  1000
there are  35000  raws in the dataset
there are  1000  features in the dataset
auc roc score :  0.805717012513


Num feats :  2000
there are  35000  raws in the dataset
there are  2000  features in the dataset
auc roc score :  0.816463692356


Num feats :  3000
there are  35000  raws in the dataset
there are  3000  features in the dataset
auc roc score :  0.821276644379


Num feats :  5000
there are  35000  raws in the dataset
there are  5000  features in the dataset
auc roc score :  0.825726319557


Num feats :  7000
there are  35000  raws in the dataset
there are  7000  features in the dataset
auc roc score :  0.827892047925


Num feats :  10000
there are  35000  raws in the dataset
there are  10000  features in the dataset
auc roc score :  0.829479181601


Num feats :  15000
there are  35000  raws in the dataset
there are  15000  features in the dataset
auc roc score :  0.830917811542


Num feats :  20000
there are  35000  raws in the dataset
there are  20000  feat

With an increase of number of features model becomes better, but after 10000 features profit is very small. So let's deal with 10000 features

In [0]:
# Learn and transform train documents
vectorised_train_data, vectorised_test_data, tfidf = tf_idf(train_data, test_data, 10000)

# Support Vector Classification (SVC)

In [0]:
%%time

clf_fit_show_metric( SVC(probability=True), 
                        vectorised_train_data, train_labels, 
                        vectorised_test_data, test_labels)

auc roc score :  0.825239733071
Wall time: 58min 57s


Computing of this methon is very long and result are pretty the same as SGD model gave

# Naive Bayes classification

In [0]:
NB_functions = [BernoulliNB, MultinomialNB]
for NB_func in NB_functions:
    print('NB function : ', NB_func)
    clf_fit_show_metric(NB_func(), 
                        vectorised_train_data, train_labels, 
                        vectorised_test_data, test_labels)
    print('\n')

NB function :  <class 'sklearn.naive_bayes.BernoulliNB'>
auc roc score :  0.775429017718


NB function :  <class 'sklearn.naive_bayes.MultinomialNB'>
auc roc score :  0.808277821996




Multinomial Naive Bayes have auc roc score about 0.81, that is a little worse than metric of the SGD model

# TruncatedSVD

Now let's try to reduce dimension of tf-idf sparse matrix

In [0]:
tsvd = TruncatedSVD(n_components = 1000)

Let's make a pipeline to make data preprocessing easier

In [0]:
preprocessing = Pipeline(steps = [('tfidf', tfidf), ('tsvd',tsvd)])

We will estimate efficiency of the decomposition on the example of SGD classifier with the best parameters we found above

In [0]:
%%time

sgd = SGDClassifier(loss='modified_huber', alpha=0.001)
bst_clf_pipe = Pipeline(steps=[('preprocessing', preprocessing),
                           ('sgd', sgd)
                    ])

clf_fit_show_metric(bst_clf_pipe, 
                               train_data, train_labels, 
                               test_data, test_labels)


auc roc score :  0.821033932538
Wall time: 3min 46s


Reducing the dimension of the matrix didn't make a profit but take more time than fitting without SVD

# Visualization

In [0]:
TextExpl = TextExplainer()

#### Let's find some presentative positive text

In [0]:
for i in range(0,15000):
    pos_text = test_data[i]
    if(bst_clf_pipe.predict_proba([pos_text])[:,1]>0.97):
        positive_index = i
        break

pos_text = test_data[positive_index]

print('Real class : ' , test_labels[positive_index])
print('Predicted class : ', bst_clf_pipe.predict([pos_text]))
print('Probabilities : ', bst_clf_pipe.predict_proba([pos_text]))

Real class :  0
Predicted class :  [1]
Probabilities :  [[0.01444528 0.98555472]]


In [0]:
feature_names_tfidf = bst_clf_pipe.named_steps['preprocessing'].named_steps['tfidf']

In [0]:
%%time

TextExpl.fit(pos_text, bst_clf_pipe.predict_proba)
display(TextExpl.show_prediction(target_names=feature_names_tfidf.get_feature_names()))

Contribution?,Feature
3.649,Highlighted in text (sum)
-0.153,<BIAS>


Wall time: 11.8 s


Here we can see such words as pleasantly, surprised (and they both together because we 2-grams), enjoyed, worth (with definitely). These words are charachterize this review as positive with probability of 0.97

#### Let's find some negative text

In [0]:
for i in range(0,15000):
    neg_text = test_data[i]
    if(bst_clf_pipe.predict_proba([neg_text])[:,0]>0.97):
        negative_index = i
        break

neg_text = test_data[negative_index]

print('Real class : ' , test_labels[negative_index])
print('Predicted class : ', bst_clf_pipe.predict([neg_text]))
print('Probabilities : ', bst_clf_pipe.predict_proba([neg_text]))

Real class :  0
Predicted class :  [0]
Probabilities :  [[1. 0.]]


In [0]:
%%time

TextExpl.fit(neg_text, bst_clf_pipe.predict_proba)
display(TextExpl.show_prediction(target_names=feature_names_tfidf.get_feature_names()))

Contribution?,Feature
5.258,Highlighted in text (sum)
0.342,<BIAS>


Wall time: 14.5 s


And here we see words disaster, poor, unbelievably. Interesting that there is a lot of speculation about the script of the movie. So this review is negative with great chance