# Modern Methods for Text Classification

In [1]:
# if you are using the fastAI environment, all of these imports work
import gzip
import os
from tqdm import tqdm
from urllib.request import urlretrieve
from pathlib import Path
import numpy as np
import pandas as pd

In [2]:
import spacy
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [3]:
class TqdmUpTo(tqdm):
    def update_to(self, b=1, bsize=1, tsize=None):
        if tsize is not None: self.total = tsize
        self.update(b * bsize - self.n)

def get_data(url, filename):
    """
    Download data if the filename does not exist already
    Uses Tqdm to show download progress
    """
    if not os.path.exists(filename):

        dirname = os.path.dirname(filename)
        if not os.path.exists(dirname):
            os.makedirs(dirname)

        with TqdmUpTo(unit='B', unit_scale=True, miniters=1, desc=url.split('/')[-1]) as t:
            urlretrieve(url, filename, reporthook=t.update_to)

In [4]:
# Let's download some data:
data_url = 'http://files.fast.ai/data/aclImdb.tgz'
get_data(data_url, 'data/imdb.tgz')
# Manually extract the files above - your extractor depends on your Operating System. I used 7z on Windows and dtrx on Linux-Ubuntu16.04LTS. 

In [5]:
data_dir = Path(os.getcwd())/'data'/'imdb'/'aclImdb'
assert data_dir.exists()
for pathroute in os.walk(data_dir):
    next_path = pathroute[1]
    for stop in next_path:
        print(stop)

test
train
all
neg
pos
all
neg
pos
unsup


# Read data into separate dataframes/strings

In [6]:
train_path = data_dir/'train'
test_path = data_dir/'test'
assert train_path.exists()
assert test_path.exists()

In [7]:
def load_data(dir_path):
    
    def load_dir_reviews(reviews_path):    
        files_list = list(reviews_path.iterdir())
        reviews = []
        for filename in files_list:
            f = open(filename, 'r', encoding='utf-8')
            reviews.append(f.read())
        return pd.DataFrame({'text':reviews})
        
    
    pos_path = dir_path/'pos'
    neg_path = dir_path/'neg'
    pos_reviews, neg_reviews = load_dir_reviews(pos_path), load_dir_reviews(neg_path)
    pos_reviews['label'] = 1
    neg_reviews['label'] = 0
    merged = pd.concat([pos_reviews, neg_reviews])
    merged.reset_index(inplace=True)
    return merged

In [8]:
%time
train = load_data(train_path)
test = load_data(test_path)
X_train, y_train = train['text'], train['label']
X_test, y_test = test['text'], test['label']

Wall time: 0 ns


## Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression as LR

In [10]:
lr_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf',LR())])

In [11]:
%%time
lr_clf.fit(X=X_train, y=y_train)
lr_predicted = lr_clf.predict(X_test)
lr_acc = sum(lr_predicted == y_test)/len(lr_predicted)
print(lr_acc)

0.88316
Wall time: 10 s


**Let's  keep another model for reference**

In [12]:
lr_clf2 = Pipeline([('vect', CountVectorizer(ngram_range=(1,3))), ('tfidf', TfidfTransformer()), ('clf',LR())])
lr_clf2.fit(X=X_train, y=y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words=None,
        strip...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [13]:
lr2_predicted = lr_clf2.predict(X_test)
lr2_acc = sum(lr2_predicted == y_test)/len(lr_predicted)
print(lr2_acc)

0.87752


# Multinomial Naive Bayes

Note: Why is the above called Naive? There are more powerful and complex methods involving Bayesian approaches. 

In [14]:
from sklearn.naive_bayes import MultinomialNB as MNB

**Keeping the best performing model from Previous**

In [15]:
mnb_clf = Pipeline([('vect', CountVectorizer(stop_words='english', ngram_range=(1,3))), ('tfidf', TfidfTransformer()), ('clf',MNB())])

In [16]:
mnb_clf.fit(X=X_train, y=y_train)
mnb_predicted = mnb_clf.predict(X_test)
sum(mnb_predicted == y_test)/len(y_test)

0.8572

## Extra Trees Classifier 

**Keeping the best performing model from Previous**

In [17]:
from sklearn.ensemble import ExtraTreesClassifier as XTC
xtc_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf',XTC())])

In [18]:
xtc_clf.fit(X=X_train, y=y_train)
xtc_predicted = xtc_clf.predict(X_test)
sum(xtc_predicted == y_test)/len(y_test)

0.74304

In [19]:
samples = 20
xtc_predicted[:samples], mnb_predicted[:samples], lr_predicted[:samples]

(array([0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1],
       dtype=int64),
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1],
       dtype=int64),
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       dtype=int64))

We notice that the predictions don't agree with each other even on the first few samples. Let's write a function to take a simple majority prediction. 

In [20]:
def voting(predictions, style="majority"):
    count_p = len(predictions)  # number of predictions
    assert count_p > 1
    return np.asarray([int(x) for x in sum(predictions) > count_p // 2])

In [21]:
predictions = [lr_predicted, lr2_predicted, mnb_predicted]

In [22]:
simple_majority = voting(predictions)
sum(simple_majority == y_test)/len(y_test)

0.88596

In [23]:
np.corrcoef(lr2_predicted, lr_predicted)[0][1]

0.884299617053972

In [24]:
np.corrcoef(mnb_predicted, lr_predicted)[0][1]

0.7858874230493662

In [25]:
np.corrcoef(xtc_predicted, lr_predicted)[0][1]

0.5480712023729699

In [26]:
np.corrcoef(xtc_predicted, mnb_predicted)[0][1]

0.5403442772974716

In [27]:
np.corrcoef(xtc_predicted, lr2_predicted)[0][1]

0.5466513588106106

In [28]:
predictions = [mnb_predicted, mnb_predicted, xtc_predicted]

In [29]:
uncorrelated_majority = voting(predictions)
sum(uncorrelated_majority == y_test)/len(y_test)

0.8572

In [30]:
predictions = [lr_predicted, lr_predicted, lr2_predicted, mnb_predicted, xtc_predicted]

In [31]:
weighted_majority = voting(predictions)
sum(weighted_majority == y_test)/len(y_test)

0.88524

In [32]:
ifrom sklearn.ensemble import VotingClassifier

In [44]:
%%time
voting_clf = VotingClassifier(estimators=[('lr', lr_clf), ('rf', xtc_clf), ('mnb', mnb_clf)], voting='hard')
voting_clf.fit(X_train, y_train)

Wall time: 0 ns


VotingClassifier(estimators=[('lr', Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), pre...ar_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]))],
         flatten_transform=None, n_jobs=1, voting='hard', weights=None)

In [45]:
voting_predictions = voting_clf.predict(X_test)
sum(voting_predictions == y_test)/len(y_test)

  if diff:


0.87452

In [51]:
%%time
voting_clf = VotingClassifier(estimators=[('lr', lr_clf), ('rf', xtc_clf), ('mnb', mnb_clf)], voting='soft')
voting_clf.fit(X_train, y_train)
voting_predictions = voting_clf.predict(X_test)
sum(voting_predictions == y_test)/len(y_test)

  if diff:


Wall time: 2min 42s


In [None]:
%%time
voting_clf = VotingClassifier(estimators=[('lr', lr_clf),('rf', xtc_clf), ('mnb', mnb_clf)], voting='soft')
voting_clf.fit(X_train, y_train)
voting_predictions = voting_clf.predict(X_test)
sum(voting_predictions == y_test)/len(y_test)

In [49]:
%%time
voting_clf = VotingClassifier(estimators=[('lr', lr_clf), ('lr2', lr_clf),('rf', xtc_clf), ('mnb2', mnb_clf),('mnb', mnb_clf)], voting='soft')
voting_clf.fit(X_train, y_train)
voting_predictions = voting_clf.predict(X_test)
sum(voting_predictions == y_test)/len(y_test)

Wall time: 0 ns


  if diff:


0.88536