# Text Classification

In [None]:
from pathlib import Path
import pandas as pd
import gzip
from urllib.request import urlretrieve
from tqdm import tqdm
import os
import numpy as np
# if you are using the fastAI environment, all of these imports work

In [None]:
class TqdmUpTo(tqdm):
    """tqdm progress bar that can be moved to an absolute position.

    ``update_to`` takes ``(b, bsize, tsize)`` so that it can be handed to
    ``urlretrieve`` as its ``reporthook`` callback.
    """

    def update_to(self, b=1, bsize=1, tsize=None):
        """Move the bar to ``b * bsize`` units.

        b: number of blocks transferred so far.
        bsize: size of one block.
        tsize: total size; when it is not None it replaces ``self.total``.
        """
        if tsize is not None:
            self.total = tsize
        transferred = b * bsize
        # update() is given the difference to what the bar already shows (self.n).
        self.update(transferred - self.n)

In [None]:
def get_data(url, filename):
    """
    Download ``url`` to ``filename`` unless ``filename`` already exists.
    Uses Tqdm to show download progress.

    url: address of the file to fetch; its last path segment labels the progress bar.
    filename: destination path. Missing parent directories are created; a bare
        file name (no directory part) is saved in the current working directory.

    Returns None. Any error raised by the download is propagated after the
    partially written file has been removed.
    """
    if os.path.exists(filename):
        return

    # dirname is '' for a bare file name, and os.makedirs('') raises,
    # so only create directories when there is a directory part.
    dirname = os.path.dirname(filename)
    if dirname:
        os.makedirs(dirname, exist_ok=True)

    with TqdmUpTo(unit='B', unit_scale=True, miniters=1, desc=url.split('/')[-1]) as t:
        try:
            urlretrieve(url, filename, reporthook=t.update_to)
        except BaseException:
            # A half-written file would make the next call skip the download,
            # so delete it before re-raising (also on KeyboardInterrupt).
            if os.path.exists(filename):
                os.remove(filename)
            raise

In [None]:
# Let's download some data:
data_url = 'http://files.fast.ai/data/aclImdb.tgz'
# get_data(data_url, 'data/imdb.tgz')

Before we proceed, *manually extract the files* please!
The *.tgz* extension is equivalent to *.tar.gz* here. 

On Windows, you might need a tool such as *7-Zip*. 
On Linux, you can use *tar -xzvf imdb.tgz* (run it inside the *data* folder, so that the check in the next cell finds *data/aclImdb*). 

In [None]:
# The extracted dataset is expected at <current working directory>/data/aclImdb.
data_path = Path.cwd() / 'data' / 'aclImdb'
# Fail early, and say which path is missing, when the archive was not extracted there.
assert data_path.exists(), f'{data_path} does not exist: extract the archive into the data folder first'

This is to check that we have extracted the files at the correct location

In [None]:
# Print the name of every sub-directory found anywhere below data_path.
for _current_dir, subdir_names, _file_names in os.walk(data_path):
    for subdir_name in subdir_names:
        print(subdir_name)

In [None]:
# Each split lives in its own folder directly under the dataset root.
train_path, test_path = data_path/'train', data_path/'test'

In [None]:
def read_data(dir_path, random_state=None):
    """Read the reviews below ``dir_path`` into a shuffled pandas DataFrame.

    dir_path: ``pathlib.Path`` with a ``pos`` and a ``neg`` sub-directory,
        each holding one UTF-8 text file per review.
    random_state: optional seed passed to ``DataFrame.sample``; give a value
        to get the same row order on every call. The default (None) keeps the
        shuffle unseeded.

    Returns a DataFrame with the columns ``text`` (file contents) and
    ``label`` (1 for ``pos``, 0 for ``neg``) and a fresh 0..n-1 index.
    """

    def load_dir_reviews(reviews_path):
        # read_text opens and closes each file, so no handles are leaked.
        # Sorting makes the pre-shuffle order independent of the order in
        # which the filesystem lists the directory.
        reviews = [
            review_file.read_text(encoding='utf-8')
            for review_file in sorted(reviews_path.iterdir())
        ]
        return pd.DataFrame({'text': reviews})

    pos_reviews = load_dir_reviews(dir_path/'pos')
    neg_reviews = load_dir_reviews(dir_path/'neg')

    pos_reviews['label'] = 1
    neg_reviews['label'] = 0

    merged = pd.concat([pos_reviews, neg_reviews])
    shuffled = merged.sample(frac=1.0, random_state=random_state)  # shuffle the rows
    # drop=True discards the pre-shuffle index instead of keeping it as a column
    return shuffled.reset_index(drop=True)

In [None]:
# NOTE(review): these two assignments repeat an earlier cell verbatim; they are
# harmless, but one of the two cells could be removed.
train_path = data_path/'train'
test_path = data_path/'test'

In [None]:
%%time
# Load both splits; read_data returns a shuffled DataFrame with 'text' and 'label' columns.
train = read_data(train_path)
test = read_data(test_path)

In [None]:
# Show the first five rows of the test set.
test.head(5)

In [None]:
# test.to_csv(data_path/'test.csv', index=False)

In [None]:
# train.to_csv(data_path/'train.csv', index=False)

In [None]:
# Features are the raw review texts, targets are the 0/1 labels.
X_train, X_test = train['text'], test['text']
y_train, y_test = train['label'], test['label']

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression as LR

In [None]:
# Baseline: token counts -> TF-IDF weights -> logistic regression, all with default settings.
lr_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LR()),
])

In [None]:
%%time
# fit() trains the pipeline in place, so lr_clf does not need to be re-assigned.
lr_clf.fit(X=X_train, y=y_train)

In [None]:
# Predict a label for every review in the test set.
lr_predicted = lr_clf.predict(X_test)

In [None]:
# Accuracy = share of test reviews whose predicted label equals the true label.
lr_acc = (lr_predicted == y_test).mean()
lr_acc

In [None]:
def imdb_acc(pipeline_clf, X=None, y=None):
    """Score a fitted classifier and return ``(accuracy, predictions)``.

    pipeline_clf: fitted estimator exposing ``predict``.
    X: inputs to predict on; defaults to the notebook-level ``X_test``.
    y: true labels for ``X``; defaults to the notebook-level ``y_test``.

    Returns a tuple of the fraction of predictions equal to ``y`` and the
    predictions themselves.
    Raises AssertionError when the number of predictions differs from ``len(y)``.
    """
    # Fall back to the test split so that existing one-argument calls keep working.
    X = X_test if X is None else X
    y = y_test if y is None else y
    predictions = pipeline_clf.predict(X)
    assert len(y) == len(predictions)
    # The mean of the element-wise comparison is the fraction of correct predictions.
    return np.mean(predictions == y), predictions

### Remove Stop Words

In [None]:
# Same pipeline as the baseline, but the vectorizer drops English stop words.
lr_clf = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', LR()),
])
lr_clf.fit(X=X_train, y=y_train)
lr_acc, lr_predictions = imdb_acc(lr_clf)
lr_acc

### Increase the Ngram Range

In [None]:
# Stop words removed and n-grams of length 1 to 3 used as features.
lr_clf = Pipeline([
    ('vect', CountVectorizer(stop_words='english', ngram_range=(1, 3))),
    ('tfidf', TfidfTransformer()),
    ('clf', LR()),
])
lr_clf.fit(X=X_train, y=y_train)
lr_acc, lr_predictions = imdb_acc(lr_clf)
lr_acc

# Multinomial Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB as MNB
# Naive Bayes on raw token counts (no TF-IDF step yet).
mnb_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', MNB()),
])

In [None]:
# Train on the training split, then score on the test split via imdb_acc.
mnb_clf.fit(X=X_train, y=y_train)
mnb_acc, mnb_predictions = imdb_acc(mnb_clf)
mnb_acc

### Add TF-IDF

In [None]:
# Naive Bayes again, now with a TF-IDF step between the counts and the classifier.
mnb_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MNB()),
])
mnb_clf.fit(X=X_train, y=y_train)
mnb_acc, mnb_predictions = imdb_acc(mnb_clf)
mnb_acc

### Remove Stop Words

In [None]:
# TF-IDF Naive Bayes with English stop words dropped by the vectorizer.
mnb_clf = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', MNB()),
])
mnb_clf.fit(X=X_train, y=y_train)
mnb_acc, mnb_predictions = imdb_acc(mnb_clf)
mnb_acc

### Add Ngram Range from 1 to 3

In [None]:
# Stop words removed and n-grams of length 1 to 3 used as features.
mnb_clf = Pipeline([
    ('vect', CountVectorizer(stop_words='english', ngram_range=(1, 3))),
    ('tfidf', TfidfTransformer()),
    ('clf', MNB()),
])
mnb_clf.fit(X=X_train, y=y_train)
mnb_acc, mnb_predictions = imdb_acc(mnb_clf)
mnb_acc

### Change Fit Prior to False

In [None]:
# Same features as the previous experiment; the classifier is built with fit_prior=False.
mnb_clf = Pipeline([
    ('vect', CountVectorizer(stop_words='english', ngram_range=(1, 3))),
    ('tfidf', TfidfTransformer()),
    ('clf', MNB(fit_prior=False)),
])
mnb_clf.fit(X=X_train, y=y_train)
mnb_acc, mnb_predictions = imdb_acc(mnb_clf)
mnb_acc

### Support Vector Machine

In [None]:
from sklearn.svm import SVC
# Counts -> TF-IDF -> support vector classifier with default settings.
svc_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SVC()),
])
svc_clf.fit(X=X_train, y=y_train)
svc_acc, svc_predictions = imdb_acc(svc_clf)
print(svc_acc)  # 0.6562

## Tree Based Models

### Decision Trees

In [None]:
from sklearn.tree import DecisionTreeClassifier as DTC
# Counts -> TF-IDF -> a single decision tree with default settings.
dtc_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', DTC()),
])
dtc_clf.fit(X=X_train, y=y_train)
dtc_acc, dtc_predictions = imdb_acc(dtc_clf)
dtc_acc

## Random Forest Classifier 

In [None]:
from sklearn.ensemble import RandomForestClassifier as RFC
# Counts -> TF-IDF -> random forest with default settings.
rfc_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', RFC()),
])
rfc_clf.fit(X=X_train, y=y_train)
rfc_acc, rfc_predictions = imdb_acc(rfc_clf)
rfc_acc

## Extra Trees Classifier 

In [None]:
from sklearn.ensemble import ExtraTreesClassifier as XTC
# Counts -> TF-IDF -> extra-trees classifier with default settings.
xtc_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', XTC()),
])
xtc_clf.fit(X=X_train, y=y_train)
xtc_acc, xtc_predictions = imdb_acc(xtc_clf)
xtc_acc

# Automatically Fine Tuning 

### RandomizedSearch

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Candidate values per pipeline parameter; keys are '<step name>__<parameter>'.
param_grid = {
    'clf__C': [50, 75, 85, 100],
    'vect__stop_words': ['english', None],
    'vect__ngram_range': [(1, 1), (1, 3)],
    'vect__lowercase': [True, False],
}

In [None]:
# Try 5 parameter combinations from param_grid, each scored by 3-fold cross-validated accuracy.
random_search = RandomizedSearchCV(
    lr_clf,
    param_distributions=param_grid,
    n_iter=5,
    scoring='accuracy',
    n_jobs=-1,
    cv=3,
)
random_search.fit(X_train, y_train)

In [None]:
# Score of the best parameter combination found by the random search.
print(f'Calculated cross-validation accuracy: {random_search.best_score_}')

In [None]:
# Keep a handle on the pipeline that achieved the best search score.
best_random_clf = random_search.best_estimator_

In [None]:
# NOTE(review): the search was created without refit=False, and scikit-learn then
# presumably refits best_estimator_ on the full training data already — confirm
# whether this extra fit is needed.
best_random_clf.fit(X_train, y_train)

In [None]:
# Test-set accuracy and predictions of the best pipeline from the random search.
imdb_acc(best_random_clf)


In [None]:
# Inspect the pipeline steps to see which parameters the search selected.
best_random_clf.steps

In [None]:
# Base pipeline for the grid search: n-grams of length 1 to 3, stop words kept.
lr_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 3))),
    ('tfidf', TfidfTransformer()),
    ('clf', LR()),
])

In [None]:
from sklearn.model_selection import GridSearchCV
# Only the classifier's C value is searched here; every listed value is tried.
param_grid = {'clf__C': [85, 100, 125, 150]}
grid_search = GridSearchCV(
    lr_clf,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=3,
)

In [None]:
%%time
# Run the grid search on the training split.
grid_search.fit(X_train, y_train)

In [None]:
# Inspect the steps of the best pipeline to see which C value was selected.
grid_search.best_estimator_.steps

In [None]:
# Compare the best cross-validation score of the grid search with that of the random search.
print(f'Calculated cross-validation accuracy: {grid_search.best_score_} while random_search was {random_search.best_score_}')

In [None]:
%%time
best_grid_clf = grid_search.best_estimator_
# NOTE(review): the search was created without refit=False, and scikit-learn then
# presumably refits best_estimator_ on the full training data already — confirm
# whether this extra fit is needed.
best_grid_clf.fit(X_train, y_train)

In [None]:
# Test-set accuracy and predictions of the best pipeline from the grid search.
imdb_acc(best_grid_clf)

# Ensemble Models 

## Voting Ensemble

### Simple Majority (aka Hard Voting)

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
%%time
voting_clf = VotingClassifier(estimators=[('xtc', xtc_clf), ('rfc', rfc_clf)], voting='hard', n_jobs=-1)
voting_clf.fit(X_train, y_train)

In [None]:
# Only the accuracy is kept; the predictions returned by imdb_acc are discarded.
hard_voting_acc, _ = imdb_acc(voting_clf)
hard_voting_acc

#### Soft Voting

In [None]:
%%time
voting_clf = VotingClassifier(estimators=[('lr', lr_clf), ('mnb', mnb_clf)], voting='soft', n_jobs=-1)
voting_clf.fit(X_train, y_train)

In [None]:
import warnings
# Ignores every warning category from here on.
# NOTE(review): a blanket filter can hide useful warnings (for example about
# convergence) — consider restricting it to a specific category.
warnings.filterwarnings('ignore')

In [None]:
# Only the accuracy is kept; the predictions returned by imdb_acc are discarded.
soft_voting_acc, _ = imdb_acc(voting_clf)
soft_voting_acc

In [None]:
# lr_acc is the accuracy from the last scored logistic regression pipeline.
# NOTE(review): in the cells above that is the stop-word + n-gram model, not the
# lr_clf that was redefined for the grid search — confirm this is the intended baseline.
gain_acc = soft_voting_acc - lr_acc
if gain_acc > 0:
    print(f'We see that the soft voting gives us an absolute accuracy gain of {gain_acc*100:.2f}% ')
else:
    # Report the outcome as well when the ensemble does not beat the baseline.
    print(f'Soft voting gives no accuracy gain over logistic regression (difference: {gain_acc*100:.2f}%)')

### Weighted Classifiers

In [None]:
%%time
weighted_voting_clf = VotingClassifier(estimators=[('lr', lr_clf), ('lr2', lr_clf),('rf', xtc_clf), ('mnb2', mnb_clf),('mnb', mnb_clf)], voting='soft', n_jobs=-1)
weighted_voting_clf.fit(X_train, y_train)

Repeat the experiment with 'hard' voting instead of 'soft' voting. This will show you how the voting strategy influences the accuracy of our ensemble classifier. 

In [None]:
# Only the accuracy is kept; the predictions returned by imdb_acc are discarded.
weighted_voting_acc, _ = imdb_acc(weighted_voting_clf)
weighted_voting_acc

In [None]:
# Compare the weighted ensemble with lr_acc, the accuracy from the last scored
# logistic regression pipeline.
gain_acc = weighted_voting_acc - lr_acc
if gain_acc > 0:
    print(f'We see that the weighted voting gives us an absolute accuracy gain of {gain_acc*100:.2f}%')
else:
    # Report the outcome as well when the ensemble does not beat the baseline.
    print(f'Weighted voting gives no accuracy gain over logistic regression (difference: {gain_acc*100:.2f}%)')

In [None]:
# Correlation between the two models' test predictions (off-diagonal entry of the 2x2 matrix).
np.corrcoef(mnb_predictions, lr_predictions)[0, 1]  # this is too high a correlation

In [None]:
%%time
corr_voting_clf = VotingClassifier(estimators=[('lr', lr_clf), ('mnb', mnb_clf)], voting='soft', n_jobs=-1)
corr_voting_clf.fit(X_train, y_train)
corr_acc, _ = imdb_acc(corr_voting_clf)
print(corr_acc)

In [None]:
# Correlation between the decision tree's and the extra-trees classifier's test predictions.
np.corrcoef(dtc_predictions,xtc_predictions )[0][1] # this looks like a low correlation

In [None]:
%%time
low_corr_voting_clf = VotingClassifier(estimators=[('dtc', dtc_clf), ('xtc', xtc_clf)], voting='soft', n_jobs=-1)
low_corr_voting_clf.fit(X_train, y_train)
low_corr_acc, _ = imdb_acc(low_corr_voting_clf)
print(low_corr_acc)