### Libraries

In [None]:
!pip install datasets transformers

Collecting datasets
  Downloading datasets-1.17.0-py3-none-any.whl (306 kB)
[?25l[K     |█                               | 10 kB 22.6 MB/s eta 0:00:01[K     |██▏                             | 20 kB 29.2 MB/s eta 0:00:01[K     |███▏                            | 30 kB 26.2 MB/s eta 0:00:01[K     |████▎                           | 40 kB 20.0 MB/s eta 0:00:01[K     |█████▍                          | 51 kB 15.9 MB/s eta 0:00:01[K     |██████▍                         | 61 kB 17.7 MB/s eta 0:00:01[K     |███████▌                        | 71 kB 15.1 MB/s eta 0:00:01[K     |████████▋                       | 81 kB 16.3 MB/s eta 0:00:01[K     |█████████▋                      | 92 kB 14.9 MB/s eta 0:00:01[K     |██████████▊                     | 102 kB 14.3 MB/s eta 0:00:01[K     |███████████▊                    | 112 kB 14.3 MB/s eta 0:00:01[K     |████████████▉                   | 122 kB 14.3 MB/s eta 0:00:01[K     |██████████████                  | 133 kB 14.3 MB/s et

In [None]:
# Standard library
import glob
from pathlib import Path

# Data Manipulation
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
from IPython.display import Audio, display

# NLP
import spacy
from datasets import load_dataset

# ML
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.dummy import DummyClassifier
from joblib import dump, load
from tqdm import tqdm

# DL
import torch
from torch.utils.data import TensorDataset, DataLoader
from transformers import AutoModel, AutoTokenizer

In [None]:
from google.colab import drive
# Mount Google Drive inside the Colab VM (force_remount=True asks for a fresh mount).
drive.mount('/content/gdrive', force_remount=True)
# Project folder on Drive. The path is built from the current working directory,
# so it only matches the mount point above when the kernel runs from /content.
check = Path.cwd()/'gdrive/MyDrive/TMProject'
# Recursively list every .npy artefact stored under the project folder.
files = list(check.glob("**/*.npy"))

Mounted at /content/gdrive


### Helper Functions

In [None]:
def paired_bootstrap_test(test_set, model1,model2, B, score,*args,**kwargs):
    r"""
    Generate \delta(x) and \delta(x^{(i)}) for B bootstrap samples.

    \delta is the score of model A minus the score of model B. It is computed
    once on the full test set and once on each of B resamples drawn with
    replacement (same size as the test set, same rows for both models).

    Reference:
        Jurafsky, Daniel, and James H. Martin. "Speech and Language Processing: An Introduction to Natural Language Processing, Computational Linguistics, and Speech Recognition."
    Params:
    ------
    test_set: np.ndarray, Array of test outputs (1-D labels or a 2-D indicator matrix).
    model1: np.ndarray, Array of Model A's output, aligned row-wise with test_set.
    model2: np.ndarray, Array of Model B's output, aligned row-wise with test_set.
    B : int, No of Bootstrap's to be generated
    score: Evaluation algorithm, called as score(y_true, y_pred, *args, **kwargs).

    Returns:
    -------
    (deltas, delta): deltas is an np.ndarray with one \delta(x^{(i)}) per
    bootstrap sample (empty when B <= 0); delta is \delta(x) on the full test set.
    """
    N = test_set.shape[0]
    # Observed difference on the untouched test set.
    delta = score(test_set, model1, *args, **kwargs) - score(test_set, model2, *args, **kwargs)
    deltas = [] # for storing \delta(x^{(i)}) of bootstraps
    for _ in tqdm(range(B)):
        # One shared draw of row indices keeps the comparison paired.
        ind = np.random.randint(low=0, high=N, size=N)
        # Index along the first axis only, so 1-D and 2-D inputs both work.
        sampleY = test_set[ind]
        sc1 = score(sampleY, model1[ind], *args, **kwargs)
        sc2 = score(sampleY, model2[ind], *args, **kwargs)
        deltas.append(sc1 - sc2)
    return (np.array(deltas), delta)

def hypothesis_test(dx_i,dx, significance=0.05, hypothesis=""):
    """
    Implementation of paired-bootstrap test.
    Reference: Berg-Kirkpatrick, et. al. "An empirical investigation of statistical significance in nlp."

    Params:
    ------
    dx_i: array-like, bootstrap deltas (e.g. the first value returned by paired_bootstrap_test).
    dx: float, delta observed on the full test set.
    significance: float, threshold below which the null hypothesis is rejected.
    hypothesis: str, description inserted into the printed verdict.

    Returns:
    -------
    p_value: fraction of bootstrap deltas that are at least twice the observed delta.

    Raises:
    ------
    ValueError: if dx_i is empty (the p-value would otherwise be NaN).
    """
    # Accept lists/tuples as well as arrays; comparing a plain list to a float fails.
    dx_i = np.asarray(dx_i)
    if dx_i.size == 0:
        raise ValueError("dx_i must contain at least one bootstrap delta")
    p_value = np.mean(dx_i>= (2*dx))
    if p_value<significance:
        print(f"We reject the null hypothesis {hypothesis} at a significance of {significance}")
    else:
        print(f"We fail to reject the Null Hypothesis {hypothesis} at a significance of {significance}")
    return p_value

In [None]:
def get_data(*args, **kwargs):
    """Load the Reuters-21578 ModApte split as text features and binarised topic labels.

    Rows whose `topics` entry is empty are dropped from both splits. Their index
    labels are returned as well, so that arrays computed on the unfiltered splits
    (e.g. cached embeddings) can be filtered to match. Positional and keyword
    arguments are accepted but ignored.

    Returns
    -------
    X_train, X_test : the `text` column of the filtered train / test frames.
    y_train, y_test : indicator matrices from a MultiLabelBinarizer fitted on the
        training topics only.
    empty_train, empty_test : index labels of the dropped (topic-less) rows.
    """
    # ModApte split, as used in "A re-examination of text categorization methods".
    dataset = load_dataset("reuters21578", "ModApte")
    train_split, test_split = dataset['train'], dataset['test']
    train_split.set_format(type="pandas")
    test_split.set_format(type="pandas")
    frame_train = train_split[:]
    frame_test = test_split[:]

    # Boolean masks of rows without any topic; reused for the indexes and the filter.
    no_topic_train = frame_train.topics.str.len().eq(0)
    no_topic_test = frame_test.topics.str.len().eq(0)
    empty_train = frame_train.index[no_topic_train]
    empty_test = frame_test.index[no_topic_test]

    # Keep only the text and topics columns of the rows that carry topics.
    unwanted = [col for col in frame_train.columns if col not in ['text', 'topics']]
    frame_train = frame_train[~no_topic_train].drop(unwanted, axis=1)
    frame_test = frame_test[~no_topic_test].drop(unwanted, axis=1)

    binarizer = MultiLabelBinarizer()
    y_train = binarizer.fit_transform(frame_train.topics)
    y_test = binarizer.transform(frame_test.topics)
    return frame_train['text'], frame_test['text'], y_train, y_test, empty_train, empty_test

def spacy_tokenizer(text):
    """Tokenise `text` with spaCy and return the token strings that are neither stop words nor punctuation.

    Relies on a module-level `nlp` pipeline object, which is called with the
    'ner', 'tagger' and 'parser' components disabled.
    """
    parsed = nlp(text, disable=['ner', 'tagger', 'parser'])
    kept = []
    for tok in parsed:
        if tok.is_stop or tok.is_punct:
            continue
        kept.append(tok.text)
    return kept

# def allDone():
#   display(Audio(url='https://sound.peal.io/ps/audios/000/000/537/original/woo_vu_luvub_dub_dub.wav', autoplay=True))


### Data Loading

In [None]:
# Text splits and binarised labels; the two trailing values (indexes of the
# topic-less rows) are not needed for the TF-IDF models and are discarded.
xtrain,xtest,ytrain,ytest, _,_ = get_data()

Downloading:   0%|          | 0.00/4.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.37k [00:00<?, ?B/s]

Downloading and preparing dataset reuters21578/ModApte (download: 7.77 MiB, generated: 12.48 MiB, post-processed: Unknown size, total: 20.25 MiB) to /root/.cache/huggingface/datasets/reuters21578/ModApte/1.0.0/98a2ad6a0242627562db83992f9625261854c40a88619322596153a5a16a206c...


Downloading:   0%|          | 0.00/8.15M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset reuters21578 downloaded and prepared to /root/.cache/huggingface/datasets/reuters21578/ModApte/1.0.0/98a2ad6a0242627562db83992f9625261854c40a88619322596153a5a16a206c. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

  "unknown class(es) {0} will be ignored".format(sorted(unknown, key=str))


In [None]:
# Word-level TF-IDF vectoriser using scikit-learn's built-in English stop-word list.
tfidf = TfidfVectorizer(analyzer='word', stop_words='english')

In [None]:
# Learn the vocabulary and IDF weights from the training texts and vectorise them.
train_tfidf = tfidf.fit_transform(xtrain)

In [None]:
# Vectorise the test texts with the vocabulary/IDF weights fitted on the training set.
test_tfidf = tfidf.transform(xtest)

### DL based Data Loading

In [None]:
# Reload the splits, this time keeping the indexes of the topic-less rows so the
# cached BERT embeddings can be filtered to the same rows as the labels.
xtrain,xtest,ytrain, ytest, empty_train,empty_test = get_data()

Reusing dataset reuters21578 (/root/.cache/huggingface/datasets/reuters21578/ModApte/1.0.0/98a2ad6a0242627562db83992f9625261854c40a88619322596153a5a16a206c)


  0%|          | 0/3 [00:00<?, ?it/s]

  "unknown class(es) {0} will be ignored".format(sorted(unknown, key=str))


In [None]:
# Shapes of the cached embedding matrices.
# NOTE(review): in the visible notebook xtrain_bert / xtest_bert are first assigned
# in the following cell, so this cell fails on a fresh kernel unless run after it.
xtrain_bert.shape, xtest_bert.shape

((9603, 768), (3299, 768))

In [None]:
# Load the cached embedding matrices from the Drive project folder.
xtest_bert = np.load(check/'bert_embedded_test.npy')
xtrain_bert = np.load(check/'bert_embedded_train.npy')

# Drop the rows of documents without topics so the features line up with the
# labels returned by get_data (which removed the same rows).
xtest_bert = np.delete(xtest_bert, empty_test, 0)
xtrain_bert = np.delete(xtrain_bert, empty_train, 0)

# Torch views of the label matrices (not used by the scikit-learn models below).
ytrain_dl = torch.from_numpy(ytrain)
ytest_dl = torch.from_numpy(ytest)

In [None]:
# Base estimators for the BERT-feature models; the two ensembles use 10 estimators each.
# No random_state is set, so fitted models differ between runs.
dtree = DecisionTreeClassifier()
rftree = RandomForestClassifier(n_estimators=10)
adaboost = AdaBoostClassifier(n_estimators=10)

In [None]:
# Wrap each base estimator in a one-vs-rest classifier (one binary model per label
# column); the two ensembles are fitted with 2 parallel jobs.
clf_dtree = OneVsRestClassifier(dtree)
clf_rf = OneVsRestClassifier(rftree,n_jobs=2)
clf_adab = OneVsRestClassifier(adaboost, n_jobs=2)

In [None]:
# Fit the decision-tree model on the BERT features, predict on the training set
# itself and store those predictions in the Drive project folder.
clf_dtree.fit(xtrain_bert, ytrain)
ytrain_dt = clf_dtree.predict(xtrain_bert)
np.save(f"{check}/ytrain_dt.npy",ytrain_dt)

In [None]:
# Fit the one-vs-rest random forest on the BERT features.
clf_rf.fit(xtrain_bert, ytrain)

OneVsRestClassifier(estimator=RandomForestClassifier(n_estimators=10), n_jobs=2)

In [None]:
# Fit the one-vs-rest AdaBoost model on the BERT features.
clf_adab.fit(xtrain_bert, ytrain)

OneVsRestClassifier(estimator=AdaBoostClassifier(n_estimators=10), n_jobs=2)

In [None]:
# Training-set predictions of the two ensemble models.
ytrain_rf = clf_rf.predict(xtrain_bert)
ytrain_adab = clf_adab.predict(xtrain_bert)

In [None]:
# Store the random-forest and AdaBoost training predictions next to the decision-tree ones.
np.save(f"{check}/ytrain_rf.npy", ytrain_rf)
np.save(f"{check}/ytrain_adab.npy", ytrain_adab)

### Modeling

### Parameter Setting

In [None]:
# Hyper-parameters shared by the shallow models defined below.
jobs=2  # n_jobs for the KNN pipeline
neighbors = 45  # n_neighbors for KNN
kernel='linear'  # SVC kernel
cache = 200  # passed as SVC cache_size in the SVM pipeline
layers = (64,)  # hidden_layer_sizes for the MLP: one hidden layer of 64 units

### Baseline

In [None]:
# Baseline that ignores the features; "stratified" is scikit-learn's strategy of
# drawing predictions according to the training class distribution.
dummy_clf =  DummyClassifier(strategy = "stratified")

In [None]:
# Same TF-IDF front end as the real models, followed by the dummy classifier.
dummy_pipe = Pipeline(steps= [('tf_idf', TfidfVectorizer(analyzer='word', stop_words='english')),("dummy",dummy_clf)])

# One-vs-rest wrapper: one copy of the pipeline is fitted per label column.
dummy_ovr = OneVsRestClassifier(dummy_pipe)

In [None]:
# Fit the baseline on the raw training texts and the binarised labels.
dummy_ovr.fit(xtrain,ytrain)

OneVsRestClassifier(estimator=Pipeline(steps=[('tf_idf',
                                               TfidfVectorizer(stop_words='english')),
                                              ('dummy',
                                               DummyClassifier(strategy='stratified'))]))

In [None]:
# Baseline predictions for both splits.
ytest_dummy = dummy_ovr.predict(xtest)
ytrain_dummy = dummy_ovr.predict(xtrain)

In [None]:
# Sanity check: prints the (rows, label columns) shape of each prediction matrix.
print(ytest_dummy.shape)
print(ytrain_dummy.shape)

(3019, 115)
(7775, 115)


In [None]:
# Store the baseline predictions in the Drive project folder.
np.save(f"{check}/ytrain_dummy.npy",ytrain_dummy)
np.save(f"{check}/ytest_dummy.npy",ytest_dummy)

### Shallow Models

In [None]:
# Stop-word setting shared by every pipeline below. The name was never defined in
# this notebook (NameError on a fresh kernel); 'english' matches the `tfidf` and
# baseline vectorisers defined in earlier cells.
stop_words = 'english'

# Text pipelines: TF-IDF features followed by one shallow classifier each.
# Naive Bayes
nb_tfv = Pipeline(steps = [('tf_idf', TfidfVectorizer(analyzer='word', stop_words = stop_words)),('nb', MultinomialNB()) ])

# KNN
knn_tfv = Pipeline(steps = [('tf_idf', TfidfVectorizer(analyzer='word', stop_words = stop_words)),('knn', KNeighborsClassifier(n_neighbors=neighbors, n_jobs=jobs))])

# SVM
svm_tfv = Pipeline(steps = [('tf_idf', TfidfVectorizer(analyzer='word', stop_words = stop_words)),('svm', SVC(kernel = kernel, cache_size=cache))])

# Neural network (fed the TF-IDF features directly; no extra scaling step)
nn_tfv = Pipeline(steps = [('tf_idf', TfidfVectorizer(analyzer='word', stop_words = stop_words)),('nnet',MLPClassifier(hidden_layer_sizes=layers))])

# Stand-alone estimators with the same hyper-parameters, for use on pre-computed
# TF-IDF matrices.
nb = MultinomialNB()
knn = KNeighborsClassifier(n_neighbors=neighbors, n_jobs=jobs)
svm = SVC(kernel=kernel, cache_size=cache)
nn = MLPClassifier(hidden_layer_sizes=layers)


In [None]:
# Re-declares the Drive project folder (same expression as in the mount cell near
# the top of the notebook), so this section can run without that cell's `check`.
from pathlib import Path
check = Path.cwd()/'gdrive/MyDrive/TMProject'

In [None]:
# Fit each stand-alone estimator (one-vs-rest) on the pre-computed TF-IDF matrix,
# predict on the training set and store the predictions as <name>_train.npy.
for name, model in zip(['naive_bayes','knn','svm','nn'],[nb,knn,svm,nn]):
    # `model` is rebound to the fitted one-vs-rest wrapper.
    model = OneVsRestClassifier(model).fit(train_tfidf, ytrain)
    y_train_pred = model.predict(train_tfidf)
    print(y_train_pred.shape)
    np.save(f"{check}/{name}_train.npy", y_train_pred)

(7775, 115)
(7775, 115)
(7775, 115)




In [None]:
# nb_tfv.fit(xtrain,ytrain)

In [None]:
# One-vs-rest wrappers around the four text pipelines, in the order NB, KNN, SVM, MLP;
# n_jobs=-1 asks for all available cores when fitting the per-label models.
pipelines = [nb_tfv,knn_tfv,svm_tfv,nn_tfv]
classifiers = [OneVsRestClassifier(pipeline, n_jobs=-1) for pipeline in pipelines]

In [None]:
# Fit every pipeline-based classifier on the raw training texts.
for classifier in classifiers:
    classifier.fit(xtrain,ytrain)

In [None]:
# Test-set predictions of the fitted pipeline classifiers. `classifiers` is ordered
# NB, KNN, SVM, MLP; these arrays were never assigned in the notebook before.
y_pred_nb, y_pred_knn, y_pred_svm, y_pred_nn = [clf.predict(xtest) for clf in classifiers]

# Macro-averaged F1 per model; zero_division=True scores labels with no true and
# no predicted samples as 1 instead of emitting a warning.
for preds in [y_pred_nb, y_pred_knn, y_pred_svm, y_pred_nn]:
    score = f1_score(ytest, preds, average='macro', zero_division=True)
    print(score)

0.2323908092348849
0.22522842565063447
0.49175651853534785
0.39856147306801937


In [None]:
# Micro-averaged F1 for the same four prediction arrays (NB, KNN, SVM, MLP order).
for preds in [y_pred_nb, y_pred_knn, y_pred_svm, y_pred_nn]:
    score = f1_score(ytest, preds, average='micro', zero_division=True)
    print(score)

0.5319773127322511
0.26601896830904465
0.8120529599756506
0.773371104815864


### Creating BERT based features

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')

In [None]:
# Pre-trained DistilBERT tokenizer and encoder; the encoder is moved to `device`.
# (A stray, undefined `num` argument was removed from the from_pretrained call.)
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
model = AutoModel.from_pretrained('distilbert-base-uncased').to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Full (unfiltered) ModApte split, used to compute embeddings for every document.
reuters = load_dataset("reuters21578","ModApte")

Downloading:   0%|          | 0.00/4.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.37k [00:00<?, ?B/s]

Downloading and preparing dataset reuters21578/ModApte (download: 7.77 MiB, generated: 12.48 MiB, post-processed: Unknown size, total: 20.25 MiB) to /root/.cache/huggingface/datasets/reuters21578/ModApte/1.0.0/98a2ad6a0242627562db83992f9625261854c40a88619322596153a5a16a206c...


Downloading:   0%|          | 0.00/8.15M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset reuters21578 downloaded and prepared to /root/.cache/huggingface/datasets/reuters21578/ModApte/1.0.0/98a2ad6a0242627562db83992f9625261854c40a88619322596153a5a16a206c. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Restore the default output format on every split.
# NOTE(review): presumably undoes the pandas format that get_data sets on the cached
# splits; confirm whether it is needed on a freshly loaded object.
reuters.reset_format()

In [None]:
def tokenize(batch):
    """Run the module-level `tokenizer` on the batch's 'text' field with padding and truncation enabled, and return its output."""
    texts = batch['text']
    encoded = tokenizer(texts, truncation=True, padding=True)
    return encoded

In [None]:
# Tokenise every split. batch_size=None hands each split to `tokenize` as a single
# batch, so padding=True pads all of its documents to one common length.
reuters_encoded =  reuters.map(tokenize, batched=True, batch_size=None)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
def forward_pass(batch):
    """Add a mean-pooled encoder representation to a tokenised batch.

    Feeds `input_ids` and `attention_mask` through the module-level `model` on
    `device` without tracking gradients. The last hidden state is then averaged
    over the sequence axis, ignoring positions whose attention mask is 0. The
    result is stored under batch['hidden_state'] and the same batch is returned.
    """
    ids_t = torch.tensor(batch['input_ids']).to(device)
    attn_t = torch.tensor(batch['attention_mask']).to(device)
    with torch.no_grad():
        hidden = model(ids_t, attn_t).last_hidden_state.cpu().numpy()
    # True marks the positions to exclude from the average (attention mask == 0).
    excluded = np.logical_not(np.array(batch['attention_mask']).astype(bool))
    # Copy the per-token flag across the hidden dimension so it matches `hidden`.
    excluded = np.repeat(excluded[:, :, np.newaxis], hidden.shape[-1], axis=2)
    pooled = np.ma.array(hidden, mask=excluded).mean(axis=1)
    batch['hidden_state'] = pooled.data
    return batch

In [None]:
# Run the encoder over every split in batches of 16. This adds the 'hidden_state'
# column that the next cell reads, so it must run on a fresh kernel.
reuters_encoded = reuters_encoded.map(forward_pass, batched=True, batch_size = 16)

In [None]:
# Collect the pooled hidden states of each split into NumPy matrices (one row per document).
bert_emb_xtrain = np.array(reuters_encoded['train']['hidden_state'])
bert_emb_xtest = np.array(reuters_encoded['test']['hidden_state'])

In [None]:
# Cache both embedding matrices in the Drive project folder, under the file names
# the feature-loading cells read back (bert_embedded_train.npy / bert_embedded_test.npy).
np.save(check/'bert_embedded_train.npy', bert_emb_xtrain)
np.save(check/'bert_embedded_test.npy', bert_emb_xtest)

In [None]:
# topics = pd.Series(reuters['train']['topics'])

### BERT Feature Based Models

- Decision Tree
- Random Forest
- AdaBoost

In [None]:
# Splits, labels and the indexes of the topic-less rows (needed to filter the cached embeddings).
xtrain,xtest,ytrain, ytest, empty_train,empty_test = get_data()

Reusing dataset reuters21578 (/root/.cache/huggingface/datasets/reuters21578/ModApte/1.0.0/98a2ad6a0242627562db83992f9625261854c40a88619322596153a5a16a206c)


  0%|          | 0/3 [00:00<?, ?it/s]

  "unknown class(es) {0} will be ignored".format(sorted(unknown, key=str))


In [None]:
# Load the cached embedding matrices from the Drive project folder.
xtest_bert = np.load(check/'bert_embedded_test.npy')
xtrain_bert = np.load(check/'bert_embedded_train.npy')

# Drop the rows of documents without topics so the features line up with the
# labels returned by get_data (which removed the same rows).
xtest_bert = np.delete(xtest_bert, empty_test, 0)
xtrain_bert = np.delete(xtrain_bert, empty_train, 0)

# Torch views of the label matrices (not used by the scikit-learn models below).
ytrain_dl = torch.from_numpy(ytrain)
ytest_dl = torch.from_numpy(ytest)

In [None]:
# Model Definition
dtree = DecisionTreeClassifier()
rftree = RandomForestClassifier(n_estimators=10)
adaboost = AdaBoostClassifier(n_estimators=10)

# Model wrt Multiclass
clf_dtree = OneVsRestClassifier(dtree)
clf_rf = OneVsRestClassifier(rftree,n_jobs=2)
clf_adab = OneVsRestClassifier(adaboost, n_jobs=2)

# # Model fitting
clf_dtree.fit(xtrain_bert, ytrain)
clf_rf.fit(xtrain_bert, ytrain)
clf_adab.fit(xtrain_bert, ytrain)

# Model Prediction
ytrain_dt = clf_dtree.predict(xtrain_bert)
ytrain_rf = clf_rf.predict(xtrain_bert)
ytrain_adab = clf_adab.predict(xtrain_bert)

# Save Outputs
np.save(f"{check}/ytrain_dt.npy",ytrain_dt)
np.save(f"{check}/ytrain_rf.npy", ytrain_rf)
np.save(f"{check}/ytrain_adab.npy", ytrain_adab)