# 🎯 Learning Objectives

> - Document Classification
> - Text Pre-processing
> - Feature extraction
> - Vocabulary creation

# 📋 Vocabulary & Feature Extraction
> Given a document, you can represent it as a vector of dimension V, where V corresponds to your vocabulary size. As V gets larger, the vector becomes more sparse. Furthermore, we end up with many more features and have to train a lot more parameters. This can result in longer training and prediction times.

# 🔨 Preprocessing
When preprocessing, you have to perform the following:
> 1. Eliminate handles and URLs
> 2. Tokenize the string into words
> 3. Remove stop words like "and, is, a, on, etc."
> 4. Stemming - convert every word to its stem. For example, dancer, dancing, and danced all become 'danc'.
> 5. Convert all your words to lower case.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

# Newsgroup topics kept for the classification task (passed to
# fetch_20newsgroups as the `categories` filter).
categories = [
    "alt.atheism",
    "talk.religion.misc",
    "comp.graphics",
    "sci.space",
]

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import Normalizer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

import tensorflow as tf

from tensorflow.keras import layers
import tensorflow_datasets as tfds

In [None]:
# Load the training split, restricted to the selected newsgroups.
data_train = fetch_20newsgroups(
    subset="train",
    categories=categories,
    shuffle=True,
    random_state=42,
)
labels = data_train.target
true_k = len(np.unique(labels))  # number of distinct classes in the labels
n_components = 5

# Convert to TF-IDF format (English stop words removed; max_df / min_df
# bound the document frequency of the terms that are kept).
vectorizer = TfidfVectorizer(
    max_df=0.5,
    min_df=2,
    stop_words="english",
    use_idf=True,
)
X_train = vectorizer.fit_transform(data_train.data)

# Reduce dimensions -- the LSA pipeline itself is currently disabled, only
# its two building blocks are instantiated.
svd = TruncatedSVD(n_components=n_components)
normalizer = Normalizer(copy=False)
# lsa = make_pipeline(svd, normalizer)

# X_train = lsa.fit_transform(X_train)

In [None]:
# Number of documents in the training split
len(data_train.data)

In [None]:
# How many training documents carry each target label
pd.DataFrame(data_train.target).value_counts()

In [None]:
# Load the held-out split with the same category filter as the training set.
# Order of labels in `target_names` can be different from `categories`
data_test = fetch_20newsgroups(
    subset="test",
    categories=categories,
    shuffle=True,
    random_state=42,
)

target_names = data_train.target_names

# Label vectors for the train and test splits
y_train = data_train.target
y_test = data_test.target

# Only `transform` here: the vocabulary and IDF weights come from the
# training data the vectorizer was fitted on.
print("Extracting features from the test data using the same vectorizer")
X_test = vectorizer.transform(data_test.data)
# X_test = lsa.fit_transform(X_test)

# 🤖 Machine Learning

# ✔️ Logistic Regression

In [None]:
# Fit a logistic-regression baseline on the TF-IDF features.
# LogisticRegression accepts scipy sparse matrices directly, so the TF-IDF
# matrix is passed as-is instead of being expanded with `.toarray()` into a
# dense (n_documents x vocabulary_size) array.
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)

In [None]:
# Accuracy on the training split. The sparse TF-IDF matrices are passed
# straight to `predict`; LogisticRegression handles sparse input, so no dense
# copy is needed.
lr_pred = lr_clf.predict(X_train)
train_score = accuracy_score(y_train, lr_pred) * 100
print(f"Train accuracy score: {train_score:.2f}%")

# Accuracy on the held-out split. `lr_pred` is deliberately left holding the
# test predictions: the confusion-matrix and report cells reuse it.
lr_pred = lr_clf.predict(X_test)
test_score = accuracy_score(y_test, lr_pred) * 100
print(f"Test accuracy score: {test_score:.2f}%")

In [None]:
# Confusion matrix of the logistic-regression predictions on the test set.
# `lr_pred` is expected to hold the test predictions from the scoring cell:
# lr_pred = lr_clf.predict(X_test.toarray())
fig, ax = plt.subplots(figsize=(10, 10))

cm = confusion_matrix(y_test, lr_pred)
disp = ConfusionMatrixDisplay(cm, display_labels=data_train.target_names)

# Draw onto the prepared axes; tick labels are rotated so the long newsgroup
# names do not overlap.
disp = disp.plot(ax=ax, xticks_rotation="vertical", cmap="summer")

plt.show()

In [None]:
# Classification report for the logistic-regression test predictions, as a
# DataFrame transposed so that each report entry becomes a row.
pd.DataFrame(classification_report(y_test, lr_pred, output_dict=True)).T

# ✔️ Naive Bayes

In [None]:
# Fit a Gaussian Naive Bayes classifier on the TF-IDF features.
# The sparse matrix is converted with `.toarray()` because GaussianNB works
# on dense arrays.
nb_clf = GaussianNB()
nb_clf.fit(X_train.toarray(), y_train)

In [None]:
# Accuracy of the Naive Bayes model on the training split (as a percentage)
nb_pred = nb_clf.predict(X_train.toarray())
train_score = 100 * accuracy_score(y_train, nb_pred)
print(f"Train accuracy score: {train_score:.2f}%")

# ... and on the held-out split; `nb_pred` ends up holding the test
# predictions, which the following cells reuse.
nb_pred = nb_clf.predict(X_test.toarray())
test_score = 100 * accuracy_score(y_test, nb_pred)
print(f"Test accuracy score: {test_score:.2f}%")

In [None]:
# Confusion matrix of the Naive Bayes predictions on the test set.
# `nb_pred` is expected to hold the test predictions from the scoring cell:
# nb_pred = nb_clf.predict(X_test.toarray())
fig, ax = plt.subplots(figsize=(10, 10))

cm = confusion_matrix(y_test, nb_pred)
disp = ConfusionMatrixDisplay(cm, display_labels=data_train.target_names)

# Draw onto the prepared axes; tick labels are rotated so the long newsgroup
# names do not overlap.
disp = disp.plot(ax=ax, xticks_rotation="vertical", cmap="summer")

plt.show()

In [None]:
# Classification report for the Naive Bayes test predictions, as a DataFrame
# transposed so that each report entry becomes a row.
pd.DataFrame(classification_report(y_test, nb_pred, output_dict=True)).T

# ✔️ Support Vector Machine

In [None]:
# Fit a support vector classifier with scikit-learn's default settings on
# the densified TF-IDF features.
# NOTE(review): the later cells also call predict() with dense arrays; keep
# the fit/predict input formats consistent if this is ever switched to sparse.
svm_clf = SVC()
svm_clf.fit(X_train.toarray(), y_train)

In [None]:
# Accuracy of the SVM on the training split (as a percentage)
svm_pred = svm_clf.predict(X_train.toarray())
train_score = 100 * accuracy_score(y_train, svm_pred)
print(f"Train accuracy score: {train_score:.2f}%")

# ... and on the held-out split; `svm_pred` ends up holding the test
# predictions.
svm_pred = svm_clf.predict(X_test.toarray())
test_score = 100 * accuracy_score(y_test, svm_pred)
print(f"Test accuracy score: {test_score:.2f}%")

In [None]:
# Confusion matrix of the SVM predictions on the test set.
# `svm_pred` already holds the test predictions from the scoring cell, so the
# (expensive) prediction is not recomputed here -- same convention as the
# logistic-regression and Naive Bayes cells:
# svm_pred = svm_clf.predict(X_test.toarray())
cm = confusion_matrix(y_test, svm_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=data_train.target_names)


# Draw onto an explicit 10x10 figure; tick labels are rotated so the long
# newsgroup names do not overlap.
fig, ax = plt.subplots(figsize=(10, 10))
disp = disp.plot(xticks_rotation='vertical', ax=ax, cmap='summer')

plt.show()

In [None]:
# Classification report for the SVM test predictions, as a DataFrame
# transposed so that each report entry becomes a row.
pd.DataFrame(classification_report(y_test, svm_pred, output_dict=True)).T

# ✔️ Convolutional Neural Networks - CNNs

In [None]:
# Reload both splits: from here on X_train / X_test hold the raw document
# strings (not TF-IDF matrices), which is what the tokenizer consumes.
data_train = fetch_20newsgroups(
    subset="train",
    categories=categories,
    shuffle=True,
    random_state=42,
)
data_test = fetch_20newsgroups(
    subset="test",
    categories=categories,
    shuffle=True,
    random_state=42,
)

X_train, y_train = data_train.data, data_train.target
X_test, y_test = data_test.data, data_test.target

# Sanity check: one label per document in each split
assert len(X_train) == len(y_train)
assert len(X_test) == len(y_test)

## Tokenization

In [None]:
# The subword text encoders were moved from `tfds.features.text` to
# `tfds.deprecated.text` in newer tensorflow_datasets releases. Prefer the new
# location and fall back to the old one so the cell runs on both.
try:
    _text_encoders = tfds.deprecated.text
except AttributeError:
    _text_encoders = tfds.features.text

# Build the subword vocabulary from the training documents only.
tokenizer = _text_encoders.SubwordTextEncoder.build_from_corpus(
    X_train, target_vocab_size=2**18
)

# Encode every document as a list of integer subword ids.
train_inputs = [tokenizer.encode(text) for text in X_train]
test_inputs = [tokenizer.encode(text) for text in X_test]

## Padding

In [None]:
# Pad every encoded document to one common length so they can be batched.
# The original referenced an undefined `data_inputs`; the length is taken
# from the encoded training documents. Test documents longer than MAX_LEN are
# truncated by pad_sequences (its default truncation applies).
MAX_LEN = max(len(sentence) for sentence in train_inputs)
train_inputs = tf.keras.preprocessing.sequence.pad_sequences(train_inputs,
                                                             value=0,
                                                             padding="post",
                                                             maxlen=MAX_LEN)

test_inputs = tf.keras.preprocessing.sequence.pad_sequences(test_inputs,
                                                            value=0,
                                                            padding="post",
                                                            maxlen=MAX_LEN)

In [None]:
class DCNN(tf.keras.Model):
    """Text classifier built from parallel 1-D convolutions over embeddings.

    Token ids are embedded, then passed through three convolution branches
    with kernel sizes 2, 3 and 4. Each branch is global-max-pooled, the
    pooled vectors are concatenated and fed through a dense layer, dropout
    and a final classification layer.

    Args:
        vocab_size: Number of distinct token ids (size of the embedding table).
        emb_dim: Dimension of the token embeddings.
        nb_filters: Number of filters in each convolution branch.
        FFN_units: Width of the hidden dense layer.
        nb_classes: Number of target classes. For 2 the model ends in a
            single sigmoid unit, otherwise in a `nb_classes`-way softmax.
        dropout_rate: Dropout rate applied after the hidden dense layer.
        training: Unused; kept only so existing callers that pass it keep
            working.
        name: Name given to the Keras model.
    """

    def __init__(self, vocab_size, emb_dim=128, nb_filters=50, FFN_units=512, nb_classes=2,
                 dropout_rate=0.1, training=False, name="dcnn"):
        super().__init__(name=name)

        self.embedding = layers.Embedding(vocab_size, emb_dim)
        self.bigram = layers.Conv1D(filters=nb_filters, kernel_size=2, padding="valid", activation="relu")
        self.trigram = layers.Conv1D(filters=nb_filters, kernel_size=3, padding="valid", activation="relu")
        self.fourgram = layers.Conv1D(filters=nb_filters, kernel_size=4, padding="valid", activation="relu")
        self.pool = layers.GlobalMaxPool1D() # no training variable so we can
                                             # use the same layer for each
                                             # pooling step
        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if nb_classes == 2:
            # Binary case: one sigmoid unit
            self.last_dense = layers.Dense(units=1, activation="sigmoid")
        else:
            # Multi-class case: one softmax unit per class
            self.last_dense = layers.Dense(units=nb_classes, activation="softmax")

    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
            inputs: Batch of token-id sequences fed to the embedding layer.
            training: Flag forwarded to the dropout layer. Defaults to None
                so the model can also be invoked without an explicit flag.

        Returns:
            The output of the final dense layer (sigmoid or softmax scores).
        """
        x = self.embedding(inputs)

        # One convolution + global max pool per n-gram width
        x_1 = self.pool(self.bigram(x))
        x_2 = self.pool(self.trigram(x))
        x_3 = self.pool(self.fourgram(x))

        merged = tf.concat([x_1, x_2, x_3], axis=-1) # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training=training)
        output = self.last_dense(merged)

        return output

In [None]:
# --- Model hyper-parameters ---
VOCAB_SIZE = tokenizer.vocab_size  # vocabulary size reported by the tokenizer
EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
DROPOUT_RATE = 0.2
NB_CLASSES = len(set(y_train))  # distinct labels in the training targets

# --- Training settings ---
BATCH_SIZE = 32
NB_EPOCHS = 5

# Instantiate the network with the settings above.
Dcnn = DCNN(
    vocab_size=VOCAB_SIZE,
    emb_dim=EMB_DIM,
    nb_filters=NB_FILTERS,
    FFN_units=FFN_UNITS,
    nb_classes=NB_CLASSES,
    dropout_rate=DROPOUT_RATE,
)

In [None]:
# Binary problems use binary cross-entropy with plain accuracy; otherwise the
# integer-label ("sparse") categorical loss and metric are used.
Dcnn.compile(
    loss=("binary_crossentropy" if NB_CLASSES == 2
          else "sparse_categorical_crossentropy"),
    optimizer="adam",
    metrics=["accuracy" if NB_CLASSES == 2 else "sparse_categorical_accuracy"],
)

In [None]:
# Directory where checkpoints are written (the path looks like a mounted
# Google Drive folder -- adjust for other environments).
checkpoint_path = "./drive/My Drive/projects/CNN_for_NLP/ckpt/"

# Track the model under the key "Dcnn" and keep at most the 5 newest
# checkpoints in the directory.
ckpt = tf.train.Checkpoint(Dcnn=Dcnn)
ckpt_manager = tf.train.CheckpointManager(
    checkpoint=ckpt,
    directory=checkpoint_path,
    max_to_keep=5,
)

# Resume from the most recent checkpoint when one exists.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!!")

In [None]:
# Train on the padded training sequences, then write a checkpoint.
Dcnn.fit(
    x=train_inputs,
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=NB_EPOCHS,
)
ckpt_manager.save()

In [None]:
# Evaluate on the padded test sequences and print what `evaluate` returns
# (the loss followed by the compiled metric).
results = Dcnn.evaluate(
    x=test_inputs,
    y=y_test,
    batch_size=BATCH_SIZE,
)
print(results)