## Text Classification Methods in NLP using Deep Learning
#### Using pre-trained word embeddings 

In [11]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

In [12]:
# Download the 20 Newsgroups archive through Keras' file helper and unpack it
# (untar=True); `data_path` is whatever location get_file reports back.
data_path = keras.utils.get_file(
    fname="news20.tar.gz",
    origin="http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.tar.gz",
    untar=True,
)

Downloading data from http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.tar.gz


In [13]:
import os
import pathlib

# The class folders live in "20_newsgroup", a sibling of the path returned by
# get_file; list them to check what was unpacked.
data_dir = pathlib.Path(data_path).parent.joinpath("20_newsgroup")
dirnames = os.listdir(data_dir)
print("Number of directories:", len(dirnames))
print("Directory names:", dirnames)

Number of directories: 20
Directory names: ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [14]:
# Read every file of every class directory into memory.
#   samples     -> list of document texts (one string per file)
#   labels      -> integer class id for each entry of `samples`
#   class_names -> directory names; position in the list is the class id
samples = []
labels = []
class_names = []
class_index = 0
# Sorting the directory names makes the name -> id mapping deterministic.
for dirname in sorted(os.listdir(data_dir)):
    class_names.append(dirname)
    dirpath = data_dir / dirname
    fnames = os.listdir(dirpath)
    print("Processing %s, %d files found" % (dirname, len(fnames)))
    for fname in fnames:
        fpath = dirpath / fname
        # Use a context manager so each handle is closed immediately instead of
        # leaving thousands of open files for the garbage collector.
        with open(fpath, encoding="latin-1") as f:
            content = f.read()
        # Drop the first 10 lines of every file (presumably the post's header
        # block -- confirm against the raw data) and keep the remainder.
        lines = content.split("\n")
        lines = lines[10:]
        content = "\n".join(lines)
        samples.append(content)
        labels.append(class_index)
    class_index += 1

print("Classes:", class_names)
print("Number of samples:", len(samples))

Processing alt.atheism, 1000 files found
Processing comp.graphics, 1000 files found
Processing comp.os.ms-windows.misc, 1000 files found
Processing comp.sys.ibm.pc.hardware, 1000 files found
Processing comp.sys.mac.hardware, 1000 files found
Processing comp.windows.x, 1000 files found
Processing misc.forsale, 1000 files found
Processing rec.autos, 1000 files found
Processing rec.motorcycles, 1000 files found
Processing rec.sport.baseball, 1000 files found
Processing rec.sport.hockey, 1000 files found
Processing sci.crypt, 1000 files found
Processing sci.electronics, 1000 files found
Processing sci.med, 1000 files found
Processing sci.space, 1000 files found
Processing soc.religion.christian, 997 files found
Processing talk.politics.guns, 1000 files found
Processing talk.politics.mideast, 1000 files found
Processing talk.politics.misc, 1000 files found
Processing talk.religion.misc, 1000 files found
Classes: ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.ha

In [16]:
# Shuffle texts and labels in place. A fresh RandomState with the same seed is
# created for each list so both receive the identical permutation and stay aligned.
seed = 1337
for paired_list in (samples, labels):
    rng = np.random.RandomState(seed)
    rng.shuffle(paired_list)

# Hold out the trailing `validation_split` fraction for validation.
validation_split = 0.2
num_validation_samples = int(validation_split * len(samples))
train_part = slice(None, -num_validation_samples)
val_part = slice(-num_validation_samples, None)
train_samples, val_samples = samples[train_part], samples[val_part]
train_labels, val_labels = labels[train_part], labels[val_part]

In [17]:
# Build the vocabulary index from the training texts only.
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Keep at most 20000 tokens; every vectorized document comes out with length 200.
vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200)
text_ds = tf.data.Dataset.from_tensor_slices(train_samples)
text_ds = text_ds.batch(128)
vectorizer.adapt(text_ds)

In [18]:
# Peek at the first five vocabulary entries (the recorded output starts with
# '' and '[UNK]', followed by the most frequent words).
vectorizer.get_vocabulary()[:5]

['', '[UNK]', 'the', 'to', 'of']

In [19]:
# Map every vocabulary token to its integer position in the vectorizer's vocabulary.
voc = vectorizer.get_vocabulary()
word_index = {token: position for position, token in enumerate(voc)}

In [21]:
#load pretrained word embeddings 
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

--2021-03-28 21:00:20--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2021-03-28 21:00:20--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-03-28 21:00:20--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: 'glove.6B.zip'

     0K

 12050K .......... .......... .......... .......... ..........  1% 5.05M 3m27s
 12100K .......... .......... .......... .......... ..........  1% 2.75M 3m27s
 12150K .......... .......... .......... .......... ..........  1% 12.3M 3m27s
 12200K .......... .......... .......... .......... ..........  1% 4.91M 3m26s
 12250K .......... .......... .......... .......... ..........  1% 5.23M 3m26s
 12300K .......... .......... .......... .......... ..........  1% 6.08M 3m26s
 12350K .......... .......... .......... .......... ..........  1% 6.06M 3m26s
 12400K .......... .......... .......... .......... ..........  1% 6.54M 3m25s
 12450K .......... .......... .......... .......... ..........  1% 2.20M 3m26s
 12500K .......... .......... .......... .......... ..........  1% 4.36M 3m26s
 12550K .......... .......... .......... .......... ..........  1% 5.64M 3m25s
 12600K .......... .......... .......... .......... ..........  1% 3.95M 3m25s
 12650K .......... .......... .......... .......... 


 49200K .......... .......... .......... .......... ..........  5% 2.13M 3m13s
 49250K .......... .......... .......... .......... ..........  5%  162M 3m13s
 49300K .......... .......... .......... .......... ..........  5% 4.78M 3m13s
 49350K .......... .......... .......... .......... ..........  5% 95.1M 3m13s
 49400K .......... .......... .......... .......... ..........  5% 2.09M 3m13s
 49450K .......... .......... .......... .......... ..........  5% 2.50M 3m13s
 49500K .......... .......... .......... .......... ..........  5% 6.35M 3m13s
 49550K .......... .......... .......... .......... ..........  5% 4.21M 3m13s
 49600K .......... .......... .......... .......... ..........  5% 6.14M 3m13s
 49650K .......... .......... .......... .......... ..........  5% 7.98M 3m13s
 49700K .......... .......... .......... .......... ..........  5% 2.54M 3m13s
 49750K .......... .......... .......... .......... ..........  5%  130M 3m13s
 49800K .......... .......... .......... ..........

173900K .......... .......... .......... .......... .......... 20% 1.05M 2m28s
173950K .......... .......... .......... .......... .......... 20% 5.46M 2m28s
174000K .......... .......... .......... .......... .......... 20% 4.03M 2m28s
174050K .......... .......... .......... .......... .......... 20% 5.93M 2m28s
174100K .......... .......... .......... .......... .......... 20% 6.49M 2m28s
174150K .......... .......... .......... .......... .......... 20% 6.27M 2m28s
174200K .......... .......... .......... .......... .......... 20% 6.24M 2m28s
174250K .......... .......... .......... .......... .......... 20% 5.77M 2m27s
174300K .......... .......... .......... .......... .......... 20% 2.51M 2m28s
174350K .......... .......... .......... .......... .......... 20% 7.25M 2m27s
174400K .......... .......... .......... .......... .......... 20% 4.70M 2m27s
174450K .......... .......... .......... .......... .......... 20% 6.19M 2m27s
174500K .......... .......... .......... .......... 

195000K .......... .......... .......... .......... .......... 23% 5.17M 2m22s
195050K .......... .......... .......... .......... .......... 23% 4.25M 2m22s
195100K .......... .......... .......... .......... .......... 23% 3.33M 2m22s
195150K .......... .......... .......... .......... .......... 23% 5.64M 2m22s
195200K .......... .......... .......... .......... .......... 23% 6.87M 2m22s
195250K .......... .......... .......... .......... .......... 23% 5.72M 2m22s
195300K .......... .......... .......... .......... .......... 23% 5.88M 2m22s
195350K .......... .......... .......... .......... .......... 23% 5.57M 2m22s
195400K .......... .......... .......... .......... .......... 23% 6.56M 2m22s
195450K .......... .......... .......... .......... .......... 23% 6.18M 2m22s
195500K .......... .......... .......... .......... .......... 23% 6.75M 2m22s
195550K .......... .......... .......... .......... .......... 23% 5.78M 2m22s
195600K .......... .......... .......... .......... 

299000K .......... .......... .......... .......... .......... 35% 4.32M 1m58s
299050K .......... .......... .......... .......... .......... 35% 4.02M 1m58s
299100K .......... .......... .......... .......... .......... 35% 6.98M 1m58s
299150K .......... .......... .......... .......... .......... 35% 3.88M 1m58s
299200K .......... .......... .......... .......... .......... 35% 7.32M 1m58s
299250K .......... .......... .......... .......... .......... 35% 6.59M 1m58s
299300K .......... .......... .......... .......... .......... 35% 5.74M 1m58s
299350K .......... .......... .......... .......... .......... 35% 3.63M 1m58s
299400K .......... .......... .......... .......... .......... 35% 5.53M 1m58s
299450K .......... .......... .......... .......... .......... 35% 5.92M 1m58s
299500K .......... .......... .......... .......... .......... 35% 7.22M 1m58s
299550K .......... .......... .......... .......... .......... 35% 4.23M 1m58s
299600K .......... .......... .......... .......... 

324000K .......... .......... .......... .......... .......... 38% 2.29M 1m55s
324050K .......... .......... .......... .......... .......... 38% 4.28M 1m55s
324100K .......... .......... .......... .......... .......... 38% 1.47M 1m55s
324150K .......... .......... .......... .......... .......... 38% 1.32M 1m55s
324200K .......... .......... .......... .......... .......... 38% 4.15M 1m55s
324250K .......... .......... .......... .......... .......... 38% 4.25M 1m55s
324300K .......... .......... .......... .......... .......... 38% 4.03M 1m55s
324350K .......... .......... .......... .......... .......... 38% 1.41M 1m55s
324400K .......... .......... .......... .......... .......... 38% 4.36M 1m55s
324450K .......... .......... .......... .......... .......... 38% 3.92M 1m55s
324500K .......... .......... .......... .......... .......... 38% 4.05M 1m55s
324550K .......... .......... .......... .......... .......... 38% 6.37M 1m55s
324600K .......... .......... .......... .......... 

348950K .......... .......... .......... .......... .......... 41% 6.69M 1m51s
349000K .......... .......... .......... .......... .......... 41% 5.16M 1m51s
349050K .......... .......... .......... .......... .......... 41% 2.94M 1m51s
349100K .......... .......... .......... .......... .......... 41% 4.07M 1m51s
349150K .......... .......... .......... .......... .......... 41% 4.97M 1m51s
349200K .......... .......... .......... .......... .......... 41% 5.52M 1m51s
349250K .......... .......... .......... .......... .......... 41% 6.74M 1m51s
349300K .......... .......... .......... .......... .......... 41% 6.40M 1m51s
349350K .......... .......... .......... .......... .......... 41% 4.86M 1m51s
349400K .......... .......... .......... .......... .......... 41% 6.60M 1m51s
349450K .......... .......... .......... .......... .......... 41% 5.93M 1m51s
349500K .......... .......... .......... .......... .......... 41% 5.99M 1m51s
349550K .......... .......... .......... .......... 

473900K .......... .......... .......... .......... .......... 56% 5.96M 83s
473950K .......... .......... .......... .......... .......... 56% 1.91M 83s
474000K .......... .......... .......... .......... .......... 56% 5.66M 83s
474050K .......... .......... .......... .......... .......... 56% 5.62M 83s
474100K .......... .......... .......... .......... .......... 56% 5.92M 83s
474150K .......... .......... .......... .......... .......... 56% 6.34M 83s
474200K .......... .......... .......... .......... .......... 56% 5.29M 83s
474250K .......... .......... .......... .......... .......... 56% 7.43M 83s
474300K .......... .......... .......... .......... .......... 56% 5.91M 83s
474350K .......... .......... .......... .......... .......... 56% 6.88M 83s
474400K .......... .......... .......... .......... .......... 56% 5.53M 83s
474450K .......... .......... .......... .......... .......... 56% 3.68M 83s
474500K .......... .......... .......... .......... .......... 56% 4.32M 83s

498900K .......... .......... .......... .......... .......... 59% 5.61M 77s
498950K .......... .......... .......... .......... .......... 59% 7.31M 77s
499000K .......... .......... .......... .......... .......... 59% 6.39M 77s
499050K .......... .......... .......... .......... .......... 59% 1.83M 77s
499100K .......... .......... .......... .......... .......... 59% 1.84M 77s
499150K .......... .......... .......... .......... .......... 59% 6.03M 77s
499200K .......... .......... .......... .......... .......... 59% 5.55M 77s
499250K .......... .......... .......... .......... .......... 59% 4.15M 77s
499300K .......... .......... .......... .......... .......... 59% 3.67M 77s
499350K .......... .......... .......... .......... .......... 59% 3.79M 77s
499400K .......... .......... .......... .......... .......... 59% 3.53M 77s
499450K .......... .......... .......... .......... .......... 59% 1.53M 77s
499500K .......... .......... .......... .......... .......... 59%  643K 77s

523850K .......... .......... .......... .......... .......... 62% 7.34M 71s
523900K .......... .......... .......... .......... .......... 62% 3.81M 71s
523950K .......... .......... .......... .......... .......... 62% 2.12M 71s
524000K .......... .......... .......... .......... .......... 62% 4.59M 71s
524050K .......... .......... .......... .......... .......... 62% 3.04M 71s
524100K .......... .......... .......... .......... .......... 62% 5.68M 71s
524150K .......... .......... .......... .......... .......... 62% 3.98M 71s
524200K .......... .......... .......... .......... .......... 62% 2.33M 71s
524250K .......... .......... .......... .......... .......... 62% 4.73M 71s
524300K .......... .......... .......... .......... .......... 62% 3.61M 71s
524350K .......... .......... .......... .......... .......... 62% 10.1M 71s
524400K .......... .......... .......... .......... .......... 62% 7.32M 71s
524450K .......... .......... .......... .......... .......... 62% 6.06M 71s

673850K .......... .......... .......... .......... .......... 80% 4.77M 39s
673900K .......... .......... .......... .......... .......... 80% 5.97M 39s
673950K .......... .......... .......... .......... .......... 80% 8.53M 39s
674000K .......... .......... .......... .......... .......... 80% 2.09M 39s
674050K .......... .......... .......... .......... .......... 80%  104M 39s
674100K .......... .......... .......... .......... .......... 80% 3.09M 39s
674150K .......... .......... .......... .......... .......... 80% 7.41M 39s
674200K .......... .......... .......... .......... .......... 80% 4.72M 39s
674250K .......... .......... .......... .......... .......... 80% 5.88M 39s
674300K .......... .......... .......... .......... .......... 80% 5.93M 39s
674350K .......... .......... .......... .......... .......... 80% 5.37M 39s
674400K .......... .......... .......... .......... .......... 80% 6.62M 39s
674450K .......... .......... .......... .......... .......... 80% 6.56M 39s

709150K .......... .......... .......... .......... .......... 84% 5.95M 30s
709200K .......... .......... .......... .......... .......... 84% 1.59M 30s
709250K .......... .......... .......... .......... .......... 84% 6.46M 30s
709300K .......... .......... .......... .......... .......... 84% 5.84M 30s
709350K .......... .......... .......... .......... .......... 84% 4.58M 30s
709400K .......... .......... .......... .......... .......... 84% 6.38M 30s
709450K .......... .......... .......... .......... .......... 84% 5.59M 30s
709500K .......... .......... .......... .......... .......... 84% 7.35M 30s
709550K .......... .......... .......... .......... .......... 84% 5.28M 30s
709600K .......... .......... .......... .......... .......... 84% 2.37M 30s
709650K .......... .......... .......... .......... .......... 84%  111M 30s
709700K .......... .......... .......... .......... .......... 84% 79.9M 30s
709750K .......... .......... .......... .......... .......... 84% 4.38M 30s

In [24]:
# Build a dict mapping each GloVe word (string) to its NumPy vector.
path_to_glove_file = os.path.join(
    os.path.expanduser("~"), ".keras/datasets/glove.6B.100d.txt"
)

embeddings_index = {}
with open(path_to_glove_file, encoding="utf8") as glove_file:
    for entry in glove_file:
        # Each line is "<word> <c1> <c2> ..."; split off the word once and
        # parse the remaining space-separated text as float32 coefficients.
        token, vector_text = entry.split(maxsplit=1)
        embeddings_index[token] = np.fromstring(vector_text, "f", sep=" ")

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [25]:
embedding_dim = 100
num_tokens = len(voc) + 2
hits, misses = 0, 0

# Prepare the embedding matrix: row i holds the GloVe vector of vocabulary token i.
# Rows start as zeros, so tokens missing from the GloVe index (including the
# padding and OOV entries) keep an all-zero representation.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for token, row in word_index.items():
    embedding_vector = embeddings_index.get(token)
    if embedding_vector is None:
        misses += 1
        continue
    embedding_matrix[row] = embedding_vector
    hits += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 18019 words (1981 misses)


In [26]:
# Load the pre-trained embedding matrix into an Embedding layer.
from tensorflow.keras.layers import Embedding

# Initialise the weights from the GloVe matrix and freeze them (trainable=False)
# so training does not update the pre-trained vectors.
pretrained_init = keras.initializers.Constant(embedding_matrix)
embedding_layer = Embedding(
    input_dim=num_tokens,
    output_dim=embedding_dim,
    embeddings_initializer=pretrained_init,
    trainable=False,
)

In [27]:
# Build a simple 1D convnet with global max pooling and a classifier at the end.
from tensorflow.keras import layers

int_sequences_input = keras.Input(shape=(None,), dtype="int64")
embedded_sequences = embedding_layer(int_sequences_input)

# Two identical Conv1D -> MaxPooling1D stages.
x = embedded_sequences
for stage in range(2):
    x = layers.Conv1D(128, 5, activation="relu")(x)
    x = layers.MaxPooling1D(5)(x)

# Final convolution, reduced to one vector per document by global max pooling.
x = layers.Conv1D(128, 5, activation="relu")(x)
x = layers.GlobalMaxPooling1D()(x)

# Classifier head: dense layer, dropout, then a softmax over the classes.
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)
preds = layers.Dense(len(class_names), activation="softmax")(x)

model = keras.Model(int_sequences_input, preds)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 100)         2000200   
_________________________________________________________________
conv1d (Conv1D)              (None, None, 128)         64128     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, None, 128)         0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, None, 128)         82048     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, None, 128)         0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, None, 128)         82048 

In [28]:
# Model training: convert the list-of-strings data to NumPy arrays of integer
# indices. The arrays are right-padded.
def _to_token_ids(texts):
    """Vectorize an iterable of strings and return the result as a NumPy array."""
    # The vectorizer is fed one string per row, hence the single-element inner lists.
    text_column = np.array([[text] for text in texts])
    return vectorizer(text_column).numpy()


x_train = _to_token_ids(train_samples)
x_val = _to_token_ids(val_samples)

y_train, y_val = np.array(train_labels), np.array(val_labels)

In [29]:
# Labels are plain integers and the output layer is a softmax, so the loss is
# sparse_categorical_crossentropy.
model.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["acc"],
)
model.fit(
    x_train,
    y_train,
    batch_size=128,
    epochs=20,
    validation_data=(x_val, y_val),
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x2113404ec70>

In [68]:
# Vectorize the whole (shuffled) corpus -- training and validation documents
# alike -- and run the trained model on it.
full_samples = vectorizer(np.array([[s] for s in samples])).numpy()
# NOTE: this rebinds `preds`, which previously named the model's symbolic
# softmax output; from here on it holds the per-document prediction array.
preds = model.predict(full_samples)

In [65]:
#clustering 
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import normalized_mutual_info_score as nmi_score 
import statistics 

In [66]:
# k-means with 20 clusters (one per newsgroup class loaded above), random
# initialisation and 20 restarts. random_state reuses the notebook's `seed`
# so the cluster assignments -- and the NMI computed from them -- are
# reproducible across runs instead of changing with every execution.
km = KMeans(n_clusters=20, init="random", n_init=20, random_state=seed)

In [71]:
# Fit k-means on the model's softmax outputs and display the cluster id
# assigned to each document.
km.fit_predict(preds)

array([ 3, 17, 11, ..., 10,  5,  0])

In [72]:
# Normalized mutual information between the true class labels and the k-means
# cluster assignments. `labels` was shuffled with the same seed as `samples`,
# so it lines up row-for-row with the predictions that were clustered.
print(nmi_score(labels,km.labels_))

0.8578260737102869
