This recipe initially follows the scikit-learn tutorial on working with text data: https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [27]:
from sklearn.datasets import fetch_20newsgroups

In [28]:
# Only these four newsgroups are loaded.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Training split, shuffled with a fixed seed.
twenty_train = fetch_20newsgroups(
    subset='train', categories=categories, shuffle=True, random_state=42
)

It's a small dataset: the four selected categories contain only a couple of thousand training documents.

In [6]:
# Count the file names in the training split.
len(twenty_train.filenames)

2257

In [7]:
# Show the first three lines of the first training document.
first_lines = twenty_train.data[0].split("\n")[:3]
print(*first_lines, sep="\n")

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton


# Using a bag-of-words approach with a classifier

In [30]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier

# Bag-of-words pipeline: raw text -> token counts -> tf-idf weights -> random forest.
text_clf = Pipeline([
  ('vect', CountVectorizer()),
  ('tfidf', TfidfTransformer()),
  # Fixed seed (same value as the dataset shuffle) so that repeated runs
  # build the same forest and report the same accuracy.
  ('clf', RandomForestClassifier(random_state=42)),
])

In [31]:
# Fit all three pipeline stages (counts -> tf-idf -> random forest) on the raw training texts.
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                

In [32]:
import numpy as np
# Held-out split, restricted to the same categories as the training data.
twenty_test = fetch_20newsgroups(
    subset='test', categories=categories, shuffle=True, random_state=42
)

# Accuracy: fraction of test documents whose predicted label matches the target.
predicted = text_clf.predict(twenty_test.data)
(predicted == twenty_test.target).mean()

0.8055925432756325

# Using a word embedding with a classifier

In [2]:
# Colab only: mount Google Drive at /gdrive; the embedding file used by the
# following cells is stored under '/gdrive/My Drive/embeddings/'.
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [2]:
!pip install wget
import wget
wget.download(
    'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.vec',
    '/gdrive/My Drive/embeddings/wiki.en.vec'
)



'/gdrive/My Drive/embeddings/wiki.en.vec'

In [5]:
# Confirm that the downloaded vectors file is present on Drive.
!ls /gdrive/My\ Drive/embeddings/wiki.en.vec

'/gdrive/My Drive/embeddings/wiki.en.vec'


In [10]:
from gensim.models import KeyedVectors
# An import statement names the function; it must not be followed by call parentheses.
from gensim.models.fasttext import load_facebook_vectors


# wiki.en.vec is read as a text-format (binary=False) word2vec file.
# The result is a KeyedVectors object, so words are looked up on `model` directly.
model = KeyedVectors.load_word2vec_format(
    '/gdrive/My Drive/embeddings/wiki.en.vec',
    binary=False, encoding='utf8'
)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [62]:
import numpy as np
from tensorflow.keras.preprocessing.text import text_to_word_sequence

def embed_text(text: str) -> np.ndarray:
  """Embed a text as the mean of the word vectors of its known tokens.

  The text is tokenized with ``text_to_word_sequence``; tokens that are not in
  the vocabulary of the module-level ``model`` (the loaded KeyedVectors) are
  skipped.

  Args:
    text: the raw document.

  Returns:
    An array of shape ``(1, model.vector_size)``: the element-wise mean of the
    token vectors, or all zeros when no token is in the vocabulary.
  """
  # Take the width from the model instead of hard-coding 300.
  dim = model.vector_size
  # `model` is a KeyedVectors, so it is indexed directly; going through its
  # deprecated `.wv` alias only produces a deprecation warning per lookup.
  vectors = [model[w] for w in text_to_word_sequence(text) if w in model]
  if not vectors:
    return np.zeros(shape=(1, dim))
  # Stack as (dim, n_tokens) and average over the token axis.
  return np.mean(np.stack(vectors, axis=1), axis=1).reshape(1, dim)


# Smoke test: a short text is embedded as a single row vector.
embed_text('training run').shape

  import sys


(1, 300)

In [63]:
# One embedded row per training document, joined along the first axis.
train_transformed = np.concatenate(list(map(embed_text, twenty_train.data)))

  import sys


In [64]:
# Shape check: one row per training document.
train_transformed.shape

(2257, 300)

In [65]:
# Random forest on the averaged word vectors; the fixed seed (same value as the
# dataset shuffle) makes the fitted forest and its accuracy repeatable.
rf = RandomForestClassifier(random_state=42).fit(train_transformed, twenty_train.target)

In [66]:
# One embedded row per test document, joined along the first axis.
test_transformed = np.concatenate(list(map(embed_text, twenty_test.data)))

  import sys


In [67]:
# Accuracy of the embedding-based forest on the test split.
predicted = rf.predict(test_transformed)
(predicted == twenty_test.target).mean()

0.8621837549933422

# Keras model with embedding layer

In [90]:
from tensorflow.keras import layers

# Trainable embedding layer for the Keras model built further down.
# input_dim=5000 is the same number as Tokenizer(num_words=5000), and
# input_length=500 the same as maxlen=500 in the pad_sequences calls;
# keep the three in sync when changing any of them.
embedding = layers.Embedding(
    input_dim=5000, 
    output_dim=50, 
    input_length=500
)

In [72]:
from tensorflow.keras.preprocessing.text import Tokenizer

# num_words=5000 is the same number as input_dim=5000 of the Embedding layer.
tokenizer = Tokenizer(num_words=5000)
# The word index is fitted on the training texts only.
tokenizer.fit_on_texts(twenty_train.data)

In [74]:
# Convert both splits with the tokenizer fitted on the training texts.
X_train, X_test = (
    tokenizer.texts_to_sequences(split.data)
    for split in (twenty_train, twenty_test)
)

In [88]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Bring every sequence to length 500 (the Embedding layer's input_length),
# with padding added at the end (padding='post').
X_train = pad_sequences(X_train, padding='post', maxlen=500)
X_test = pad_sequences(X_test, padding='post', maxlen=500)

In [106]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras import regularizers

# NOTE(review): this rebinds `model`, which until here referred to the gensim
# word vectors; the "pretrained word embeddings" cells at the end of the
# notebook still read `model.wv`, so they only work when run before this cell.
hidden_regularizer = regularizers.l1_l2(l1=1e-5, l2=1e-4)

# Embedding -> flatten -> small regularized hidden layer -> one softmax unit per category.
model = Sequential([
    embedding,
    layers.Flatten(),
    layers.Dense(10, activation='relu', kernel_regularizer=hidden_regularizer),
    layers.Dense(len(categories), activation='softmax'),
])
model.compile(
    optimizer='adam',
    loss=SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 500, 50)           250000    
_________________________________________________________________
flatten_9 (Flatten)          (None, 25000)             0         
_________________________________________________________________
dense_18 (Dense)             (None, 10)                250010    
_________________________________________________________________
dense_19 (Dense)             (None, 4)                 44        
Total params: 500,054
Trainable params: 500,054
Non-trainable params: 0
_________________________________________________________________


In [107]:
# Train for 10 epochs on the padded training sequences; twenty_train.target supplies the labels.
model.fit(X_train, twenty_train.target, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fe77a336748>

In [108]:
# Pick the highest-scoring class per test document, then compute the accuracy.
class_scores = model.predict(X_test)
predicted = class_scores.argmax(axis=1)
(predicted == twenty_test.target).mean()

0.9027962716378163

# There's more...

For more difficult problems, we can stack `Conv1D` layers on top of the embedding, for example:

In [None]:
# Illustrative sketch: a convolution + pooling stage on top of the embedding.
# The layer classes are taken from the already imported `layers` module, and
# `embedded_sequences` is defined here, so the cell runs on its own instead of
# raising NameError.
sequence_input = layers.Input(shape=(500,))  # 500 = padded sequence length
embedded_sequences = embedding(sequence_input)
x = layers.Conv1D(128, 5, activation='relu')(embedded_sequences)
x = layers.MaxPooling1D(5)(x)

## Use pretrained word embeddings in a Keras model

In [68]:
# Map integer position -> word, in the iteration order of the loaded vocabulary.
# NOTE(review): despite its name this is index -> word, not word -> index; the
# position is assumed to match the word's row in the vector matrix - confirm.
# `model` must be the gensim KeyedVectors here (the Keras cell above rebinds
# the name). It is accessed directly because its `.wv` alias is deprecated.
word_index = dict(enumerate(model.vocab.keys()))

  """Entry point for launching an IPython kernel.


In [50]:
# Shape of the pretrained vector matrix, read from the KeyedVectors directly
# (the `.wv` alias is deprecated and only adds a warning).
model.vectors.shape

  """Entry point for launching an IPython kernel.


(2519370, 300)

In [69]:
from tensorflow.keras.layers import Embedding

# `model` must be the gensim KeyedVectors here, not the Keras model that an
# earlier cell binds to the same name.
pretrained_vectors = model.vectors

# Frozen embedding layer initialised with the pretrained vectors.
# The layer's (input_dim, output_dim) is taken from the matrix itself: a
# weight matrix with a different number of rows than input_dim is rejected
# when the layer is built. The array is passed as-is rather than as a Python
# list of its rows.
embedding_layer = Embedding(
    pretrained_vectors.shape[0],
    pretrained_vectors.shape[1],
    weights=[pretrained_vectors],
    input_length=500,
    trainable=False
)

  """
