# Bag-of-words document classification

What will happen on Reuters?

In [30]:
import json
import random
random.seed(0)
with open("data/reuters_51cls.json") as f:
    data=json.load(f)
random.shuffle(data) #play it safe!
print(data[0]) #Every item is a dictionary with `text` and `class` keys, here's one:

{'text': '&#2;\nUNITED COMPANIES &lt;UNCF> DECLARES STOCK DIVIDEND\nBATON ROUGE, La, March 6 - United Companies Financial Corp\nsaid its board declared a two pct stock dividend payable APril\neight to holders of record March 17.\nThe board also declared a regular quarterly cash dividend\nof 12.5 cts payable April one to holders of record March 16.\nReuter\n&#3;', 'class': 'earn'}


In [31]:
# We need to gather the texts, into a list
texts=[one_example["text"] for one_example in data]
labels=[one_example["class"] for one_example in data]
print(texts[:2])
print(labels[:2])

['&#2;\nUNITED COMPANIES &lt;UNCF> DECLARES STOCK DIVIDEND\nBATON ROUGE, La, March 6 - United Companies Financial Corp\nsaid its board declared a two pct stock dividend payable APril\neight to holders of record March 17.\nThe board also declared a regular quarterly cash dividend\nof 12.5 cts payable April one to holders of record March 16.\nReuter\n&#3;', '&#2;\nCANBRA FOODS SETS SPECIAL FIVE DLR/SHR PAYOUT\nLETHBRIDGE, Alberta, March 16 - &lt;Canbra Foods Ltd>, earlier\nreporting a 1986 net profit against a year-ago loss, said it\ndeclared a special, one-time dividend of five dlrs per common\nshare, pay March 31, record March 26.\nCanbra said it set the special payout to allow shareholders\nto participate in the gain on the sale of unit Stafford Foods\nLtd in November, 1986, as well as the company\'s "unusually\nprofitable performance" in 1986.\nCanbra earlier reported 1986 net earnings of 4.2 mln dlrs,\nexcluding a 1.3 mln dlr gain on the Stafford sale, compared to\na year-ago loss o

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


vectorizer=CountVectorizer(max_features=100000,binary=True,ngram_range=(1,2))
feature_matrix=vectorizer.fit_transform(texts)
print("shape=",feature_matrix.shape)
#print(feature_matrix.todense())




shape= (9465, 100000)


Now we have the feature matrix done! Next thing we need is the class labels to be predicted in one-hot encoding. This means:

* one row for every example
* one column for every possible class label
* exactly one column has 1 for every example, corresponding to the desired class

In [33]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

label_encoder=LabelEncoder() #Turns class labels into integers
one_hot_encoder=OneHotEncoder(sparse=False) #Turns class integers into one-hot encoding
class_numbers=label_encoder.fit_transform(labels)
print("class_numbers shape=",class_numbers.shape)
print("class_numbers",class_numbers)
print("class labels",label_encoder.classes_)
#And now yet the one-hot encoding
classes_1hot=one_hot_encoder.fit_transform(class_numbers.reshape(-1,1))
print("classes_1hot",classes_1hot)

class_numbers shape= (9465,)
class_numbers [11 11 11 ... 11  0  0]
class labels ['acq' 'alum' 'bop' 'carcass' 'cocoa' 'coffee' 'copper' 'cotton' 'cpi'
 'crude' 'dlr' 'earn' 'fuel' 'gas' 'gnp' 'gold' 'grain' 'heat' 'housing'
 'income' 'instal-debt' 'interest' 'ipi' 'iron-steel' 'jobs' 'lead' 'lei'
 'livestock' 'lumber' 'meal-feed' 'money-fx' 'money-supply' 'nat-gas'
 'oilseed' 'orange' 'pet-chem' 'potato' 'reserves' 'retail' 'rubber'
 'ship' 'silver' 'strategic-metal' 'sugar' 'tea' 'tin' 'trade' 'veg-oil'
 'wpi' 'yen' 'zinc']
classes_1hot [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]


In [34]:
import h5py
from keras.models import Model
from keras.layers import Input, Dense
from keras.callbacks import ModelCheckpoint

def save_model(file_name,model,label_encoder,vectorizer):
    """Saves model structure and vocabularies"""
    model_json = model.to_json()
    with open(file_name+".model.json", "w") as f:
        print(model_json,file=f)
    with open(file_name+".vocabularies.json","w") as f:
        classes=list(label_encoder.classes_)
        vocab=dict(((str(w),int(idx)) for w,idx in vectorizer.vocabulary_.items()))
        json.dump((classes,vocab),f,indent=2)
        
example_count,feature_count=feature_matrix.shape
example_count,class_count=classes_1hot.shape

inp=Input(shape=(feature_count,))
hidden=Dense(200,activation="tanh")(inp)
outp=Dense(class_count,activation="softmax")(hidden)
model=Model(inputs=[inp], outputs=[outp])
model.compile(optimizer="sgd",loss="categorical_crossentropy",metrics=['accuracy'])

# Save model and vocabularies
save_model("models/reuters_51cls_bow",model,label_encoder,vectorizer)
# Callback function to save weights during training
save_cb=ModelCheckpoint(filepath="models/reuters_51cls_bow.weights.h5", monitor='val_loss', verbose=1, save_best_only=True, mode='auto')
hist=model.fit(feature_matrix,classes_1hot,batch_size=100,verbose=1,epochs=30,validation_split=0.1,callbacks=[save_cb])


Train on 8518 samples, validate on 947 samples
Epoch 1/30

Epoch 00001: val_loss improved from inf to 1.76240, saving model to models/reuters_51cls_bow.weights.h5
Epoch 2/30

Epoch 00002: val_loss improved from 1.76240 to 1.49023, saving model to models/reuters_51cls_bow.weights.h5
Epoch 3/30

Epoch 00003: val_loss improved from 1.49023 to 1.36283, saving model to models/reuters_51cls_bow.weights.h5
Epoch 4/30

Epoch 00004: val_loss improved from 1.36283 to 1.27699, saving model to models/reuters_51cls_bow.weights.h5
Epoch 5/30

Epoch 00005: val_loss improved from 1.27699 to 1.20868, saving model to models/reuters_51cls_bow.weights.h5
Epoch 6/30

Epoch 00006: val_loss improved from 1.20868 to 1.15206, saving model to models/reuters_51cls_bow.weights.h5
Epoch 7/30

Epoch 00007: val_loss improved from 1.15206 to 1.10322, saving model to models/reuters_51cls_bow.weights.h5
Epoch 8/30

Epoch 00008: val_loss improved from 1.10322 to 1.05988, saving model to models/reuters_51cls_bow.weights.

In [48]:
import numpy
from sklearn.metrics import classification_report, confusion_matrix

#Validation data used during training:
val_instances,val_labels_1hot,_=hist.validation_data

print("Network output=",model.predict(val_instances))
predictions=numpy.argmax(model.predict(val_instances),axis=1)
print("Maximum class for each example=",predictions)
gold=numpy.nonzero(val_labels_1hot)[1] #undo 1-hot encoding
conf_matrix=confusion_matrix(list(gold),list(predictions))
print(conf_matrix)
print(classification_report(list(gold),list(predictions),target_names=label_encoder.classes_))

Network output= [[2.0972650e-01 1.2842861e-02 9.6099349e-03 ... 6.1267391e-03
  6.0931095e-03 5.6801271e-03]
 [1.0681288e-04 3.5428602e-06 3.7730745e-06 ... 2.1082194e-06
  1.9002957e-06 1.1053662e-06]
 [5.9391409e-01 7.4532600e-03 4.1777967e-03 ... 3.3430997e-03
  4.1454947e-03 2.9960810e-03]
 ...
 [3.2259982e-02 1.8379733e-03 9.8252657e-04 ... 7.1975170e-04
  8.0373778e-04 5.8117666e-04]
 [9.9498838e-01 7.3048315e-05 3.5937352e-05 ... 1.7827695e-05
  2.7510325e-05 2.1379099e-05]
 [9.9085873e-01 1.3929917e-04 7.2065945e-05 ... 3.2477805e-05
  5.5748078e-05 4.0438361e-05]]
Maximum class for each example= [ 0 11  0 43  0  0 11 11 11  9  0  0 11  0 11  9 11  0  0 11  0 11  9 21
 46  9 11 11  0 30 21 11  8 40 46  0 11  0 11 30 11  5 11  0 11 11  0 11
 11 11  0  0  9 11 11 21  0  8 11 46  0 11  0 11 11 11 21 30 31 46 37 11
 11  0 11 46 11 21  0 11  0 40 11 40  0 21 11  8 11 11 11 11 11  0 11 11
 46  0 11  4 11  0 11  0 21 11  0  0 11 11  0 21  0  0  0  0 46 11 46  8
 11 46  0  0 46 11  0 2

  .format(len(labels), len(target_names))
  'precision', 'predicted', average, warn_for)
