## Bag of Words

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import classification_report
from sklearn.datasets import fetch_20newsgroups

In [2]:
# Defining search categories

documents_train = fetch_20newsgroups(
    subset = "train",
    categories = ["comp.graphics", "comp.windows.x", "rec.autos", "sci.space"]
    
)

documents_test = fetch_20newsgroups(
    subset = "test",
    categories = ["comp.graphics", "comp.windows.x", "rec.autos", "sci.space"]
    
)

In [3]:
print(documents_train.data[0])
print(documents_train.target[0])
print("target names", documents_train.target_names)

From: orourke@sophia.smith.edu (Joseph O'Rourke)
Subject: Re: Delaunay Triangulation
Organization: Smith College, Northampton, MA, US
Lines: 22

In article <lsk1v9INN93c@caspian.usc.edu> zyeh@caspian.usc.edu (zhenghao yeh) writes:
>
>Does anybody know what Delaunay Triangulation is?
>Is there any reference to it? 
>Is it useful for creating 3-D objects? If yes, what's the advantage?

There is a vast literature on Delaunay triangulations, literally
hundreds of papers.  A program is even provided with every copy of 
Mathematica nowadays.  You might look at this if you are interested in 
using it for creating 3D objects:

@article{Boissonnat5,
  author = "J.D. Boissonnat",
  title = "Geometric Structures for Three-Dimensional Shape Representation",
  journal = "ACM Transactions on Graphics",
  month = "October",
  year = {1984},
  volume = {3},
  number = {4},
  pages = {266-286}
}


0
target names ['comp.graphics', 'comp.windows.x', 'rec.autos', 'sci.space']


In [11]:
# Term Frequency-Inverse Document Frequency.
text_vectorizer = layers.TextVectorization(
    max_tokens = 5000,
    output_mode = "tf_idf"
)

text_vectorizer.adapt(documents_train.data)

X_train = text_vectorizer(documents_train.data)
X_test = text_vectorizer(documents_test.data)

In [13]:
X_train

<tf.Tensor: shape=(2364, 5000), dtype=float32, numpy=
array([[ 78.40026  ,   0.7299776,   0.765898 , ...,   0.       ,
          0.       ,   0.       ],
       [ 78.40026  ,   0.7299776,   2.297694 , ...,   0.       ,
          0.       ,   0.       ],
       [ 27.670681 ,   3.649888 ,   1.531796 , ...,   0.       ,
          0.       ,   0.       ],
       ...,
       [ 59.95314  ,   5.1098433,   1.531796 , ...,   0.       ,
          0.       ,   0.       ],
       [156.80052  ,  18.979418 ,   7.65898  , ...,   0.       ,
          0.       ,   0.       ],
       [152.18875  ,   8.759731 ,   5.361286 , ...,   0.       ,
          0.       ,   0.       ]], dtype=float32)>

In [14]:
mdl = keras.Sequential()
mdl.add(layers.Dense(256, activation = "relu", input_shape = (X_train.shape[1], )))
mdl.add(layers.Dense(128, activation = "relu"))
mdl.add(layers.Dense(64, activation = "relu"))
mdl.add(layers.Dense(4, activation = "softmax"))

mdl.compile(optimizer = "adam", loss= "sparse_categorical_crossentropy", metrics = ["accuracy"])

In [21]:
mdl.fit(X_train, documents_train.target, epochs = 1)
y_pred = mdl.predict(X_test)

[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 1.0000 - loss: 5.3866e-05
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step


In [22]:
print(classification_report(documents_test.target, np.argmax(y_pred, axis = 1)))

              precision    recall  f1-score   support

           0       0.80      0.85      0.82       389
           1       0.90      0.83      0.86       395
           2       0.97      0.95      0.96       396
           3       0.92      0.95      0.94       394

    accuracy                           0.90      1574
   macro avg       0.90      0.90      0.90      1574
weighted avg       0.90      0.90      0.90      1574

