## USING VECTOR SPACE MODEL AND COSINE SIMILARITY FOR TEXT CLASSIFICATION

### CONSTRUCT THE VECTOR SPACE MODEL (PREPROCESS THE TEXT, CALCULATE THE BAG OF WORDS AND TF-IDF) AND COMPUTE THE IMPORTANCE OF THE WORD CHINESE IN THE TEST DATA

In [None]:
import tensorflow as tf
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np

In [None]:
# Query document that is later projected into the space learned from the corpus below.
test_document = "Chinese Chinese Chinese Tokyo Japan"

# Toy training corpus: each entry is one short document.
train_documents = [
    "Chinese Beijing Chinese",
    "Chinese Chinese Shanghai",
    "Chinese Macao",
    "Tokyo Japan Chinese",
]

In [None]:
# Two vectorizers over the same corpus: one for the bag-of-words
# representation and one for the TF-IDF representation.
vectorizer_bow = CountVectorizer()
vectorizer_tfidf = TfidfVectorizer()

# Learn each vocabulary from the training documents and encode them in one step.
X_bow = vectorizer_bow.fit_transform(train_documents)
X_tfidf = vectorizer_tfidf.fit_transform(train_documents)

In [None]:
# Encode the test document with the vectorizer already fitted on the training
# corpus (transform only, so the vocabulary and weights are not re-learned).
test_vector_tfidf = vectorizer_tfidf.transform([test_document])

# The vocabulary is queried with the lower-case key; that column of the single
# test row holds the TF-IDF weight of the word.
chinese_column = vectorizer_tfidf.vocabulary_["chinese"]
importance_of_chinese = test_vector_tfidf[0, chinese_column]

In [None]:
# Report the TF-IDF weight of "chinese" in the test document.
importance_label = "Importance of the word 'Chinese' in the test data (TF-IDF):"
print(importance_label, importance_of_chinese)

Importance of the word 'Chinese' in the test data (TF-IDF): 0.7420574954436144


### FIND THE SIMILARITY BETWEEN THE TEST DATA AND ANY ONE DOCUMENT FROM THE TRAINING DATA USING THE COSINE SIMILARITY METRIC

In [None]:
# Densify the sparse TF-IDF matrices first, then wrap them as float32 tensors
# so the similarity can be computed with TensorFlow ops.
train_tfidf_array = X_tfidf.toarray()
test_tfidf_array = test_vector_tfidf.toarray()

X_tfidf_dense = tf.convert_to_tensor(train_tfidf_array, dtype=tf.float32)
test_vector_tfidf_dense = tf.convert_to_tensor(test_tfidf_array, dtype=tf.float32)

In [None]:
# Row of the dense training matrix to compare the test document against
# (row 0 is the first training document).
selected_document_index = 0
selected_document_tfidf = X_tfidf_dense[selected_document_index]

In [None]:
# Numerator of the cosine similarity: element-wise product summed along axis 1
# of the test tensor (the selected document row broadcasts against it).
elementwise_product = test_vector_tfidf_dense * selected_document_tfidf
dot_product = tf.reduce_sum(elementwise_product, axis=1)

# Denominator pieces: Euclidean norms of the test row(s) and of the selected document.
squared_test = tf.square(test_vector_tfidf_dense)
squared_doc = tf.square(selected_document_tfidf)
magnitude_test = tf.sqrt(tf.reduce_sum(squared_test, axis=1))
magnitude_doc = tf.sqrt(tf.reduce_sum(squared_doc))

In [None]:
# cos(theta) = (a . b) / (|a| * |b|)
norm_product = magnitude_test * magnitude_doc
cosine_similarity = dot_product / norm_product

# Pull the first (index 0) entry out of the result as a NumPy value for printing.
cosine_similarity_array = cosine_similarity.numpy()
cosine_similarity_value = cosine_similarity_array[0]

In [None]:
# Report the similarity between the test document and the selected training document.
similarity_label = "Cosine Similarity between test data and selected training document:"
print(similarity_label, cosine_similarity_value)

Cosine Similarity between test data and selected training document: 0.5358071


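### TAKE A SENTIMENT-LABELLED DATASET OF YOUR OWN, SPLIT IT INTO TRAINING AND TESTING PARTS, AND PERFORM SENTIMENT CLASSIFICATION WITH LAPLACE SMOOTHING

**Note:** the cells in this section train a bidirectional LSTM on the IMDB reviews dataset; they do not apply Laplace smoothing.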

In [None]:
import tensorflow_datasets as tfds
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional

In [None]:
# Load the IMDB reviews dataset as (text, label) pairs together with its metadata.
dataset, info = tfds.load('imdb_reviews', as_supervised=True, with_info=True)

# Keep handles on the two labelled splits used below.
train_data = dataset['train']
test_data = dataset['test']

Downloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.YW4XHH_1.0.0/imdb_reviews-train.tfrecor…

Generating test examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.YW4XHH_1.0.0/imdb_reviews-test.tfrecord…

Generating unsupervised examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.YW4XHH_1.0.0/imdb_reviews-unsupervised.…

Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.


In [None]:
# Materialise the training split as plain Python lists: review text and integer label.
train_sentences = []
train_labels = []
for sentence, label in tfds.as_numpy(train_data):
    # tfds.as_numpy yields each review as a bytes object. Calling str() on bytes
    # returns its repr ("b'...'" with escaped non-ASCII bytes), which would leak
    # the b' prefix and escape sequences into the tokenizer vocabulary, so the
    # bytes are decoded to real text instead.
    train_sentences.append(sentence.decode("utf-8"))
    train_labels.append(int(label))

In [None]:
# Materialise the test split as plain Python lists: review text and integer label.
test_sentences = []
test_labels = []
for sentence, label in tfds.as_numpy(test_data):
    # tfds.as_numpy yields each review as a bytes object. Calling str() on bytes
    # returns its repr ("b'...'" with escaped non-ASCII bytes), which would leak
    # the b' prefix and escape sequences into the token sequences, so the bytes
    # are decoded to real text instead.
    test_sentences.append(sentence.decode("utf-8"))
    test_labels.append(int(label))

In [None]:
# Hold out 20% of the training reviews as a validation set (fixed random_state).
# NOTE: this cell rebinds train_sentences / train_labels to the reduced subset,
# so running it a second time without re-running the cell that builds those
# lists would split the already-reduced data again.
split_parts = train_test_split(
    train_sentences,
    train_labels,
    test_size=0.2,
    random_state=42,
)
train_sentences, val_sentences, train_labels, val_labels = split_parts

In [None]:
# Text-encoding settings shared by the tokenizer, padding and embedding cells.
oov_tok = "<OOV>"    # token passed to the tokenizer as its out-of-vocabulary marker
max_length = 100     # sequence length used when padding and when building the model
vocab_size = 10_000  # passed as the tokenizer's num_words and the embedding's input_dim

In [None]:
# Fit the tokenizer on the training sentences only; the same fitted tokenizer
# then encodes all three splits.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)

# Encode train, validation and test texts, in that order.
train_sequences, val_sequences, test_sequences = (
    tokenizer.texts_to_sequences(texts)
    for texts in (train_sentences, val_sentences, test_sentences)
)

In [None]:
# Identical padding settings for every split: fixed length max_length, padding
# added at the end ('post'). The truncation side is left at the library default.
pad_options = dict(maxlen=max_length, padding='post')

train_padded = pad_sequences(train_sequences, **pad_options)
val_padded = pad_sequences(val_sequences, **pad_options)
test_padded = pad_sequences(test_sequences, **pad_options)

In [None]:
# Seed Python, NumPy and TensorFlow in one call so that weight initialisation
# and the per-epoch shuffling during training are repeatable across runs.
tf.keras.utils.set_random_seed(42)

# Binary sentiment classifier: token embedding -> bidirectional LSTM ->
# small dense layer -> single sigmoid unit.
model = Sequential([
    # `input_length` is deprecated in current Keras releases and only triggers a
    # warning there, so it is omitted. The sequence length is provided when the
    # model is built or first called on the padded data.
    Embedding(input_dim=vocab_size, output_dim=16),
    Bidirectional(LSTM(32)),
    Dense(24, activation='relu'),
    Dense(1, activation='sigmoid')
])

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Build the model explicitly with an unspecified batch dimension and
# max_length tokens per sequence.
model_input_shape = (None, max_length)
model.build(input_shape=model_input_shape)

In [None]:
# Display the layer-by-layer summary of the model.
model.summary()

In [None]:
# Labels are Python lists at this point; convert them to NumPy arrays for Keras.
train_targets = np.array(train_labels)
val_targets = np.array(val_labels)

# Train for 5 epochs, evaluating on the held-out validation split after each one.
history = model.fit(
    train_padded,
    train_targets,
    epochs=5,
    validation_data=(val_padded, val_targets),
)

Epoch 1/5
625/625 ━━━━━━━━━━━━━━━━━━━━ 60s 89ms/step - accuracy: 0.5917 - loss: 0.6451 - val_accuracy: 0.7480 - val_loss: 0.5202
Epoch 2/5
625/625 ━━━━━━━━━━━━━━━━━━━━ 73s 75ms/step - accuracy: 0.8393 - loss: 0.4104 - val_accuracy: 0.7970 - val_loss: 0.4386
Epoch 3/5
625/625 ━━━━━━━━━━━━━━━━━━━━ 80s 71ms/step - accuracy: 0.8664 - loss: 0.3306 - val_accuracy: 0.8320 - val_loss: 0.3917
Epoch 4/5
625/625 ━━━━━━━━━━━━━━━━━━━━ 82s 71ms/step - accuracy: 0.9050 - loss: 0.2462 - val_accuracy: 0.8344 - val_loss: 0.3770
Epoch 5/5
625/625 ━━━━━━━━━━━━━━━━━━━━ 82s 70ms/step - accuracy: 0.9355 - loss: 0.1839 - val_accuracy: 0.8418 - val_loss: 0.4216


In [None]:
# Score the trained model on the untouched test split and report its accuracy.
test_targets = np.array(test_labels)
test_loss, test_acc = model.evaluate(test_padded, test_targets)
print("Test Accuracy:", test_acc)

782/782 ━━━━━━━━━━━━━━━━━━━━ 15s 19ms/step - accuracy: 0.8361 - loss: 0.4306
Test Accuracy: 0.8347200155258179
