<a href="https://colab.research.google.com/github/Poorya0071/NLP_TensorFlow/blob/main/ag_news.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ag_news_subset

This notebook aims to work on AG dataset from Tensorflow.

In [75]:
import tensorflow_datasets.public_api as tfds

In [76]:
datasets_list = tfds.list_builders() # get all available datasets in TFDS
print("ag_news_subset" in datasets_list)

True


# Upload the dataset and split it to train and test

In [77]:
(train_data, test_data), ds_info = tfds.load(name="ag_news_subset", # target dataset to get from TFDS
                                             split=["train", "test"], # what splits of data should we get? note: not all datasets have train, valid, test
                                             shuffle_files=True, # shuffle files on download?
                                             as_supervised=True, # download data in tuple format (sample, label), e.g. (image, label)
                                             with_info=True) 

In [78]:
ds_info.features

FeaturesDict({
    'description': Text(shape=(), dtype=tf.string),
    'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=4),
    'title': Text(shape=(), dtype=tf.string),
})

In [79]:
ds_info.features['title']

Text(shape=(), dtype=tf.string)

In [80]:
class_names = ds_info.features["label"].names
class_names

['World', 'Sports', 'Business', 'Sci/Tech']

# take one sample to study the type and shape of data

In [81]:
train_one_sample = train_data.take(1)

In [82]:
train_data

<PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>

In [83]:
for text, label in train_one_sample:
  print(f"""
  Image shape: {text.shape}
  Image dtype: {text.dtype}
  Target class from Food101 (tensor form): {label}
  Class name (str form): {class_names[label.numpy()]}
        """)


  Image shape: ()
  Image dtype: <dtype: 'string'>
  Target class from Food101 (tensor form): 3
  Class name (str form): Sci/Tech
        


In [140]:
import pandas as pd

# Turn data into Pandas DataFrame

In [86]:
df = tfds.as_dataframe(train_data, ds_info)

In [87]:
data = df.copy()

In [88]:
df.head()

Unnamed: 0,description,label
0,b'AMD #39;s new dual-core Opteron chip is desi...,3
1,b'Reuters - Major League Baseball\\Monday anno...,1
2,b'President Bush #39;s quot;revenue-neutral q...,2
3,b'Britain will run out of leading scientists u...,3
4,"b'London, England (Sports Network) - England m...",1


In [89]:
df.info()

<class 'tensorflow_datasets.core.as_dataframe.StyledDataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   description  120000 non-null  object
 1   label        120000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.8+ MB


In [28]:
# import neattext as nt

# def preprocess_text(text):
#     text = text.lower()
#     text = nt.TextFrame(text)
#     text = (text.remove_emails().remove_urls().remove_emojis()
#             .remove_puncts().remove_stopwords().remove_special_characters()
# 						.fix_contractions())
#     return str(text)

# df["description"] = df["description"].apply(lambda x:preprocess_text(x))

 # Shuffle datasets

In [90]:
df_shuffled = df.sample(frac=1, random_state=42) # shuffle with random_state=42 for reproducibility

In [91]:
df_shuffled.head(10)

Unnamed: 0,description,label
71787,b' LOS ANGELES/PHILADELPHIA (Reuters) - Califo...,2
67218,b'Microsofthas joined the desktop search fray ...,3
54066,"b'Oil prices hit fresh records on Monday, push...",2
7168,b'AP - A man accused by the United States of b...,0
29618,b'Reports suggest an Iraqi woman scientist may...,0
101425,"b"" TEHRAN (Reuters) - Iran will stop convertin...",0
20441,b'Microsoft today announced that it has acquir...,3
2662,b' NEW YORK (Reuters) - U.S. stocks advanced o...,2
20371,"b'As regulatory audits loom, wise executives h...",2
108151,b'As the Olympic Games headed into the home st...,1


Indicate Features and Labels

In [92]:
X = df_shuffled['description']
y = df_shuffled['label']

In [117]:
X = X.tolist()

# Check the data Imbalance

In [93]:
y.value_counts()

2    30000
3    30000
0    30000
1    30000
Name: label, dtype: int64

# Split the train dataset to train and validation

In [118]:
from sklearn.model_selection import train_test_split

# Use train_test_split to split training data into training and validation sets
train_sentences, val_sentences, train_labels, val_labels = train_test_split(X,
                                                                            y.to_numpy(),
                                                                            test_size=0.2, # dedicate 10% of samples to validation set
                                                                            random_state=42)

In [99]:
round(sum([len(i.split()) for i in train_sentences])/len(train_sentences))

31

# Text Vectorization

In [100]:
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
# Note: in TensorFlow 2.6+, you no longer need "layers.experimental.preprocessing"
# you can use: "tf.keras.layers.TextVectorization", see https://github.com/tensorflow/tensorflow/releases/tag/v2.6.0 for more

# Use the default TextVectorization variables
max_vocab_length = 60000 # max number of words to have in our vocabulary
max_length = 31 # max length our sequences will be (e.g. how many words from a Tweet does our model see?)

text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode="int",
                                    output_sequence_length=max_length)

In [101]:
text_vectorizer.adapt(train_sentences)

In [102]:
sample_sentence = "There's a flood in my street!"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 31), dtype=int64, numpy=
array([[2613,    3, 4725,    6, 1196,  337,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0]])>

In [103]:
len(text_vectorizer.get_vocabulary())

60000

# Embedding

In [104]:
tf.random.set_seed(42)
from tensorflow.keras import layers

embedding = layers.Embedding(input_dim=max_vocab_length, # set input shape
                             output_dim=128, # set size of embedding vector
                             embeddings_initializer="uniform", # default, intialize randomly
                             input_length=max_length, # how long is each input
                             name="embedding_1") 

embedding

<keras.layers.core.embedding.Embedding at 0x7f527d654fa0>

In [105]:
import random
random_sentence = random.choice(train_sentences)
print(f"Original text:\n{random_sentence}\
      \n\nEmbedded version:")

# Embed the random sentence (turn it into numerical representation)
sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed

Original text:
b'Never mind cover boy Adrian Peterson, the quarterback with a Heisman at home, the stifling defense or the sideline genius of Bob Stoops.'      

Embedded version:


<tf.Tensor: shape=(1, 31, 128), dtype=float32, numpy=
array([[[-0.04823525,  0.02764073, -0.02922579, ..., -0.01928561,
         -0.00341996,  0.00573767],
        [-0.03136536, -0.04329263,  0.0493185 , ..., -0.03747846,
          0.03648457, -0.01672309],
        [ 0.01860459,  0.01337567,  0.02362109, ...,  0.03108766,
          0.03604201, -0.04792205],
        ...,
        [ 0.01645621, -0.00589932, -0.01471175, ..., -0.02511839,
          0.00912381, -0.00024097],
        [ 0.01645621, -0.00589932, -0.01471175, ..., -0.02511839,
          0.00912381, -0.00024097],
        [ 0.01645621, -0.00589932, -0.01471175, ..., -0.02511839,
          0.00912381, -0.00024097]]], dtype=float32)>

# Base model

In [106]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Create tokenization and modelling pipeline
model_0 = Pipeline([
                    ("tfidf", TfidfVectorizer()), # convert words to numbers using tfidf
                    ("clf", MultinomialNB()) # model the text
])

# Fit the pipeline to the training data
model_0.fit(train_sentences, train_labels)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

In [107]:
baseline_score = model_0.score(val_sentences, val_labels)
print(f"Our baseline model achieves an accuracy of: {baseline_score*100:.2f}%")

Our baseline model achieves an accuracy of: 89.72%


# set The best performance callback

In [108]:
import os

# Create a function to implement a ModelCheckpoint callback with a specific filename 
def create_model_checkpoint(model_name, save_path="model_experiments"):
  return tf.keras.callbacks.ModelCheckpoint(filepath=os.path.join(save_path, model_name), # create filepath to save model
                                            verbose=0, # only output a limited amount of text
                                            save_best_only=True)

# Combination of LSTM and Dense model

In [111]:
# Set random seed and create embedding layer (new embedding layer for each model)
tf.random.set_seed(42)
from tensorflow.keras import layers
model_2_embedding = layers.Embedding(input_dim=max_vocab_length,
                                     output_dim=128,
                                     embeddings_initializer="uniform",
                                     input_length=max_length,
                                     name="embedding_2")


# Create LSTM model
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = model_2_embedding(x)
print(x.shape)
x = layers.LSTM(16, return_sequences=True)(x) # return vector for each word in the Tweet (you can stack RNN cells as long as return_sequences=True
x = layers.LSTM(32, return_sequences=True)(x)
x = layers.LSTM(64, return_sequences=True)(x)
x = layers.LSTM(128)(x) # return vector for whole sequence
print(x.shape)
x = layers.Dense(256, activation="relu")(x) # optional dense layer on top of output of LSTM cell
outputs = layers.Dense(4, activation="softmax")(x)
model_2 = tf.keras.Model(inputs, outputs, name="model_2_LSTM")

# Compile model
model_2.compile(loss="sparse_categorical_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

# Fit model
model_2_history = model_2.fit(train_sentences, # input sentences can be a list of strings due to text preprocessing layer built-in model
                              train_labels,
                              steps_per_epoch=int(0.1 * len(train_sentences)),
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              validation_steps=int(0.1 * len(val_sentences)),
                              callbacks=[create_model_checkpoint(model_name=model_2.name)])

(None, 31, 128)
(None, 128)
Epoch 1/5



Epoch 2/5



Epoch 3/5
Epoch 4/5
Epoch 5/5


In [112]:
model_2 = tf.keras.models.load_model("model_experiments/model_2_LSTM")
model_2.evaluate(val_sentences, val_labels)



[0.2979399859905243, 0.9043333530426025]

# GRU model

In [59]:
tf.random.set_seed(42)
from tensorflow.keras import layers
model_3_embedding = layers.Embedding(input_dim=max_vocab_length,
                                     output_dim=128,
                                     embeddings_initializer="uniform",
                                     input_length=max_length,
                                     name="embedding_3")

# Build an RNN using the GRU cell
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = model_3_embedding(x)
# x = layers.GRU(64, return_sequences=True) # stacking recurrent cells requires return_sequences=True
x = layers.GRU(64)(x) 
# x = layers.Dense(64, activation="relu")(x) # optional dense layer after GRU cell
outputs = layers.Dense(4, activation="softmax")(x)
model_3 = tf.keras.Model(inputs, outputs, name="model_3_GRU")

In [60]:
model_3.compile(loss="sparse_categorical_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

# Fit model
model_3_history = model_3.fit(train_sentences, # input sentences can be a list of strings due to text preprocessing layer built-in model
                              train_labels,
                              steps_per_epoch=int(0.1 * len(train_sentences)),
                              epochs=8,
                              validation_data=(val_sentences, val_labels),
                              validation_steps=int(0.1 * len(val_sentences)))

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


# Bidirectional Model

In [57]:
tf.random.set_seed(42)
from tensorflow.keras import layers
model_4_embedding = layers.Embedding(input_dim=max_vocab_length,
                                     output_dim=128,
                                     embeddings_initializer="uniform",
                                     input_length=max_length,
                                     name="embedding_4")

# Build a Bidirectional RNN in TensorFlow
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = model_4_embedding(x)
# x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x) # stacking RNN layers requires return_sequences=True
x = layers.Bidirectional(layers.LSTM(64))(x) # bidirectional goes both ways so has double the parameters of a regular LSTM layer
outputs = layers.Dense(4, activation="softmax")(x)
model_4 = tf.keras.Model(inputs, outputs, name="model_4_Bidirectional")

In [58]:
model_4.compile(loss="sparse_categorical_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

# Fit model
model_4_history = model_4.fit(train_sentences, # input sentences can be a list of strings due to text preprocessing layer built-in model
                              train_labels,
                              steps_per_epoch=int(0.1 * len(train_sentences)),
                              epochs=8,
                              validation_data=(val_sentences, val_labels),
                              validation_steps=int(0.1 * len(val_sentences)))

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


# Prediction on some unseen samples

In [137]:
class_names

['World', 'Sports', 'Business', 'Sci/Tech']

In [138]:
sample = "These days everyone says that Lionel Messi is the best choice for the FIFA best player award after winning the 2022 Qatar world cup."
model_2.predict([sample])



array([[0.01363028, 0.9831715 , 0.00183328, 0.00136499]], dtype=float32)

In [139]:
sample_1 = 'BTC price is at a three year low, and experts predict that this trend will continue until the US government stops the interest rate hike.'
model_2.predict([sample_1])



array([[0.07106265, 0.00273783, 0.8624151 , 0.06378452]], dtype=float32)

In [136]:
sample_2 = "Machine learning is a branch of artificial intelligence (AI) and computer science which focuses on the use of data and algorithms to imitate the way that humans learn, gradually improving its accuracy."
model_2.predict([sample_2])



array([[1.0955307e-03, 7.5701530e-05, 6.7913998e-03, 9.9203730e-01]],
      dtype=float32)