In [6]:
#example on keras documentation https://keras.io/examples/nlp/bidirectional_lstm_imdb/
#dataset https://www.kaggle.com/datasets/thedevastator/new-dataset-for-text-classification-ag-news
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers

max_features = 20000  # Only consider the top 20k words
maxlen = 200  # Only consider the first 200 words of each news article review


In [7]:
#example on keras documentation https://keras.io/examples/nlp/bidirectional_lstm_imdb/
# Input for variable-length sequences of integers
inputs = keras.Input(shape=(None,), dtype="int32")
# Embed each integer in a 128-dimensional vector
x = layers.Embedding(max_features, 128)(inputs)
# Add 2 bidirectional LSTMs
x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(64))(x)
# Add a classifier
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)
model.summary()


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 128)         2560000   
                                                                 
 bidirectional (Bidirection  (None, None, 128)         98816     
 al)                                                             
                                                                 
 bidirectional_1 (Bidirecti  (None, 128)               98816     
 onal)                                                           
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 2757761 (10.52 MB)
Trainable params: 2757761 (

In [8]:
#check number of uniqie words in dataset to update vocab size later
import pandas as pd
data = pd.read_csv("train.csv", nrows = 1000)
data.head()
#data.info()

Unnamed: 0,text,label
0,Wall St. Bears Claw Back Into the Black (Reute...,2
1,Carlyle Looks Toward Commercial Aerospace (Reu...,2
2,Oil and Economy Cloud Stocks' Outlook (Reuters...,2
3,Iraq Halts Oil Exports from Main Southern Pipe...,2
4,"Oil prices soar to all-time record, posing new...",2


In [9]:
#budilding model
from tensorflow import keras
from tensorflow.keras import layers

# vocab size
max_features = 20000

inputs = keras.Input(shape=(None,), dtype="int32")

x = layers.Embedding(max_features, 128)(inputs)

# Add 2 bidirectional LSTMs
x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(64))(x)

num_classes = 4  # Number of classes
outputs = layers.Dense(num_classes, activation="softmax")(x)

model = keras.Model(inputs, outputs)
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_1 (Embedding)     (None, None, 128)         2560000   
                                                                 
 bidirectional_2 (Bidirecti  (None, None, 128)         98816     
 onal)                                                           
                                                                 
 bidirectional_3 (Bidirecti  (None, 128)               98816     
 onal)                                                           
                                                                 
 dense_1 (Dense)             (None, 4)                 516       
                                                                 
Total params: 2758148 (10.52 MB)
Trainable params: 2758148 

In [10]:
#used https://www.kaggle.com/code/ybrenning/simple-feature-extractor-bert-model#Loading-the-Dataset and labs from data analysis for help
import pandas as pd
data = pd.read_csv("train.csv", nrows = 1000) #could also direct to kaggle website and not enter the name directly
data.head()
#df.shape

Unnamed: 0,text,label
0,Wall St. Bears Claw Back Into the Black (Reute...,2
1,Carlyle Looks Toward Commercial Aerospace (Reu...,2
2,Oil and Economy Cloud Stocks' Outlook (Reuters...,2
3,Iraq Halts Oil Exports from Main Southern Pipe...,2
4,"Oil prices soar to all-time record, posing new...",2


In [11]:
data.tail

<bound method NDFrame.tail of                                                   text  label
0    Wall St. Bears Claw Back Into the Black (Reute...      2
1    Carlyle Looks Toward Commercial Aerospace (Reu...      2
2    Oil and Economy Cloud Stocks' Outlook (Reuters...      2
3    Iraq Halts Oil Exports from Main Southern Pipe...      2
4    Oil prices soar to all-time record, posing new...      2
..                                                 ...    ...
995  U.S. Stocks Rebound as Oil Prices Ease  NEW YO...      2
996  Dollar Rises Vs Euro After Asset Data  NEW YOR...      2
997  Bikes Bring Internet to Indian Villagers (AP) ...      3
998  Celebrity Chefs Are Everywhere in Vegas By ADA...      3
999  Entertainment World Wary of Microsoft Technolo...      3

[1000 rows x 2 columns]>

In [12]:
data.head()

Unnamed: 0,text,label
0,Wall St. Bears Claw Back Into the Black (Reute...,2
1,Carlyle Looks Toward Commercial Aerospace (Reu...,2
2,Oil and Economy Cloud Stocks' Outlook (Reuters...,2
3,Iraq Halts Oil Exports from Main Southern Pipe...,2
4,"Oil prices soar to all-time record, posing new...",2


In [5]:
data.shape

(1000, 2)

In [13]:
data.dtypes

text     object
label     int64
dtype: object

In [14]:
#load data

from tensorflow.keras.preprocessing.text import Tokenizer

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
#getting text and label using pandas
texts = data['text']
labels = data['label']

#Tokenize
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

#padding sequences to a fixed length

maxlen = 200
X = pad_sequences(sequences, maxlen=maxlen)

Y = labels  #label encoding

#split your data into training and validation sets
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(X, Y, test_size=0.2, random_state=42)

print(len(x_train), "Training sequences")
print(len(x_val), "Validation sequences")


800 Training sequences
200 Validation sequences


In [15]:
#compile model with metrics
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

#training model on data
model.fit(x_train, y_train, batch_size=32, epochs=10, validation_data=(x_val, y_val))
loss, accuracy = model.evaluate(x_val, y_val)
print(f'Loss: {loss}, Accuracy: {accuracy * 100:.2f}%')


Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss: 0.7043758630752563, Accuracy: 79.50%
