# **TEXT CLASSIFICATION**
 - Text preprocessing – stopword removal and the Porter stemming algorithm
 - Word embeddings using Keras
 - Padding the sequences
 - Fixing the vocabulary size
 - One-hot encoding
 - Constructing the sequential model (RNN and LSTM)
 - Evaluating with the accuracy score


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import TextVectorization, Embedding, LSTM, Dense, Input
from tensorflow.keras import layers
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

### **TEXT PRE-PROCESSING - STOPWORDS REMOVAL AND PORTER STEMMING ALGORITHM**

In [None]:
# Fetch the NLTK stopword corpus; stopwords.words('english') is read from it later on.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# One stemmer instance, reused by preprocess_text for every token.
porter = PorterStemmer()

# English stopwords held in a set so the per-token membership test is cheap.
stop_words = set(stopwords.words('english'))

In [None]:
def preprocess_text(text):
    """Drop English stopwords from ``text`` and Porter-stem what remains.

    The text is split on whitespace. A token is discarded when its
    lower-cased form is in the module-level ``stop_words`` set; every
    surviving token is passed through ``porter.stem``. Punctuation attached
    to a token is left in place because only whitespace separates tokens.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    kept = []
    for token in text.split():
        if token.lower() in stop_words:
            continue
        kept.append(porter.stem(token))
    return ' '.join(kept)

In [None]:
# Download helper_functions.py into the working directory so it can be imported below.
!wget https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py

--2024-08-17 17:15:52--  https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10246 (10K) [text/plain]
Saving to: ‘helper_functions.py’


2024-08-17 17:15:52 (78.4 MB/s) - ‘helper_functions.py’ saved [10246/10246]



In [None]:
# These helpers come from the downloaded helper_functions.py. Only unzip_data and
# create_tensorboard_callback are called in the visible cells of this notebook.
from helper_functions import unzip_data, create_tensorboard_callback, plot_loss_curves, compare_historys
# Download the zipped dataset into the working directory.
!wget "https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip"
# Presumably extracts the archive into the working directory, since train.csv and
# test.csv are read from there next — confirm against helper_functions.py.
unzip_data("nlp_getting_started.zip")
train_df = pd.read_csv("train.csv")
# NOTE(review): test_df is loaded but not used in the visible cells.
test_df = pd.read_csv("test.csv")

--2024-08-17 17:15:52--  https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.174.207, 74.125.23.207, 74.125.203.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.174.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: ‘nlp_getting_started.zip’


2024-08-17 17:15:53 (930 KB/s) - ‘nlp_getting_started.zip’ saved [607343/607343]



In [None]:
# Shuffle every row (frac=1); the fixed random_state keeps the order reproducible.
train_df_shuffled = train_df.sample(frac=1, random_state=42)

In [None]:
# Pull the raw tweets and their labels out of the shuffled frame as NumPy arrays.
all_texts = train_df_shuffled["text"].to_numpy()
all_targets = train_df_shuffled["target"].to_numpy()

# Hold out 10% of the rows for validation; the seed makes the split repeatable.
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    all_texts, all_targets, test_size=0.1, random_state=42)

In [None]:
# Run stopword removal + stemming over both splits; results are plain Python lists.
train_sentences = list(map(preprocess_text, train_sentences))
val_sentences = list(map(preprocess_text, val_sentences))

### **WORD EMBEDDINGS USING KERAS**

In [None]:
# Vocabulary cap: used as the Tokenizer's num_words and the Embedding's input_dim.
max_vocab_length = 10000
# Every sequence is padded or truncated to this many token ids.
max_length = 15
# Size of each learned word vector (the Embedding's output_dim).
embedding_dim = 16

In [None]:
# The word index is learned from the training sentences only. num_words caps the
# ids emitted by texts_to_sequences; words outside that range map to "<OOV>".
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=max_vocab_length, oov_token="<OOV>")
tokenizer.fit_on_texts(train_sentences)

In [None]:
# Convert each sentence into a list of integer word ids using the fitted tokenizer.
train_sequences = tokenizer.texts_to_sequences(train_sentences)
val_sequences = tokenizer.texts_to_sequences(val_sentences)

### **PADDING THE SEQUENCE**

In [None]:
# Force every sequence to exactly max_length ids: shorter ones are padded at the
# end (padding='post') and longer ones lose their tail (truncating='post').
padded_train_sequences = pad_sequences(train_sequences, maxlen=max_length, padding='post', truncating='post')
padded_val_sequences = pad_sequences(val_sequences, maxlen=max_length, padding='post', truncating='post')

### **CONSTRUCT THE SEQUENTIAL MODEL**

In [None]:
# Functional-API model: padded token ids -> embedding -> LSTM -> sigmoid probability.
inputs = Input(shape=(max_length,))
# `input_length` is deliberately not passed: it is deprecated in Keras 3 and the
# sequence length is already fixed by the Input shape above.
x = Embedding(input_dim=max_vocab_length, output_dim=embedding_dim)(inputs)
# The LSTM returns only its final state, a 64-dimensional summary of each sequence.
x = LSTM(64)(x)
# Single sigmoid unit: the output is the probability of the positive class.
outputs = Dense(1, activation="sigmoid")(x)
model = tf.keras.Model(inputs, outputs, name="text_classification_model")



In [None]:
# Binary cross-entropy matches the single sigmoid output unit; Adam is used with
# its default settings, and accuracy is tracked during training.
model.compile(loss="binary_crossentropy",
              optimizer=tf.keras.optimizers.Adam(),
              metrics=["accuracy"])

In [None]:
# Print the layer stack with output shapes and parameter counts.
model.summary()

In [None]:
# Train for 5 epochs, scoring the held-out validation split after each one.
# create_tensorboard_callback comes from helper_functions.py; per the log line it
# prints, it writes TensorBoard logs under model_logs/LSTM/<timestamp>.
# NOTE(review): in the recorded run val_loss never improves on its epoch-1 value
# while training accuracy keeps climbing — this looks like overfitting; consider
# fewer epochs or early stopping.
model_history = model.fit(padded_train_sequences,
                          train_labels,
                          epochs=5,
                          validation_data=(padded_val_sequences, val_labels),
                          callbacks=[create_tensorboard_callback("model_logs", "LSTM")])

Saving TensorBoard log files to: model_logs/LSTM/20240817-171601
Epoch 1/5
215/215 ━━━━━━━━━━━━━━━━━━━━ 9s 13ms/step - accuracy: 0.6696 - loss: 0.6036 - val_accuracy: 0.7703 - val_loss: 0.4686
Epoch 2/5
215/215 ━━━━━━━━━━━━━━━━━━━━ 3s 16ms/step - accuracy: 0.8583 - loss: 0.3418 - val_accuracy: 0.7835 - val_loss: 0.4741
Epoch 3/5
215/215 ━━━━━━━━━━━━━━━━━━━━ 4s 10ms/step - accuracy: 0.9134 - loss: 0.2295 - val_accuracy: 0.7743 - val_loss: 0.5186
Epoch 4/5
215/215 ━━━━━━━━━━━━━━━━━━━━ 3s 11ms/step - accuracy: 0.9365 - loss: 0.1741 - val_accuracy: 0.7612 - val_loss: 0.6599
Epoch 5/5
215/215 ━━━━━━━━━━━━━━━━━━━━ 3s 11ms/step - accuracy: 0.9529 - loss: 0.1415 - val_accuracy: 0.7690 - val_loss: 0.5995


In [None]:
# The sigmoid output gives one probability per validation example.
model_pred_probs = model.predict(padded_val_sequences)
# Round each probability to a 0/1 label and drop the size-1 output dimension.
model_preds = tf.squeeze(tf.round(model_pred_probs))

24/24 ━━━━━━━━━━━━━━━━━━━━ 0s 10ms/step


### **EVALUATE THE ACCURACY SCORE**

In [None]:
def calculate_results(y_true, y_pred):
    """Summarise classification quality as a dict of four metrics.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.

    Returns:
        A dict with the keys "accuracy", "precision", "recall" and "f1".
        Accuracy is multiplied by 100 (a percentage), whereas the other three
        are the ``average="weighted"`` values returned by
        ``precision_recall_fscore_support`` and are not rescaled.
    """
    accuracy_pct = accuracy_score(y_true, y_pred) * 100
    # The fourth return value (support) is not needed for the summary.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="weighted"
    )
    return {
        "accuracy": accuracy_pct,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

In [None]:
# Score the rounded validation predictions against the validation labels.
model_results = calculate_results(y_true=val_labels, y_pred=model_preds)
print(model_results)

{'accuracy': 76.9028871391076, 'precision': 0.7798518825968084, 'recall': 0.7690288713910761, 'f1': 0.7634085385385204}
