- https://keras.io/examples/nlp/text_classification_from_scratch/
- Transformer: https://arxiv.org/abs/1706.03762
- Deep Learning with Python: https://sourestdeeds.github.io/pdf/Deep%20Learning%20with%20Python.pdf
- https://github.com/fchollet/deep-learning-with-python-notebooks
- Dataset: https://ai.stanford.edu/~amaas/data/sentiment/

You are a software developer trying to set up a text classification model using Keras and TensorFlow. You downloaded an example dataset from https://ai.stanford.edu/~amaas/data/sentiment and split it into training, test, and validation data using this code:

Let’s prepare a validation set by setting apart 20% of the training text files in a new directory, aclImdb/val:

In [None]:
import os

os.environ["KERAS_BACKEND"] = "tensorflow"

import keras
import tensorflow as tf
import numpy as np
from keras import layers

# (Takes about 40 minutes)

# Number of reviews per batch, used for every split.
batch_size = 32

# Arguments shared by the two calls that divide "review_data/train" into a
# training subset and a validation subset. Both calls must use the same seed
# and split fraction so that the two subsets do not overlap.
split_options = {
    "batch_size": batch_size,
    "validation_split": 0.2,
    "seed": 1337,
}

raw_train_ds = keras.utils.text_dataset_from_directory(
    "review_data/train", subset="training", **split_options
)
raw_val_ds = keras.utils.text_dataset_from_directory(
    "review_data/train", subset="validation", **split_options
)
# The test split has its own directory, so no subset selection is needed.
raw_test_ds = keras.utils.text_dataset_from_directory(
    "review_data/test", batch_size=batch_size
)

# Report how many batches each split contains.
print(f"Number of batches in raw_train_ds: {raw_train_ds.cardinality()}")
print(f"Number of batches in raw_val_ds: {raw_val_ds.cardinality()}")
print(f"Number of batches in raw_test_ds: {raw_test_ds.cardinality()}")

In [None]:
import os
import random
import shutil

# Directory paths
train_dir = "review_data/train"
val_dir = "review_data/val"
test_dir = "review_data/test"


def _delete_random_fraction(directory, fraction=0.5):
    """Permanently delete a random share of the regular files in `directory`.

    The number of files to remove is computed from this directory's own file
    count, so class folders of different sizes are each reduced by the same
    proportion.

    Args:
        directory: Path of the folder whose files are thinned out.
        fraction: Share of the files to delete (0.5 removes half, rounded down).

    Returns:
        The number of files that were deleted.

    Raises:
        FileNotFoundError: If `directory` does not exist.
    """
    file_names = [
        name
        for name in os.listdir(directory)
        # Sub-directories are skipped: os.remove() cannot delete them.
        if os.path.isfile(os.path.join(directory, name))
    ]
    doomed = random.sample(file_names, int(len(file_names) * fraction))
    for name in doomed:
        os.remove(os.path.join(directory, name))
    return len(doomed)


# Every class folder of every split gets thinned out.
class_dirs = [
    os.path.join(split_dir, class_name)
    for split_dir in (train_dir, val_dir, test_dir)
    for class_name in ("pos", "neg")
]

# Check all folders before deleting anything, so a missing folder cannot leave
# the dataset half-processed.
missing_dirs = [path for path in class_dirs if not os.path.isdir(path)]
if missing_dirs:
    raise FileNotFoundError(
        f"Cannot reduce dataset, missing directories: {missing_dirs}"
    )

# Delete 50% of the data in the train, val and test folders.
# NOTE: this is destructive and not idempotent; every run halves what is left.
for class_dir in class_dirs:
    _delete_random_fraction(class_dir, 0.5)

In [None]:
import keras
import tensorflow as tf
import numpy as np
from keras import layers

# reduced time: 14 min

# Number of reviews per batch, used for every split.
batch_size = 32

# Source folders: the training folder feeds both the training and the
# validation subset, the test folder is read as a whole.
train_source = "review_data/train"
test_source = "review_data/test"

# The two subset calls use the same seed and split fraction so that the
# training and validation subsets do not overlap.
raw_train_ds = keras.utils.text_dataset_from_directory(
    train_source,
    seed=1337,
    subset="training",
    validation_split=0.2,
    batch_size=batch_size,
)
raw_val_ds = keras.utils.text_dataset_from_directory(
    train_source,
    seed=1337,
    subset="validation",
    validation_split=0.2,
    batch_size=batch_size,
)
raw_test_ds = keras.utils.text_dataset_from_directory(
    test_source,
    batch_size=batch_size,
)

# Report how many batches each split contains.
print(f"Number of batches in raw_train_ds: {raw_train_ds.cardinality()}")
print(f"Number of batches in raw_val_ds: {raw_val_ds.cardinality()}")
print(f"Number of batches in raw_test_ds: {raw_test_ds.cardinality()}")

In [None]:
import string
import re


# Having looked at our data above, we see that the raw text contains HTML break
# tags of the form '<br />'. These tags will not be removed by the default
# standardizer (which doesn't strip HTML). Because of this, we will need to
# create a custom standardization function.
def custom_standardization(input_data):
    """Lowercase the text, replace '<br />' tags with a space and drop punctuation.

    Args:
        input_data: String tensor holding the raw review text.

    Returns:
        A string tensor with the three clean-up steps applied in that order.
    """
    # Regex character class matching any single character of string.punctuation.
    punctuation_class = f"[{re.escape(string.punctuation)}]"

    text = tf.strings.lower(input_data)
    # The break tags must go first: '<', '/' and '>' are punctuation, so the
    # punctuation pass would otherwise leave a stray "br" behind.
    text = tf.strings.regex_replace(text, "<br />", " ")
    text = tf.strings.regex_replace(text, punctuation_class, "")
    return text


# Model constants.
max_features = 20000  # vocabulary size cap, passed as `max_tokens` below
embedding_dim = 128
sequence_length = 500  # every review is padded/truncated to this many tokens

# Text vectorization layer: normalizes the text with the custom
# standardization defined above, splits it with the default split function
# and maps each token to an integer index (hence output_mode="int").
# The output length is fixed explicitly because the CNNs used later in the
# model won't support ragged sequences.
vectorize_layer = keras.layers.TextVectorization(
    max_tokens=max_features,
    standardize=custom_standardization,
    output_sequence_length=sequence_length,
    output_mode="int",
)

# The vocabulary is learned by calling `adapt` on a text-only dataset, so the
# labels are dropped from the training data first.
text_ds = raw_train_ds.map(lambda review_text, _label: review_text)
vectorize_layer.adapt(text_ds)

In [None]:
# Option 2: Apply the text vectorization layer to the text dataset to obtain a dataset of word indices, then feed it into a model that expects integer sequences as inputs.
# This enables asynchronous CPU processing and buffering of the data when training on a GPU.

def vectorize_text(text, label):
    """Turn a batch of raw review strings into integer token sequences.

    Args:
        text: String tensor with the raw reviews.
        label: Labels belonging to `text`; passed through unchanged.

    Returns:
        A `(token_ids, label)` tuple, where `token_ids` is the output of
        `vectorize_layer` for the given text.
    """
    # Add a trailing axis before handing the strings to the layer.
    text_column = tf.expand_dims(text, -1)
    token_ids = vectorize_layer(text_column)
    return token_ids, label


# Vectorize each split, then cache the result and prefetch up to 10 batches
# so that input preparation can overlap with training on the GPU.
train_ds = raw_train_ds.map(vectorize_text).cache().prefetch(buffer_size=10)
val_ds = raw_val_ds.map(vectorize_text).cache().prefetch(buffer_size=10)
test_ds = raw_test_ds.map(vectorize_text).cache().prefetch(buffer_size=10)

In [None]:
# Build the model (a 1D convnet for now).

# Integer input holding vocabulary indices; the sequence axis is left open.
inputs = keras.Input(shape=(None,), dtype="int64")

# Map each vocabulary index to a vector of size `embedding_dim`, then apply
# dropout to the embedded sequence.
embedded = layers.Embedding(input_dim=max_features, output_dim=embedding_dim)(inputs)
embedded = layers.Dropout(rate=0.5)(embedded)

# Two strided 1D convolutions followed by global max pooling over the sequence.
conv_features = layers.Conv1D(
    filters=128, kernel_size=7, strides=3, padding="valid", activation="relu"
)(embedded)
conv_features = layers.Conv1D(
    filters=128, kernel_size=7, strides=3, padding="valid", activation="relu"
)(conv_features)
pooled = layers.GlobalMaxPooling1D()(conv_features)

# A vanilla fully connected hidden layer with dropout.
hidden = layers.Dense(units=128, activation="relu")(pooled)
hidden = layers.Dropout(rate=0.5)(hidden)

# Single output unit squashed with a sigmoid.
predictions = layers.Dense(units=1, activation="sigmoid", name="predictions")(hidden)

model = keras.Model(inputs, predictions)

# Binary cross-entropy loss with the Adam optimizer; accuracy is tracked as a metric.
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [None]:
# Train the model.

# Number of epochs passed to model.fit below.
epochs = 3

# Fit the model on the training dataset; val_ds is passed as validation_data
# (the test dataset is not used during fitting).
model.fit(train_ds, validation_data=val_ds, epochs=epochs)

In [None]:
# Evaluate the model on the test dataset.
model.evaluate(test_ds)

In [None]:
# saving this for later : 
import os
import json
import re

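# Usage
# Set `folder_path` to the log folder and call search_pattern_in_files() to scan it.
# The function walks every file under that folder, looks for the first occurrence of `pattern`
# that has a `start_pattern` match before it and an `end_pattern` match after it, and returns
# the match together with the surrounding text. Saving the cropped logs as separate files in a
# "preprocessed_logs" folder (a preprocess_log_files() step) is not implemented in this cell.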

def _last_match_before(regex, text, endpos):
    """Return the last match of compiled `regex` within text[:endpos], or None."""
    last = None
    for last in regex.finditer(text, 0, endpos):
        pass
    return last


def search_pattern_in_files(folder_path, pattern, start_pattern, end_pattern):
    """Find, per file, the first delimited occurrence of `pattern`.

    Every file below `folder_path` is read as text. Within a file, the
    occurrences of `pattern` are visited in order. The first one that has a
    `start_pattern` match before it and an `end_pattern` match after it is
    recorded, and the rest of that file is skipped.

    Args:
        folder_path: Root folder; it is walked recursively.
        pattern: Regular expression for the section of interest.
        start_pattern: Regular expression for the delimiter that must occur
            before the match. The last occurrence before the match is used.
        end_pattern: Regular expression for the delimiter that must occur
            after the match. The first occurrence after the match is used.

    Returns:
        A list with at most one dict per file. Each dict has the keys
        "file_path", "start", "end" (offsets of the match in the file's text),
        "matched_section", "before_text" (from the beginning of the start
        delimiter up to the match) and "after_text" (from the end of the
        match up to the beginning of the end delimiter).
    """
    # Compile once instead of on every file and every match.
    target_re = re.compile(pattern)
    start_re = re.compile(start_pattern)
    end_re = re.compile(end_pattern)

    results = []
    for root, _dirs, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(root, file)
            # Read as UTF-8 and replace undecodable bytes, so one stray
            # binary file cannot abort the whole scan.
            with open(file_path, "r", encoding="utf-8", errors="replace") as f:
                content = f.read()

            for match in target_re.finditer(content):
                start, end = match.span()

                opening = _last_match_before(start_re, content, start)
                if opening is None:
                    continue
                closing = end_re.search(content, end)
                if closing is None:
                    continue

                results.append(
                    {
                        "file_path": file_path,
                        "start": start,
                        "end": end,
                        "matched_section": content[start:end],
                        "before_text": content[opening.start():start],
                        # closing.start() is already an absolute offset into
                        # `content`; it must not be added to `end`.
                        "after_text": content[end:closing.start()],
                    }
                )
                break  # Only the first delimited match per file is reported.

    return results

# Usage
folder_path = "logs"
# Section of interest: a `"failed": true` entry.
pattern = r'"failed":\s*true'
# The same marker is used as start and end delimiter, so the reported context
# runs from the marker before the match to the next marker after it.
start_pattern = r'"branch":\s*"master",\s*"index":'
end_pattern = r'"branch":\s*"master",\s*"index":'
results = search_pattern_in_files(folder_path, pattern, start_pattern, end_pattern)

# Example of accessing the saved match location and surrounding text
if results:
    match = results[0]  # Get the first match
    file_path = match["file_path"]
    start = match["start"]
    end = match["end"]
    matched_section = match["matched_section"]
    before_text = match["before_text"]
    after_text = match["after_text"]
    print(f"Match in file: {file_path}")
    print(f"Start position: {start}")
    print(f"End position: {end}")
    print(f"Matched section: {matched_section}")
    print(f"Text before match:\n{before_text}")
    print(f"Text after match: {after_text}")
    print()
else:
    # `results` covers every file under folder_path, not only the first one.
    print("No matches found.")