In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [2]:
# Load the training set. The preview below shows an unnamed leading column
# ("Unnamed: 0") next to Label and Content - presumably a saved row index; verify against the file.
data = pd.read_csv('train.csv')

In [3]:
# Preview the first five rows to check the column layout
data.head(5)

Unnamed: 0,Label,Content
0,2,second counting input 5 2 which receives inter...
1,4,extremely low temperature of the chips in cold...
2,3,of the basic ammonium salt of the carboxyl ate...
3,9,18 u2033 is provided which is axially supporte...
4,2,to an u201c inner surface u201d means the surf...


In [4]:
# Count the samples per class to see how balanced the labels are
data.Label.value_counts()

7    7143
8    7102
6    6887
2    6816
1    6756
5    6467
3    6059
9    5961
4    5928
Name: Label, dtype: int64

In [5]:
# Human-readable section names of the classes, in label order.
# The labels in the data run from 1 to 9, so the name of label `k` is `labels[k - 1]`.
labels = np.array([
    '(Human Necessities)',
    '(Performing Operations; Transporting)',
    '(Chemistry; Metallurgy)',
    '(Textiles; Paper)',
    '(Fixed Constructions)',
    '(Mechanical Engineering; Lighting; Heating; Weapons; Blasting)',  # "Lighting", not "Lightning"
    '(Physics)',
    '(Electricity)',
    '(General tagging of new or cross-sectional technology)',
])

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as a validation set; the fixed random_state keeps the split reproducible
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    data["Content"].to_numpy(),
    data["Label"].to_numpy(),
    test_size=0.1,
    random_state=42,
)

In [7]:
# Sanity check: sentences and labels must pair up one-to-one within each split
len(train_sentences), len(train_labels), len(val_sentences), len(val_labels)

(53207, 53207, 5912, 5912)

In [8]:
# Inspect the first two training samples together with their labels
train_sentences[:2], train_labels[:2]

(array(['for a multiband operation or in another preferred embodiment the sfc curve such as 25 defines the perimeter of an aperture 33 on the patch 30 fig1 such an aperture contributes significantly to reduce the first resonant frequency of the patch with respect to the solid patch case which significantly contributes to reducing the antenna size said two configurations the sfc slot and the sfc aperture cases can of course be use also with sfc perimeter patch antennas as for instance the one 30 described in fig1 at this point it becomes clear to those skilled in the art what is the scope and spirit of the present invention and that the same sfc geometric principle can be applied in an innovative way to all the well known prior art configurations more examples are given in fig1 16 17 and 18 fig1 describes another preferred embodiment of an sfc antenna it consists on an aperture antenna said aperture being characterized by its sfc perimeter said aperture being impressed over a conducting

In [9]:
# Average number of whitespace-separated tokens per training sample, rounded to the nearest integer
round(sum(len(sentence.split()) for sentence in train_sentences) / len(train_sentences))

250

In [10]:
max_vocab_length = 40000 # max number of words to have in our vocabulary
max_length = 250 # max length our sequences will be (i.e. how many tokens of each patent text the model sees); equals the rounded average token count computed above


In [11]:
tf.random.set_seed(42) # fix TensorFlow's global seed before the embedding weights are created
from tensorflow.keras import layers

embedding = layers.Embedding(input_dim=max_vocab_length, # size of the vocabulary (number of distinct token ids)
                             output_dim=128, # set size of embedding vector
                             embeddings_initializer="uniform", # default, initialize randomly
                             input_length=max_length, # how long is each input
                             name="embedding_1") 

# Display the layer object (it is reused by the models built later in the notebook)
embedding

<keras.src.layers.core.embedding.Embedding at 0x1c2d034d3d0>

In [12]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
# Note: in TensorFlow 2.6+, you no longer need "layers.experimental.preprocessing"
# you can use: "tf.keras.layers.TextVectorization", see https://github.com/tensorflow/tensorflow/releases/tag/v2.6.0 for more

# Use the default TextVectorization variables
# NOTE: this instance only illustrates the defaults; `text_vectorizer` is reassigned in the next cell
text_vectorizer = TextVectorization(max_tokens=None, # how many words in the vocabulary (all of the different words in your text)
                                    standardize="lower_and_strip_punctuation", # how to process text
                                    split="whitespace", # how to split tokens
                                    ngrams=None, # create groups of n-words?
                                    output_mode="int", # how to map tokens to numbers
                                    output_sequence_length=None) # how long should the output sequence of tokens be?
                                    # pad_to_max_tokens=True) # Not valid if using max_tokens=None

In [13]:
# Vectorizer actually used from here on: vocabulary capped at max_vocab_length tokens,
# every sample mapped to a fixed-length sequence of max_length integer token ids
text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode="int",
                                    output_sequence_length=max_length)

In [14]:
# Fit the text vectorizer to the training text
# (only the training split is passed in, so the validation text does not influence the vocabulary)
text_vectorizer.adapt(train_sentences)

In [15]:
import random
# Pick one training sample to illustrate the text -> token ids -> embedding path.
# NOTE(review): Python's `random` is not seeded in the visible cells, so the sample likely changes between runs.
random_sentence = random.choice(train_sentences)
print(f"Original text:\n{random_sentence}\
      \n\nEmbedded version:")

# Embed the random sentence (turn it into numerical representation)
# The sentence is wrapped in a list so the vectorizer receives a batch of one sample
sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed

Original text:
160 may be adapted to emit gamma rays x rays and or beta electrons i e radiation having an energy of at least 10 kev for some applications the radiation source 166 666 may comprise a radioisotope or a miniature radiation generator in some aspects of the disclosure radiation source 166 666 may comprise a miniature x ray generator such as those described in one or more of the following references u s pat nos 6 134 300 and 6 353 658 to trebes et al haga a et al u201c a miniature x ray tube u201d applied physics letters 84 12 2208 2210 2004 and gutman g et al u201c a novel needle based miniature x ray generating system u201d phys med biol 49 4677 4688 2004 such a miniature x ray generator or x ray tube may be used for radiation source 160 instead of a radioisotope to illuminate the colon contents with x ray photons turning such a generator on and off as needed typically reduces exposure of the subject to radiation in addition the energy range can be better controlled and the

<tf.Tensor: shape=(1, 250, 128), dtype=float32, numpy=
array([[[ 0.01806576,  0.02335675, -0.03543026, ...,  0.02876968,
         -0.01658437, -0.0295394 ],
        [-0.03650651,  0.00482059, -0.02782421, ...,  0.02590832,
          0.03077591,  0.03695971],
        [ 0.01195394,  0.04461259, -0.03121606, ..., -0.02294071,
         -0.00552863, -0.00550319],
        ...,
        [-0.04878985, -0.01759648,  0.00754996, ...,  0.02724595,
         -0.01447652,  0.01962836],
        [-0.04878985, -0.01759648,  0.00754996, ...,  0.02724595,
         -0.01447652,  0.01962836],
        [-0.04878985, -0.01759648,  0.00754996, ...,  0.02724595,
         -0.01447652,  0.01962836]]], dtype=float32)>

In [16]:
# Embedding vector of the first token of the sample (128 values, matching output_dim=128)
sample_embed[0][0]

<tf.Tensor: shape=(128,), dtype=float32, numpy=
array([ 0.01806576,  0.02335675, -0.03543026,  0.00855614, -0.01487461,
       -0.03346654, -0.02609272,  0.01962258, -0.00047801, -0.03228524,
        0.0408424 ,  0.04085902, -0.01227093,  0.01272079,  0.04537104,
        0.04482205, -0.02632936, -0.01151383,  0.03194325,  0.01486267,
       -0.01340358,  0.0010277 , -0.03295763,  0.04432202,  0.03986535,
       -0.01512183,  0.01447237,  0.00621806,  0.00500212,  0.04285229,
       -0.03991324,  0.02679708,  0.04437294,  0.02314297, -0.04993096,
        0.0444879 ,  0.01923313,  0.0272801 , -0.03394802,  0.03073938,
        0.03294093, -0.0441174 , -0.00696136,  0.0152172 , -0.03588456,
       -0.00949164,  0.04743078,  0.02619446, -0.01028751, -0.03720381,
       -0.01398472,  0.03400728,  0.01353718,  0.0476563 , -0.01170985,
        0.01044494,  0.00251566, -0.00689371, -0.04853014, -0.03076999,
       -0.02821724, -0.01975433,  0.0031889 , -0.01764878,  0.04714047,
       -0.031694

More specifically, we'll be building the following:
* **Model 0**: Naive Bayes (baseline)
* **Model 1**: Feed-forward neural network (dense model)
* **Model 2**: LSTM model
* **Model 3**: GRU model
* **Model 4**: Bidirectional-LSTM model
* **Model 5**: 1D Convolutional Neural Network
* **Model 6**: TensorFlow Hub Pretrained Feature Extractor
* **Model 7**: Same as model 6 with 10% of training data

Model 0 is the simplest and serves as a baseline, which we expect each of the deeper models to beat.

Each experiment will go through the following steps:
* Construct the model
* Train the model
* Make predictions with the model
* Track prediction evaluation metrics for later comparison

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Baseline pipeline: TF-IDF features fed into a multinomial Naive Bayes classifier
model_0 = Pipeline(steps=[
    ("tfidf", TfidfVectorizer()),  # convert words to numbers using tfidf
    ("clf", MultinomialNB()),  # model the text
])

# Train both steps on the training split (the fitted pipeline is the cell's output)
model_0.fit(train_sentences, train_labels)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

In [18]:
# Score the baseline on the validation split; the value is a fraction, printed below as a percentage
baseline_score = model_0.score(val_sentences, val_labels)
print(f"Our baseline model achieves an accuracy of: {baseline_score*100:.2f}%")

Our baseline model achieves an accuracy of: 51.91%


In [19]:
# Make predictions on the validation split and preview the first 20 predicted labels
baseline_preds = model_0.predict(val_sentences)
baseline_preds[:20]

array([3, 5, 5, 2, 2, 7, 1, 1, 8, 5, 8, 7, 2, 5, 3, 4, 8, 2, 7, 7],
      dtype=int64)

### Creating an evaluation function for our model experiments

We could evaluate these as they are but since we're going to be evaluating several models in the same way going forward, let's create a helper function which takes an array of predictions and ground truth labels and computes the following:
* Accuracy
* Precision
* Recall
* F1-score

> 🔑 **Note:** Since we're dealing with a classification problem, the above metrics are the most appropriate. If we were working with a regression problem, other metrics such as MAE (mean absolute error) would be a better choice.

In [20]:
# Function to evaluate: accuracy, precision, recall, f1-score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """
  Calculates accuracy, precision, recall and f1 score of a classification model.

  Works for binary and multi-class labels: precision, recall and f1 are
  combined across classes with "weighted" averaging.

  Args:
  -----
  y_true = true labels in the form of a 1D array
  y_pred = predicted labels in the form of a 1D array

  Returns a dictionary with the keys "accuracy", "precision", "recall" and "f1".
  Note the different scales: accuracy is a percentage (0-100) while precision,
  recall and f1 are fractions (0-1).
  """
  # Calculate model accuracy (scaled to a percentage)
  model_accuracy = accuracy_score(y_true, y_pred) * 100
  # Calculate model precision, recall and f1 score using "weighted" average.
  # zero_division=0 yields the same numbers as the default "warn" setting (which also scores 0)
  # but without an UndefinedMetricWarning whenever a class is never predicted.
  model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(
      y_true, y_pred, average="weighted", zero_division=0)
  model_results = {"accuracy": model_accuracy,
                  "precision": model_precision,
                  "recall": model_recall,
                  "f1": model_f1}
  return model_results

In [21]:
# Get baseline results (kept as the reference that later models are compared against)
baseline_results = calculate_results(y_true=val_labels,
                                     y_pred=baseline_preds)
baseline_results

{'accuracy': 51.911366711772665,
 'precision': 0.5435636745764952,
 'recall': 0.5191136671177267,
 'f1': 0.491364899712393}

In [22]:
import tensorflow as tf

In [23]:
import datetime

def create_tensorboard_callback(dir_name, experiment_name):
  """
  Builds a TensorBoard callback instance that writes its log files into a
  timestamped directory.

  The log directory has the form:
    "dir_name/experiment_name/current_datetime/"

  Args:
    dir_name: target directory to store TensorBoard log files
    experiment_name: name of experiment directory (e.g. efficientnet_model_1)

  Returns:
    The tf.keras.callbacks.TensorBoard callback configured with that log directory.
  """
  # The timestamp (second resolution) keeps repeated runs of one experiment in separate folders
  timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
  log_dir = "/".join([dir_name, experiment_name, timestamp])
  tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir)
  print(f"Saving TensorBoard log files to: {log_dir}")
  return tensorboard_callback

In [24]:
# Name of the root directory for TensorBoard logs (passed as dir_name to create_tensorboard_callback);
# this line only stores the name, it does not create the directory itself
SAVE_DIR = "model_logs"

Machinehack competitions download - Data Science Student Championship 2024
Data Dictionary:

Abstract (59119 rows): A summary of the patent.

Label (9 classes): The patent classification according to the European Patent Office (EPO) classification scheme.

Categories:

1 (Human Necessities),
2 (Performing Operations; Transporting),
3 (Chemistry; Metallurgy),
4 (Textiles; Paper),
5 (Fixed Constructions),
6 (Mechanical Engineering; Lighting; Heating; Weapons; Blasting),
7 (Physics),
8 (Electricity), and
9 (General tagging of new or cross-sectional technology)

In [34]:
# Build model with the Functional API
from tensorflow.keras import layers
inputs = layers.Input(shape=(1,), dtype="string") # inputs are 1-dimensional strings
x = text_vectorizer(inputs) # turn the input text into numbers
x = embedding(x) # look up an embedding vector for every token id
x = layers.GlobalAveragePooling1D()(x) # lower the dimensionality of the embedding by averaging over the sequence axis (try running the model without this layer and see what happens)
# NOTE(review): 10 output units for 9 classes - presumably because the labels run from 1 to 9 and the
# sparse categorical loss uses each label as an output index, so index 9 has to exist (index 0 stays unused).
outputs = layers.Dense(10, activation="softmax")(x) # create the output layer, want more than 2 outputs so use softmax activation
model_1 = tf.keras.Model(inputs, outputs, name="model_1_dense") # construct the model

In [35]:
# The labels are integer class ids (not one-hot vectors), hence the sparse variant of categorical crossentropy
model_1.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [36]:
# Print the layer-by-layer architecture with output shapes and parameter counts
model_1.summary()

Model: "model_1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (Text  (None, 250)               0         
 Vectorization)                                                  
                                                                 
 embedding_1 (Embedding)     (None, 250, 128)          5120000   
                                                                 
 global_average_pooling1d_2  (None, 128)               0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dense_2 (Dense)             (None, 10)                1290      
                                                                 
Total params: 5121290 (19.54 MB)
Trainable params: 51

In [37]:
# Fit the model for 5 epochs, validating on the held-out split after each epoch
# and logging the run to TensorBoard under SAVE_DIR/simple_dense_model/<timestamp>
model_1_history = model_1.fit(train_sentences, # input sentences can be a list of strings due to text preprocessing layer built-in model
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(dir_name=SAVE_DIR, 
                                                                     experiment_name="simple_dense_model")])

Saving TensorBoard log files to: model_logs/simple_dense_model/20240331-033226
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [38]:
# Check the results on the validation split: the output is [loss, accuracy], matching the compiled loss and metric
model_1.evaluate(val_sentences, val_labels)



[1.2815651893615723, 0.5652909278869629]

In [44]:
# Make predictions (these come back in the form of probabilities)
model_1_pred_probs = model_1.predict(val_sentences)
# Turn prediction probabilities into integer class labels by picking the most probable output index
model_1_preds = model_1_pred_probs.argmax(axis=1) # axis=1 is the class axis, so the result holds one label per sample
model_1_preds[:20]



array([3, 5, 5, 4, 4, 7, 1, 1, 8, 5, 8, 7, 2, 1, 3, 4, 8, 2, 7, 7],
      dtype=int64)

In [45]:
# Calculate model_1 metrics on the validation split (same helper as used for the baseline)
model_1_results = calculate_results(y_true=val_labels, 
                                    y_pred=model_1_preds)
model_1_results

{'accuracy': 56.529093369418135,
 'precision': 0.5437580537489468,
 'recall': 0.5652909336941814,
 'f1': 0.5505000571342245}

In [46]:
# Create a helper function to compare our baseline results to new model results
def compare_baseline_to_new_results(baseline_results, new_model_results):
  """
  Prints one line per metric in baseline_results showing the baseline value,
  the new model's value and their difference (new - baseline), each formatted
  to 2 decimal places.

  Args:
    baseline_results: dictionary mapping metric name to the baseline value
    new_model_results: dictionary holding a value for every key of baseline_results
  """
  for metric in baseline_results:
    baseline_value = baseline_results[metric]
    new_value = new_model_results[metric]
    difference = new_value - baseline_value
    print(f"Baseline {metric}: {baseline_value:.2f}, New {metric}: {new_value:.2f}, Difference: {difference:.2f}")

# Compare the dense model against the Naive Bayes baseline, metric by metric
compare_baseline_to_new_results(baseline_results=baseline_results, 
                                new_model_results=model_1_results)

Baseline accuracy: 51.91, New accuracy: 56.53, Difference: 4.62
Baseline precision: 0.54, New precision: 0.54, Difference: 0.00
Baseline recall: 0.52, New recall: 0.57, Difference: 0.05
Baseline f1: 0.49, New f1: 0.55, Difference: 0.06
