In [11]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [2]:
# Load the labelled training set from the working directory.
# NOTE(review): the preview below shows an "Unnamed: 0" column, presumably a saved index — confirm whether it should be read with index_col=0.
data = pd.read_csv('train.csv')

In [3]:
# Preview the first five rows to check which columns were loaded
data.head(5)

Unnamed: 0,Label,Content
0,2,second counting input 5 2 which receives inter...
1,4,extremely low temperature of the chips in cold...
2,3,of the basic ammonium salt of the carboxyl ate...
3,9,18 u2033 is provided which is axially supporte...
4,2,to an u201c inner surface u201d means the surf...


In [4]:
# Count the rows per class label to see how balanced the classes are
data.Label.value_counts()

7    7143
8    7102
6    6887
2    6816
1    6756
5    6467
3    6059
9    5961
4    5928
Name: Label, dtype: int64

In [5]:
# Human-readable names for the nine patent classes.
# NOTE(review): `Label` takes the values 1..9 while this array is indexed 0..8, so presumably
# label k maps to labels[k - 1] — confirm against the dataset description.
labels = np.array([
    '(Human Necessities)',
    '(Performing Operations; Transporting)',
    '(Chemistry; Metallurgy)',
    '(Textiles; Paper)',
    '(Fixed Constructions)',
    # Fixed typo: the section title is "Lighting", not "Lightning"
    '(Mechanical Engineering; Lighting; Heating; Weapons; Blasting)',
    '(Physics)',
    '(Electricity)',
    '(General tagging of new or cross-sectional technology)',
])

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as a validation set; the fixed random_state keeps the split reproducible
(train_sentences,
 val_sentences,
 train_labels,
 val_labels) = train_test_split(
    data["Content"].to_numpy(),
    data["Label"].to_numpy(),
    test_size=0.1,
    random_state=42,
)

In [7]:
# Sanity check: texts and labels must have matching lengths within each split
len(train_sentences), len(train_labels), len(val_sentences), len(val_labels)

(53207, 53207, 5912, 5912)

In [9]:
# Look at the first two training texts together with their labels
train_sentences[:2], train_labels[:2]

(array(['for a multiband operation or in another preferred embodiment the sfc curve such as 25 defines the perimeter of an aperture 33 on the patch 30 fig1 such an aperture contributes significantly to reduce the first resonant frequency of the patch with respect to the solid patch case which significantly contributes to reducing the antenna size said two configurations the sfc slot and the sfc aperture cases can of course be use also with sfc perimeter patch antennas as for instance the one 30 described in fig1 at this point it becomes clear to those skilled in the art what is the scope and spirit of the present invention and that the same sfc geometric principle can be applied in an innovative way to all the well known prior art configurations more examples are given in fig1 16 17 and 18 fig1 describes another preferred embodiment of an sfc antenna it consists on an aperture antenna said aperture being characterized by its sfc perimeter said aperture being impressed over a conducting

In [12]:
# Mean number of whitespace-separated tokens per training text, rounded to the nearest integer
round(sum(len(sentence.split()) for sentence in train_sentences) / len(train_sentences))

250

In [28]:
max_vocab_length = 40000 # max number of words to have in our vocabulary
max_length = 250 # max length our sequences will be (how many words of each text the model sees); equals the rounded average token count computed in the previous cell


In [29]:
# Fix TensorFlow's global random seed before creating the randomly initialized embedding layer
tf.random.set_seed(42)
from tensorflow.keras import layers

# Embedding layer mapping integer token ids to 128-dimensional vectors.
# NOTE(review): `input_length` is reported as deprecated in newer Keras releases — confirm against the installed version.
embedding = layers.Embedding(input_dim=max_vocab_length, # size of the vocabulary (number of distinct token ids)
                             output_dim=128, # set size of embedding vector
                             embeddings_initializer="uniform", # default, initialize randomly
                             input_length=max_length, # how long is each input
                             name="embedding_1") 

embedding

<keras.src.layers.core.embedding.Embedding at 0x28df7ed5c70>

In [30]:
# Import TextVectorization from its stable location (available since TensorFlow 2.6); the older
# "layers.experimental.preprocessing" path is deprecated and may be missing in newer releases,
# see https://github.com/tensorflow/tensorflow/releases/tag/v2.6.0 for more
from tensorflow.keras.layers import TextVectorization

# Use the default TextVectorization variables (shown explicitly to document what each option does)
text_vectorizer = TextVectorization(max_tokens=None, # how many words in the vocabulary (all of the different words in your text)
                                    standardize="lower_and_strip_punctuation", # how to process text
                                    split="whitespace", # how to split tokens
                                    ngrams=None, # create groups of n-words?
                                    output_mode="int", # how to map tokens to numbers
                                    output_sequence_length=None) # how long should the output sequence of tokens be?
                                    # pad_to_max_tokens=True) # Not valid if using max_tokens=None
                                    # pad_to_max_tokens=True) # Not valid if using max_tokens=None

In [31]:
# Rebuild the vectorizer with a capped vocabulary and a fixed output length.
# This rebinds `text_vectorizer`, replacing the default-configured instance created earlier.
text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode="int",
                                    output_sequence_length=max_length)

In [32]:
# Fit the text vectorizer to the training text only (the validation text is not passed in here)
text_vectorizer.adapt(train_sentences)

In [33]:
# Pick one training text at random and show it before embedding it.
# NOTE(review): no `random.seed(...)` call is visible in this section, so the chosen text presumably differs between runs.
import random
random_sentence = random.choice(train_sentences)
print(f"Original text:\n{random_sentence}\
      \n\nEmbedded version:")

# Embed the random sentence (turn it into numerical representation):
# the text is first vectorized into token ids, then passed through the embedding layer
sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed

Original text:
counter electrode configuration seen for example in fig5 and 6 that is the product of the design of this invention is a photonic structure when the protruding electrode elements are a metal or metal coated tcm there can also result additional photon density modifications due to plasmonics and photonics effects representative tcms illustratively include indium tin oxide and tin oxy fluoride the structures of fig5 7 are usually termed superstrate solar cells in that they have the light entering the cell through the mechanical support layer a of these figures it is appreciated that an inventive device is operative as either a superstrate cell mechanical support as shown in the accompanying figures or through the substrate cell free surface table i provides some additional superstrate and substrate lccm devices based on employing arrays of unit cell absorber protrusions these may have various spacings l and the ramifications of this in terms of the truncation of the undulati

<tf.Tensor: shape=(1, 250, 128), dtype=float32, numpy=
array([[[-0.04376262,  0.0426993 ,  0.00167274, ...,  0.0314636 ,
          0.04202992,  0.00517861],
        [ 0.00989809,  0.04891184, -0.00797943, ..., -0.01777726,
         -0.02028114,  0.01786032],
        [ 0.0125933 , -0.02573659,  0.04071958, ...,  0.04252397,
         -0.01512875,  0.03700643],
        ...,
        [-0.00510817,  0.04110542, -0.01230104, ...,  0.00884434,
          0.03181611,  0.04931827],
        [-0.00510817,  0.04110542, -0.01230104, ...,  0.00884434,
          0.03181611,  0.04931827],
        [-0.00510817,  0.04110542, -0.01230104, ...,  0.00884434,
          0.03181611,  0.04931827]]], dtype=float32)>

In [34]:
# Embedding vector of the first token of the first (and only) text in the batch
sample_embed[0][0]

<tf.Tensor: shape=(128,), dtype=float32, numpy=
array([-0.04376262,  0.0426993 ,  0.00167274,  0.03169236, -0.00236156,
        0.0177213 , -0.02621112, -0.02638395, -0.01930866,  0.00377951,
       -0.03847653, -0.0264554 , -0.04328701,  0.01010121, -0.02111952,
        0.01449213,  0.02950026,  0.013651  , -0.04134249,  0.03763732,
       -0.00706149,  0.03954499,  0.04787112, -0.01855935,  0.00706304,
        0.01130903,  0.03988916, -0.02587284,  0.041762  , -0.04101966,
        0.02751484, -0.04101863,  0.04737658, -0.03562175, -0.04113491,
       -0.04723847, -0.0438901 , -0.03894484, -0.00257622,  0.0296664 ,
       -0.04703728,  0.03932122, -0.01822954,  0.0367218 , -0.00473329,
        0.01149585, -0.01204014, -0.0026757 , -0.0490379 ,  0.01741108,
       -0.02374138, -0.0344369 , -0.00132127, -0.00303932, -0.0276466 ,
       -0.03618324, -0.01684182, -0.02594539, -0.01679405, -0.02390523,
        0.04818306, -0.0097711 , -0.02153879,  0.01583968,  0.03801257,
        0.015723

More specifically, we'll be building the following:
* **Model 0**: Naive Bayes (baseline)
* **Model 1**: Feed-forward neural network (dense model)
* **Model 2**: LSTM model
* **Model 3**: GRU model
* **Model 4**: Bidirectional-LSTM model
* **Model 5**: 1D Convolutional Neural Network
* **Model 6**: TensorFlow Hub Pretrained Feature Extractor
* **Model 7**: Same as Model 6, but trained on only 10% of the training data

Model 0 is the simplest model; it gives us a baseline that we expect each of the deeper models to beat.

Each experiment will go through the following steps:
* Construct the model
* Train the model
* Make predictions with the model
* Track prediction evaluation metrics for later comparison

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Baseline model: TF-IDF features fed into a multinomial Naive Bayes classifier
model_0 = Pipeline(steps=[
    ("tfidf", TfidfVectorizer()),  # turn raw text into TF-IDF weighted term features
    ("clf", MultinomialNB()),      # classify the TF-IDF features
])

# Train the whole pipeline on the raw training texts and their labels
model_0.fit(train_sentences, train_labels)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

In [36]:
# Score the baseline pipeline on the held-out validation split; the value is a fraction,
# so it is multiplied by 100 when printed as a percentage
baseline_score = model_0.score(val_sentences, val_labels)
print(f"Our baseline model achieves an accuracy of: {baseline_score*100:.2f}%")

Our baseline model achieves an accuracy of: 51.91%


In [37]:
# Make predictions on the validation texts and preview the first 20 predicted labels
baseline_preds = model_0.predict(val_sentences)
baseline_preds[:20]

array([3, 5, 5, 2, 2, 7, 1, 1, 8, 5, 8, 7, 2, 5, 3, 4, 8, 2, 7, 7],
      dtype=int64)

### Creating an evaluation function for our model experiments

We could evaluate these as they are but since we're going to be evaluating several models in the same way going forward, let's create a helper function which takes an array of predictions and ground truth labels and computes the following:
* Accuracy
* Precision
* Recall
* F1-score

> 🔑 **Note:** Since we're dealing with a classification problem, the above metrics are the most appropriate. If we were working with a regression problem, other metrics such as MAE (mean absolute error) would be a better choice.

In [38]:
# Function to evaluate: accuracy, precision, recall, f1-score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """
  Summarises the performance of a classification model as accuracy, precision, recall and f1 score.

  Precision, recall and f1 are computed with the "weighted" average.
  Accuracy is returned as a percentage (multiplied by 100), whereas precision,
  recall and f1 are returned as computed, without rescaling.

  Args:
  -----
  y_true = true labels in the form of a 1D array
  y_pred = predicted labels in the form of a 1D array

  Returns a dictionary with the keys "accuracy", "precision", "recall" and "f1".
  """
  # Accuracy, rescaled to a percentage
  accuracy_pct = accuracy_score(y_true, y_pred) * 100
  # The fourth returned value (support) is not needed here
  precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
  return dict(accuracy=accuracy_pct,
              precision=precision,
              recall=recall,
              f1=f1)

In [39]:
# Get baseline results on the validation split.
# Note: "accuracy" is a percentage (calculate_results multiplies it by 100), the other metrics are not rescaled.
baseline_results = calculate_results(y_true=val_labels,
                                     y_pred=baseline_preds)
baseline_results

{'accuracy': 51.911366711772665,
 'precision': 0.5435636745764952,
 'recall': 0.5191136671177267,
 'f1': 0.491364899712393}